c - Benchmarking functions -
i've written c code - below - have benchmark of functions. main purpose of benchmark test these functions on avr @ tiny85, on pc (for i'm using atmega168 instead of @ tiny85 - same).
this benchmark executes big number of loops each function has test , "void" function receives same parameters of function tested, executes return. @ end of loops of each functions writes label , time expressed in usec. time duration of loops function specified label.
i might think if subtract benchmark time of "void" function benchmark time of function tested , divide result number of loops, i've sufficient information duration of function tested. not true because interrupts (even measure time).
in case think benchmark able indicate me fastest function. what's opinion? have suggestion about?
here output example:
void 2110168 norm 2121500 base 2337196 basl 2450964 basw 2333980 ant4 2235236 ant5 2242904 unro 2270484 unrl 2590444 vect 2754188 vesw 2732472
the link between label , function may seen looking @ lookup table "static fntest_t fnt" in benchmark code.
the code i've reported below may compiled on pc using gcc 64bit (32bit few modifications because warnings) or on avr using arduino environment / avr-gcc.
below benchmark code. type test_t used in code "typedefined" uint16_t in umul.h file (the purpose of typedef change type of values managed/returned functions, possible few of those!)
#ifdef __avr__ #include <arduino.h> #include <hardwareserial.h> #include <string.h> #else #include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <sys/time.h> #include "timefn.h" #endif #include "umul.h" #ifndef unused #define unused(x) (void)(x) #endif typedef test_t fn_t(test_t a,test_t b); typedef struct fntest_s { fn_t * fn; char * msg; } fntest_t; test_t nullfn(test_t a,test_t b); #ifndef __avr__ uint32_t micros(); #endif static fntest_t fnt[]={ {nullfn,(char *)"void"}, {umul16_normal,(char *)"normal"}, {umul16_base,(char *)"base"}, {umul16_baseandlogic,(char *)"basl"}, {umul16_baseswap,(char *)"basw"}, {umul16_antonio4,(char *)"ant4"}, {umul16_antonio5,(char *)"ant5"}, {umul16_unrolled,(char *)"unro"}, {umul16_unrolledandlogic,(char *)"unrl"}, {umul16_vect,(char *)"vect"}, {umul16_vectswap,(char *)"vesw"} }; #ifndef __avr__ uint32_t micros() { struct timeval t; gettimeofday(&t,null); return (t.tv_sec*1000000ul+t.tv_usec); } #endif test_t nullfn(test_t a,test_t b) { unused(a);unused(b); return 0; } test_t umultry() { #ifdef __avr__ #define runs 20000 static char strbuf[50]; #else #define runs 10000000 #endif unsigned int i,j,k; uint32_t x; test_t ix,iy; static test_t z[16]; for(j=0;j<5;j++) { for(k=0;k<sizeof(fnt)/sizeof(fntest_t);k++) { x=micros();srand(x); for(i=0;i<runs;i++) { ix=rand();iy=rand(); z[i&0xf]+=fnt[k].fn(ix,iy); } x=micros()-x; #ifdef __avr__ sprintf(strbuf,"%s %lu\n",fnt[k].msg, x); serial.print(strbuf); #else printf("%s %u\n",fnt[k].msg, x); #endif } for(i=0;i<16;i++) { z[0]+=z[i]; /* avoid warn unused , optimizations don't evaluate z[]*/ } #ifdef __avr__ serial.println("----------------"); #else puts("----------------"); #endif } return z[0]; } #ifdef __avr__ void setup() { serial.begin(115200); serial.println(f("starting...")); } void loop() { umultry(); for(;;); } #else int main(void) { puts("starting..."); return umultry(); } #endif
here functions tested:
#include "umul.h" test_t umul16_normal(test_t a, test_t b) { return a*b; } test_t umul16_unrolled(test_t a, test_t b) { test_t result=0; #define umul16_step(a, b, shift) \ if ((b) & (1u << (shift))) result += (a<<shift); umul16_step(a, b, 0); umul16_step(a, b, 1); umul16_step(a, b, 2); umul16_step(a, b, 3); umul16_step(a, b, 4); umul16_step(a, b, 5); umul16_step(a, b, 6); umul16_step(a, b, 7); umul16_step(a, b, 8); umul16_step(a, b, 9); umul16_step(a, b, 10); umul16_step(a, b, 11); umul16_step(a, b, 12); umul16_step(a, b, 13); umul16_step(a, b, 14); umul16_step(a, b, 15); return result; #undef umul16_step } test_t umul16_unrolledandlogic(test_t a, test_t b) { test_t result=0; #define umul16_step(a, b, shift) \ /* if ((b) & (1u << (shift))) result += (a<<shift);*/\ result+= ((0 - !(!((b&(1u<<(shift)))))) & (a<<(shift))); umul16_step(a, b, 0); umul16_step(a, b, 1); umul16_step(a, b, 2); umul16_step(a, b, 3); umul16_step(a, b, 4); umul16_step(a, b, 5); umul16_step(a, b, 6); umul16_step(a, b, 7); umul16_step(a, b, 8); umul16_step(a, b, 9); umul16_step(a, b, 10); umul16_step(a, b, 11); umul16_step(a, b, 12); umul16_step(a, b, 13); umul16_step(a, b, 14); umul16_step(a, b, 15); return result; #undef umul16_step } test_t umul16_antonio5(test_t a, test_t b) { test_t res = 0; uint8_t b0 = b & 0xff; //this should optimized away uint8_t b1 = b >>8; //this should optimized away //swapping doesn't make sense anymore if ( (b1 & 1) ) res+=(test_t)((uint8_t)(a && 0xff))*256; //hopefully compiler understands has add low 8bit register of high 8bit register of res if ( (b0 & 1) ) res+=a; b1>>=1; b0>>=1; while (b0) {///n cycles, maximum 7 a+=a; if ( (b1 & 1) ) res+=(test_t)((uint8_t)(a & 0xff))*256; if ( (b0 & 1) ) res+=a; b1>>=1; b0>>=1; //i try put last 1 leave carry flag in desired state } uint8_t a0 = & 0xff; //again, not real copy register selection while (b1) {///p cycles, maximum 7 - n cycles a0+=a0; if ( (b1 & 1) ) res+=(test_t) a0 * 256; b1>>=1; } return res; } test_t umul16_base(test_t a, test_t b) { test_t res=0; while (b) { if ( (b & 1) ) res+=a; b>>=1; a+=a; } return res; } test_t umul16_baseandlogic(test_t a, test_t b) { test_t res=0; while (b) { //if ( (b & 1) ) // res+=a; res+= ((0 - !(!(b&1))) & a); b>>=1; a+=a; } return res; } test_t umul16_baseswap(test_t a, test_t b) { test_t res; if (a<b) { res=a; a=b; b=res; } res=0; while (b) { if ( (b & 1) ) res+=a; b>>=1; a+=a; } return res; } test_t umul16_antonio4(test_t a, test_t b) { uint8_t res1 = 0; uint8_t a0 = & 0xff; //this needs copy data uint8_t b0 = b & 0xff; //this should optimized away uint8_t b1 = b >>8; //this should optimized away //here a0 , b1 swapped (to have b1 < a0) if ( (b1 & 1) ) res1+=a0; b1>>=1; while (b1) {///maximum 7 cycles a0+=a0; if ( (b1 & 1) ) res1+=a0; b1>>=1; } test_t res = (test_t) res1 * 256; //should optimized away, it's not copy! //here swapping wouldn't make sense if ( (b0 & 1) ) res+=a; b0>>=1; while (b0) {///maximum 7 cycles a+=a; if ( (b0 & 1) ) res+=a; b0>>=1; } return res; } test_t umul16_vect(test_t a, test_t b) { test_t c[2]; c[0]=0;c[1]=a;a=0; while (b) { a+=c[(b & 1)]; b>>=1; c[1]+=c[1]; } return a; } test_t umul16_vectswap(test_t a, test_t b) { test_t c[2]; if (a<b) { c[1]=b; b=a; a=c[1]; } c[0]=0;c[1]=a;a=0; while (b) { a+=c[(b & 1)]; b>>=1; c[1]+=c[1]; } return a; } test_t udiv_(test_t n,test_t d, test_t *r) { test_t q = 0,i,r_; r_=0; if (d == 0) return (test_t)-1u; //error i= ( (test_t)(1) << ((sizeof(n)<<3)-1) ); (;i!=0;i>>=1) { r_ <<= 1; if (n&i) r_ |= 1; if (r_ >= d) { r_ -= d; q |= i; } } if (r!=null) *r=r_; return q; }
here include file umul.h functions tested:
#ifndef __umul_h #define __umul_h #ifdef __avr_attiny85__ typedef signed int int8_t __attribute__((__mode__(__qi__))); typedef unsigned int uint8_t __attribute__((__mode__(__qi__))); typedef signed int int16_t __attribute__ ((__mode__ (__hi__))); typedef unsigned int uint16_t __attribute__ ((__mode__ (__hi__))); typedef signed int int32_t __attribute__ ((__mode__ (__si__))); typedef unsigned int uint32_t __attribute__ ((__mode__ (__si__))); typedef signed int int64_t __attribute__((__mode__(__di__))); typedef unsigned int uint64_t __attribute__((__mode__(__di__))); #define null 0 #else #include <stdlib.h> #include <stdint.h> #endif typedef uint16_t test_t; #ifdef __cplusplus extern "c" { #endif test_t umul16_normal(test_t a, test_t b); test_t umul16_unrolled(test_t a, test_t b); test_t umul16_unrolledandlogic(test_t a, test_t b); test_t umul16_antonio5(test_t a, test_t b); test_t umul16_base(test_t a, test_t b); test_t umul16_baseswap(test_t a, test_t b); test_t umul16_antonio4(test_t a, test_t b); test_t umul16_vect(test_t a, test_t b); test_t umul16_vectswap(test_t a, test_t b); test_t umul16_baseandlogic(test_t a, test_t b); test_t udiv_(test_t n,test_t d, test_t *r); } // extern "c" #endif #endif
normally remove influence of interruptions repeat test few times , keep fastest response measurement.
the repetition needed complex cpus x86 remove dependency on current cache content , branch predictor statistics.
on modern cpus important sure clock fixed (most modern cpus automatically modulate clock reduce heating/consumption when cpu idling , may require time clock control logic full speed).
Comments
Post a Comment