c - Benchmarking functions -


i've written c code - below - have benchmark of functions. main purpose of benchmark test these functions on avr @ tiny85, on pc (for i'm using atmega168 instead of @ tiny85 - same).

this benchmark executes big number of loops each function has test , "void" function receives same parameters of function tested, executes return. @ end of loops of each functions writes label , time expressed in usec. time duration of loops function specified label.

i might think if subtract benchmark time of "void" function benchmark time of function tested , divide result number of loops, i've sufficient information duration of function tested. not true because interrupts (even measure time).

in case think benchmark able indicate me fastest function. what's opinion? have suggestion about?

here output example:

void 2110168 norm 2121500 base 2337196 basl 2450964 basw 2333980 ant4 2235236 ant5 2242904 unro 2270484 unrl 2590444 vect 2754188 vesw 2732472 

the link between label , function may seen looking @ lookup table "static fntest_t fnt" in benchmark code.

the code i've reported below may compiled on pc using gcc 64bit (32bit few modifications because warnings) or on avr using arduino environment / avr-gcc.

below benchmark code. type test_t used in code "typedefined" uint16_t in umul.h file (the purpose of typedef change type of values managed/returned functions, possible few of those!)

#ifdef __avr__ #include <arduino.h> #include <hardwareserial.h> #include <string.h> #else #include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <sys/time.h> #include "timefn.h" #endif  #include "umul.h"  #ifndef unused #define unused(x) (void)(x) #endif  typedef test_t fn_t(test_t a,test_t b);  typedef struct fntest_s {     fn_t * fn;     char * msg; } fntest_t;  test_t nullfn(test_t a,test_t b); #ifndef __avr__ uint32_t micros(); #endif  static fntest_t fnt[]={     {nullfn,(char *)"void"},     {umul16_normal,(char *)"normal"},     {umul16_base,(char *)"base"},     {umul16_baseandlogic,(char *)"basl"},     {umul16_baseswap,(char *)"basw"},     {umul16_antonio4,(char *)"ant4"},     {umul16_antonio5,(char *)"ant5"},     {umul16_unrolled,(char *)"unro"},     {umul16_unrolledandlogic,(char *)"unrl"},     {umul16_vect,(char *)"vect"},     {umul16_vectswap,(char *)"vesw"} };  #ifndef __avr__ uint32_t micros() {     struct timeval t;     gettimeofday(&t,null);      return (t.tv_sec*1000000ul+t.tv_usec); } #endif  test_t nullfn(test_t a,test_t b) {     unused(a);unused(b);     return 0; }  test_t umultry() { #ifdef __avr__ #define runs 20000 static char strbuf[50]; #else #define runs 10000000 #endif      unsigned int i,j,k;     uint32_t x;      test_t ix,iy;     static test_t z[16];      for(j=0;j<5;j++) {         for(k=0;k<sizeof(fnt)/sizeof(fntest_t);k++) {             x=micros();srand(x);             for(i=0;i<runs;i++) {                 ix=rand();iy=rand();                 z[i&0xf]+=fnt[k].fn(ix,iy);             }             x=micros()-x; #ifdef __avr__             sprintf(strbuf,"%s %lu\n",fnt[k].msg, x);             serial.print(strbuf); #else             printf("%s %u\n",fnt[k].msg, x); #endif         }         for(i=0;i<16;i++) {             z[0]+=z[i]; /* avoid warn unused , optimizations don't evaluate z[]*/         }  #ifdef __avr__         serial.println("----------------"); #else         puts("----------------"); #endif     }      return z[0]; }  #ifdef __avr__ void setup() {     serial.begin(115200);     serial.println(f("starting...")); }  void loop() {     umultry();     for(;;); } #else int main(void) {     puts("starting...");     return umultry(); } #endif 

here functions tested:

#include "umul.h"  test_t umul16_normal(test_t a, test_t b) {     return a*b; }  test_t umul16_unrolled(test_t a, test_t b) { test_t result=0;  #define umul16_step(a, b, shift) \     if ((b) & (1u << (shift))) result += (a<<shift);      umul16_step(a, b, 0);     umul16_step(a, b, 1);     umul16_step(a, b, 2);     umul16_step(a, b, 3);     umul16_step(a, b, 4);     umul16_step(a, b, 5);     umul16_step(a, b, 6);     umul16_step(a, b, 7);     umul16_step(a, b, 8);     umul16_step(a, b, 9);     umul16_step(a, b, 10);     umul16_step(a, b, 11);     umul16_step(a, b, 12);     umul16_step(a, b, 13);     umul16_step(a, b, 14);     umul16_step(a, b, 15);      return result; #undef umul16_step }  test_t umul16_unrolledandlogic(test_t a, test_t b) { test_t result=0; #define umul16_step(a, b, shift) \ /*    if ((b) & (1u << (shift))) result += (a<<shift);*/\     result+=  ((0 - !(!((b&(1u<<(shift)))))) & (a<<(shift)));      umul16_step(a, b, 0);     umul16_step(a, b, 1);     umul16_step(a, b, 2);     umul16_step(a, b, 3);     umul16_step(a, b, 4);     umul16_step(a, b, 5);     umul16_step(a, b, 6);     umul16_step(a, b, 7);     umul16_step(a, b, 8);     umul16_step(a, b, 9);     umul16_step(a, b, 10);     umul16_step(a, b, 11);     umul16_step(a, b, 12);     umul16_step(a, b, 13);     umul16_step(a, b, 14);     umul16_step(a, b, 15);      return result; #undef umul16_step }  test_t umul16_antonio5(test_t a, test_t b) {     test_t res = 0;      uint8_t b0 = b & 0xff; //this should optimized away     uint8_t b1 = b >>8; //this should optimized away      //swapping doesn't make sense anymore     if ( (b1 & 1) )         res+=(test_t)((uint8_t)(a && 0xff))*256;     //hopefully compiler understands has add low 8bit register of high 8bit register of res      if ( (b0 & 1) )         res+=a;      b1>>=1;     b0>>=1;     while (b0) {///n cycles, maximum 7         a+=a;         if ( (b1 & 1) )             res+=(test_t)((uint8_t)(a & 0xff))*256;         if ( (b0 & 1) )             res+=a;         b1>>=1;         b0>>=1; //i try put last 1 leave carry flag in desired state     }      uint8_t a0 = & 0xff; //again, not real copy register selection      while (b1) {///p cycles, maximum 7 - n cycles         a0+=a0;         if ( (b1 & 1) )             res+=(test_t) a0 * 256;         b1>>=1;     }     return res; }  test_t umul16_base(test_t a, test_t b) {     test_t res=0;      while (b) {         if ( (b & 1) )             res+=a;         b>>=1;         a+=a;     }      return res; }  test_t umul16_baseandlogic(test_t a, test_t b) {     test_t res=0;      while (b) {         //if ( (b & 1) )         //    res+=a;         res+=  ((0 - !(!(b&1))) & a);         b>>=1;         a+=a;     }      return res; }  test_t umul16_baseswap(test_t a, test_t b) {     test_t res;      if (a<b) {         res=a;         a=b;         b=res;     }      res=0;     while (b) {         if ( (b & 1) )             res+=a;         b>>=1;         a+=a;     }      return res; }  test_t umul16_antonio4(test_t a, test_t b) {     uint8_t res1 = 0;      uint8_t a0 = & 0xff; //this needs copy data     uint8_t b0 = b & 0xff; //this should optimized away     uint8_t b1 = b >>8; //this should optimized away      //here a0 , b1 swapped (to have b1 < a0)     if ( (b1 & 1) )         res1+=a0;     b1>>=1;     while (b1) {///maximum 7 cycles         a0+=a0;         if ( (b1 & 1) )             res1+=a0;         b1>>=1;     }      test_t res = (test_t) res1 * 256; //should optimized away, it's not copy!      //here swapping wouldn't make sense     if ( (b0 & 1) )         res+=a;     b0>>=1;     while (b0) {///maximum 7 cycles         a+=a;         if ( (b0 & 1) )             res+=a;         b0>>=1;     }      return res; }  test_t umul16_vect(test_t a, test_t b) {     test_t c[2];      c[0]=0;c[1]=a;a=0;     while (b) {         a+=c[(b & 1)];         b>>=1;         c[1]+=c[1];     }      return a; }  test_t umul16_vectswap(test_t a, test_t b) {     test_t c[2];      if (a<b) {         c[1]=b;         b=a;         a=c[1];     }      c[0]=0;c[1]=a;a=0;     while (b) {         a+=c[(b & 1)];         b>>=1;         c[1]+=c[1];     }      return a; }  test_t udiv_(test_t n,test_t d, test_t *r) {     test_t q = 0,i,r_;      r_=0;     if (d == 0) return (test_t)-1u; //error      i= ( (test_t)(1) << ((sizeof(n)<<3)-1) );     (;i!=0;i>>=1) {         r_ <<= 1;          if (n&i)             r_ |= 1;          if (r_ >= d) {             r_ -= d;             q |= i;         }     }     if (r!=null)         *r=r_;      return q; } 

here include file umul.h functions tested:

#ifndef __umul_h #define __umul_h  #ifdef __avr_attiny85__ typedef signed int int8_t __attribute__((__mode__(__qi__))); typedef unsigned int uint8_t __attribute__((__mode__(__qi__))); typedef signed int int16_t __attribute__ ((__mode__ (__hi__))); typedef unsigned int uint16_t __attribute__ ((__mode__ (__hi__))); typedef signed int int32_t __attribute__ ((__mode__ (__si__))); typedef unsigned int uint32_t __attribute__ ((__mode__ (__si__))); typedef signed int int64_t __attribute__((__mode__(__di__))); typedef unsigned int uint64_t __attribute__((__mode__(__di__)));  #define null 0 #else #include <stdlib.h> #include <stdint.h> #endif  typedef uint16_t test_t;  #ifdef __cplusplus extern "c" { #endif  test_t umul16_normal(test_t a, test_t b); test_t umul16_unrolled(test_t a, test_t b); test_t umul16_unrolledandlogic(test_t a, test_t b); test_t umul16_antonio5(test_t a, test_t b); test_t umul16_base(test_t a, test_t b); test_t umul16_baseswap(test_t a, test_t b); test_t umul16_antonio4(test_t a, test_t b); test_t umul16_vect(test_t a, test_t b); test_t umul16_vectswap(test_t a, test_t b); test_t umul16_baseandlogic(test_t a, test_t b);     test_t udiv_(test_t n,test_t d, test_t *r); } // extern "c" #endif  #endif 

normally remove influence of interruptions repeat test few times , keep fastest response measurement.

the repetition needed complex cpus x86 remove dependency on current cache content , branch predictor statistics.

on modern cpus important sure clock fixed (most modern cpus automatically modulate clock reduce heating/consumption when cpu idling , may require time clock control logic full speed).


Comments

Popular posts from this blog

jquery - How do you format the date used in the popover widget title of FullCalendar? -

asp.net mvc - SSO between MVCForum and Umbraco7 -

Python Tkinter keyboard using bind -