codecs.org Part of GStreamer Family
home
SpeciaLib
libcodec
bitstream
Hyperopt
nopcodes
 
SourceForge Logo
 
 
 
 
 
 

What's the problem?

The presence or absence of a single fprintf at the end of a routine used to time various implementations of the motion-compensation routines causes significant variance in the resulting timings. This makes no sense primarily because the code in question is well outside even the outer timing loops.

Show me the code!

The timing function calls the code under test (via function pointer) in a set of loops. The first inner loop calls the routine once, while the second calls it twice. The cycle times for each loop are calculated, and compared to the two best times gathered in the outer loop. Finally, the difference between the two best loop times is taken, in an attepmt to isolate the cycles used by the function itself from the cycles used in loop overhead. The function attempts to simulate conditions typically found in an MPEG parser, hence the 720-byte wide buffers.

typedef void (*__sl_mcomp_U8_U8)(uint8_t *dst,int dst_stride,uint8_t *src,int src_stride);

SL_TIME uint32_t __sl_mcomp_time_U8_U8(_sl_funcptr_def *fpdef, void *func) {
  uint8_t dst[720*32];
  uint8_t src[720*32];
  int i,j;
  __sl_mcomp_U8_U8 testfunc = (__sl_mcomp_U8_U8)func;
  uint64_t start,stop;
  uint32_t cycles1,cycles2;
  uint32_t besttime1 = 0xffffffff, besttime2 = 0xffffffff;

  for (i=0;i<8;i++) {
    rdtscll(start);
    for (j=0;j<8;j++)
      (testfunc)(dst+8+(720*8),720,src+8+(720*8),720);
    rdtscll(stop);
    cycles1 = (uint32_t)(stop-start);

    rdtscll(start);
    for (j=0;j<8;j++) {
      (testfunc)(dst+8+(720*8),720,src+8+(720*8),720);
      (testfunc)(dst+8+(720*8),720,src+8+(720*8),720);
    }
    rdtscll(stop);
    cycles2 = (uint32_t)(stop-start);

    if (cycles1 < besttime1) besttime1 = cycles1;
    if (cycles2 < besttime2) besttime2 = cycles2;
  }

  fprintf(stderr,"besttime1 is %d, besttime2 is %d\n",besttime1,besttime2);
  return (besttime2 - besttime1);
}

The functions being called in this particular example are normally used in an MPEG decoder to place a block from a reference frame into the current picture. In this case the block is located at a half-pixel offset in the X direction, so averaging of two consecutive source pixels is necessary.

SL_INT_FUNC void __sl_mcomp_put_X_U8_U8_8x8__c(
    uint8_t *dst,int dst_stride,uint8_t *src,int src_stride) {
  int x,y;
  int dst_jump = dst_stride - 8;
  int src_jump = src_stride - 8;

  for (y=0;y&t;8;y++) {
    for (x=0;x<8;x++)
      *dst++ = sl_clip_u8((*src++ + *(src+1) + 1)/2);
    dst += dst_jump;
    src += src_jump;
  }
}
SL_INT_FUNC void __sl_mcomp_put_X_U8_U8_8x8__x86_mmx(
    uint8_t *dst,int dst_stride,uint8_t *src,int src_stride) {
  int y;

  for (y=0;y<8;y+=2) {
    movq_m2r(*src,mm1);
    movq_m2r(*(src+1),mm2);
    movq_r2r(mm1,mm3);

    pxor_r2r(mm2,mm1);          // (x^y)
    pand_m2r(_sl_mcomp_qword_byte01,mm1);       // trim zero bit before reg-wide shift
    psrlq_i2r(1,mm1);           // (x^y)>>1
    pand_r2r(mm2,mm3);          // calculate (x&y) into mm3
    paddb_r2r(mm3,mm1);         // add (x&y) and (x^y)>>1 into mm3

    movq_r2m(mm1,*dst);

    movq_m2r(*(src+src_stride),mm4);
    movq_m2r(*(src+1+src_stride),mm5);
    movq_r2r(mm4,mm6);

    pxor_r2r(mm5,mm4);          // (x^y)
    pand_m2r(_sl_mcomp_qword_byte01,mm4);       // trim zero bit before reg-wide shift
    psrlq_i2r(1,mm4);           // (x^y)>>1
    pand_r2r(mm5,mm6);          // calculate (x&y) into mm3
    paddb_r2r(mm6,mm4);         // add (x&y) and (x^y)>>1 into mm3

    movq_r2m(mm4,*(dst+dst_stride));

    dst += dst_stride*2;
    src += src_stride*2;
  }
}
SL_INT_FUNC void __sl_mcomp_put_X_U8_U8_8x8__x86_sse(
    uint8_t *dst,int dst_stride,uint8_t *src,int src_stride) {
  int y;

  movq_m2r(*src,mm1);
  pavgb_m2r(*(src+1),mm1);
  movq_r2m(mm1,*dst);

  movq_m2r(*(src+src_stride),mm2);   
  pavgb_m2r(*(src+1+src_stride),mm2);   
  movq_r2m(mm2,*(dst+dst_stride));   

  movq_m2r(*(src+src_stride*2),mm2); 
  pavgb_m2r(*(src+1+src_stride*2),mm2); 
  movq_r2m(mm2,*(dst+dst_stride*2)); 

  movq_m2r(*(src+src_stride*4),mm2);
  pavgb_m2r(*(src+1+src_stride*4),mm2);
  movq_r2m(mm2,*(dst+dst_stride*4));

  src += src_stride*7; 
  dst += dst_stride*7; 
  src_stride = -src_stride;
  dst_stride = -dst_stride;

  movq_m2r(*src,mm1);
  pavgb_m2r(*(src+1),mm1);
  movq_r2m(mm1,*dst);
        
  movq_m2r(*(src+src_stride),mm2);  
  pavgb_m2r(*(src+1+src_stride),mm2);  
  movq_r2m(mm2,*(dst+dst_stride));     

  movq_m2r(*(src+src_stride*2),mm2);
  pavgb_m2r(*(src+1+src_stride*2),mm2);
  movq_r2m(mm2,*(dst+dst_stride*2));   

  movq_m2r(*(src+src_stride*4),mm2);
  pavgb_m2r(*(src+1+src_stride*4),mm2);
  movq_r2m(mm2,*(dst+dst_stride*4));
}

And the assembly?


Version without fprintf                                                             Version with fprintf

00008b60 <__sl_mcomp_time_U8_U8>:                                                    00008b60 <__sl_mcomp_time_U8_U8>:
SL_TIME uint32_t __sl_mcomp_time_U8_U8(_sl_funcptr_def *fpdef, void *func) { SL_TIME uint32_t __sl_mcomp_time_U8_U8(_sl_funcptr_def *fpdef, void *func) {
8b60: 55 push %ebp 8b60: 55 push %ebp
8b61: 89 e5 mov %esp,%ebp 8b61: 89 e5 mov %esp,%ebp
8b63: 57 push %edi 8b63: 57 push %edi
8b64: 56 push %esi 8b64: 56 push %esi
8b65: 81 ec 30 b4 00 00 sub $0xb430,%esp 8b65: 53 push %ebx
8b66: 81 ec 3c b4 00 00 sub $0xb43c,%esp
8b6c: e8 00 00 00 00 call 8b71 <__sl_mcomp_time_U8_U8+0x11>
8b71: 5b pop %ebx
8b72: 81 c3 6b 26 00 00 add $0x266b,%ebx
8b6b: 8b 45 0c mov 0xc(%ebp),%eax 8b78: 8b 45 0c mov 0xc(%ebp),%eax
uint8_t dst[720*32]; uint8_t dst[720*32];
uint8_t src[720*32]; uint8_t src[720*32];
int i,j; int i,j;
__sl_mcomp_U8_U8 testfunc = (__sl_mcomp_U8_U8)func; __sl_mcomp_U8_U8 testfunc = (__sl_mcomp_U8_U8)func;
uint64_t start,stop; uint64_t start,stop;
uint32_t cycles1,cycles2; uint32_t cycles1,cycles2;
uint32_t besttime1 = 0xffffffff, besttime2 = 0xffffffff; uint32_t besttime1 = 0xffffffff, besttime2 = 0xffffffff;
8b6e: ba ff ff ff ff mov $0xffffffff,%edx 8b7b: ba ff ff ff ff mov $0xffffffff,%edx
8b73: 89 85 f0 4b ff ff mov %eax,0xffff4bf0(%ebp) 8b80: 89 85 e0 4b ff ff mov %eax,0xffff4be0(%ebp)
8b79: 89 95 dc 4b ff ff mov %edx,0xffff4bdc(%ebp) 8b86: 89 95 cc 4b ff ff mov %edx,0xffff4bcc(%ebp)
for (i=0;i<8;i++) { for (i=0;i<8;i++) {
8b7f: 8d 85 80 62 ff ff lea 0xffff6280(%ebp),%eax 8b8c: 8d 85 70 62 ff ff lea 0xffff6270(%ebp),%eax
8b85: 8d 95 80 bc ff ff lea 0xffffbc80(%ebp),%edx 8b92: 8d 95 70 bc ff ff lea 0xffffbc70(%ebp),%edx
8b8b: c7 85 e0 4b ff ff ff movl $0xffffffff,0xffff4be0(%ebp) 8b98: c7 85 d0 4b ff ff ff movl $0xffffffff,0xffff4bd0(%ebp)
8b92: ff ff ff 8b9f: ff ff ff
8b95: 89 85 d8 4b ff ff mov %eax,0xffff4bd8(%ebp) 8ba2: 89 85 c8 4b ff ff mov %eax,0xffff4bc8(%ebp)
8b9b: 89 95 d4 4b ff ff mov %edx,0xffff4bd4(%ebp) 8ba8: 89 95 c4 4b ff ff mov %edx,0xffff4bc4(%ebp)
8ba1: c7 85 f4 4b ff ff 07 movl $0x7,0xffff4bf4(%ebp) 8bae: c7 85 e4 4b ff ff 07 movl $0x7,0xffff4be4(%ebp)
8ba8: 00 00 00 8bb5: 00 00 00
8bab: 90 nop
rdtscll(start); rdtscll(start);
8bac: 0f 31 rdtsc 8bb8: 0f 31 rdtsc
8bae: 89 85 e8 4b ff ff mov %eax,0xffff4be8(%ebp) 8bba: 89 85 d8 4b ff ff mov %eax,0xffff4bd8(%ebp)
8bb4: 89 95 ec 4b ff ff mov %edx,0xffff4bec(%ebp) 8bc0: 89 95 dc 4b ff ff mov %edx,0xffff4bdc(%ebp)
for (j=0;j<8;j++) for (j=0;j<8;j++)
8bba: be 07 00 00 00 mov $0x7,%esi 8bc6: be 07 00 00 00 mov $0x7,%esi
8bbf: 90 nop 8bcb: 90 nop
(testfunc)(dst+8+(720*8),720,src+8+(720*8),720); (testfunc)(dst+8+(720*8),720,src+8+(720*8),720);
8bc0: 68 d0 02 00 00 push $0x2d0 8bcc: 68 d0 02 00 00 push $0x2d0
8bc5: ff b5 d8 4b ff ff pushl 0xffff4bd8(%ebp) 8bd1: ff b5 c8 4b ff ff pushl 0xffff4bc8(%ebp)
8bcb: 68 d0 02 00 00 push $0x2d0 8bd7: 68 d0 02 00 00 push $0x2d0
8bd0: ff b5 d4 4b ff ff pushl 0xffff4bd4(%ebp) 8bdc: ff b5 c4 4b ff ff pushl 0xffff4bc4(%ebp)
8bd6: ff 95 f0 4b ff ff call *0xffff4bf0(%ebp) 8be2: ff 95 e0 4b ff ff call *0xffff4be0(%ebp)
8bdc: 83 c4 10 add $0x10,%esp 8be8: 83 c4 10 add $0x10,%esp
8bdf: 4e dec %esi 8beb: 4e dec %esi
8be0: 79 de jns 8bc0 <__sl_mcomp_time_U8_U8+0x60> 8bec: 79 de jns 8bcc <__sl_mcomp_time_U8_U8+0x6c>
rdtscll(stop); rdtscll(stop);
8be2: 0f 31 rdtsc 8bee: 0f 31 rdtsc
cycles1 = (uint32_t)(stop-start); cycles1 = (uint32_t)(stop-start);
8be4: 2b 85 e8 4b ff ff sub 0xffff4be8(%ebp),%eax 8bf0: 2b 85 d8 4b ff ff sub 0xffff4bd8(%ebp),%eax
8bea: 89 85 e4 4b ff ff mov %eax,0xffff4be4(%ebp) 8bf6: 89 85 d4 4b ff ff mov %eax,0xffff4bd4(%ebp)
rdtscll(start); rdtscll(start);
8bf0: 0f 31 rdtsc 8bfc: 0f 31 rdtsc
8bf2: 89 95 ec 4b ff ff mov %edx,0xffff4bec(%ebp) 8bfe: 89 95 dc 4b ff ff mov %edx,0xffff4bdc(%ebp)
for (j=0;j<8;j++) { for (j=0;j<8;j++) {
8bf8: 8d 95 80 62 ff ff lea 0xffff6280(%ebp),%edx 8c04: 8d 95 70 62 ff ff lea 0xffff6270(%ebp),%edx
8bfe: 89 85 e8 4b ff ff mov %eax,0xffff4be8(%ebp) 8c0a: 89 85 d8 4b ff ff mov %eax,0xffff4bd8(%ebp)
8c04: 89 95 d0 4b ff ff mov %edx,0xffff4bd0(%ebp) 8c10: 89 95 c0 4b ff ff mov %edx,0xffff4bc0(%ebp)
8c0a: 8d bd 80 bc ff ff lea 0xffffbc80(%ebp),%edi 8c16: 8d bd 70 bc ff ff lea 0xffffbc70(%ebp),%edi
8c10: be 07 00 00 00 mov $0x7,%esi 8c1c: be 07 00 00 00 mov $0x7,%esi
8c15: 8d 76 00 lea 0x0(%esi),%esi 8c21: 8d 76 00 lea 0x0(%esi),%esi
(testfunc)(dst+8+(720*8),720,src+8+(720*8),720); (testfunc)(dst+8+(720*8),720,src+8+(720*8),720);
8c18: 68 d0 02 00 00 push $0x2d0 8c24: 68 d0 02 00 00 push $0x2d0
8c1d: ff b5 d0 4b ff ff pushl 0xffff4bd0(%ebp) 8c29: ff b5 c0 4b ff ff pushl 0xffff4bc0(%ebp)
8c23: 68 d0 02 00 00 push $0x2d0 8c2f: 68 d0 02 00 00 push $0x2d0
8c28: 57 push %edi 8c34: 57 push %edi
8c29: ff 95 f0 4b ff ff call *0xffff4bf0(%ebp) 8c35: ff 95 e0 4b ff ff call *0xffff4be0(%ebp)
(testfunc)(dst+8+(720*8),720,src+8+(720*8),720); (testfunc)(dst+8+(720*8),720,src+8+(720*8),720);
8c2f: 68 d0 02 00 00 push $0x2d0 8c3b: 68 d0 02 00 00 push $0x2d0
8c34: ff b5 d0 4b ff ff pushl 0xffff4bd0(%ebp) 8c40: ff b5 c0 4b ff ff pushl 0xffff4bc0(%ebp)
8c3a: 68 d0 02 00 00 push $0x2d0 8c46: 68 d0 02 00 00 push $0x2d0
8c3f: 57 push %edi 8c4b: 57 push %edi
8c40: ff 95 f0 4b ff ff call *0xffff4bf0(%ebp) 8c4c: ff 95 e0 4b ff ff call *0xffff4be0(%ebp)
8c46: 83 c4 20 add $0x20,%esp 8c52: 83 c4 20 add $0x20,%esp
8c49: 4e dec %esi 8c55: 4e dec %esi
8c4a: 79 cc jns 8c18 <__sl_mcomp_time_U8_U8+0xb8> 8c56: 79 cc jns 8c24 <__sl_mcomp_time_U8_U8+0xc4>
} }
rdtscll(stop); rdtscll(stop);
8c4c: 0f 31 rdtsc 8c58: 0f 31 rdtsc
cycles2 = (uint32_t)(stop-start); cycles2 = (uint32_t)(stop-start);
if (cycles1 < besttime1) besttime1 = cycles1; if (cycles1 < besttime1) besttime1 = cycles1;
8c4e: 8b 95 e0 4b ff ff mov 0xffff4be0(%ebp),%edx 8c5a: 8b 95 d0 4b ff ff mov 0xffff4bd0(%ebp),%edx
8c54: 2b 85 e8 4b ff ff sub 0xffff4be8(%ebp),%eax 8c60: 2b 85 d8 4b ff ff sub 0xffff4bd8(%ebp),%eax
8c5a: 39 95 e4 4b ff ff cmp %edx,0xffff4be4(%ebp) 8c66: 39 95 d4 4b ff ff cmp %edx,0xffff4bd4(%ebp)
8c60: 73 0c jae 8c6e <__sl_mcomp_time_U8_U8+0x10e> 8c6c: 73 0c jae 8c7a <__sl_mcomp_time_U8_U8+0x11a>
8c62: 8b 95 e4 4b ff ff mov 0xffff4be4(%ebp),%edx 8c6e: 8b 95 d4 4b ff ff mov 0xffff4bd4(%ebp),%edx
8c68: 89 95 e0 4b ff ff mov %edx,0xffff4be0(%ebp) 8c74: 89 95 d0 4b ff ff mov %edx,0xffff4bd0(%ebp)
if (cycles2 < besttime2) besttime2 = cycles2; if (cycles2 < besttime2) besttime2 = cycles2;
8c6e: 3b 85 dc 4b ff ff cmp 0xffff4bdc(%ebp),%eax 8c7a: 3b 85 cc 4b ff ff cmp 0xffff4bcc(%ebp),%eax
8c74: 73 06 jae 8c7c <__sl_mcomp_time_U8_U8+0x11c> 8c80: 73 06 jae 8c88 <__sl_mcomp_time_U8_U8+0x128>
8c76: 89 85 dc 4b ff ff mov %eax,0xffff4bdc(%ebp) 8c82: 89 85 cc 4b ff ff mov %eax,0xffff4bcc(%ebp)
8c7c: ff 8d f4 4b ff ff decl 0xffff4bf4(%ebp) 8c88: ff 8d e4 4b ff ff decl 0xffff4be4(%ebp)
8c82: 0f 89 24 ff ff ff jns 8bac <__sl_mcomp_time_U8_U8+0x4c> 8c8e: 0f 89 24 ff ff ff jns 8bb8 <__sl_mcomp_time_U8_U8+0x58>
} }
// fprintf(stderr,"besttime1 is %d, besttime2 is %d\n",besttime1,besttime2); fprintf(stderr,"besttime1 is %d, besttime2 is %d\n",besttime1,besttime2);
8c94: ff b5 cc 4b ff ff pushl 0xffff4bcc(%ebp)
8c9a: ff b5 d0 4b ff ff pushl 0xffff4bd0(%ebp)
8ca0: 8d 83 24 dc ff ff lea 0xffffdc24(%ebx),%eax
8ca6: 50 push %eax
8ca7: 8b 83 38 00 00 00 mov 0x38(%ebx),%eax
8cad: ff 30 pushl (%eax)
8caf: e8 e8 b1 ff ff call 3e9c <_init+0x38>
return (besttime2 - besttime1); return (besttime2 - besttime1);
8c88: 8b 85 e0 4b ff ff mov 0xffff4be0(%ebp),%eax 8cb4: 8b 85 d0 4b ff ff mov 0xffff4bd0(%ebp),%eax
8c8e: 29 85 dc 4b ff ff sub %eax,0xffff4bdc(%ebp) 8cba: 29 85 cc 4b ff ff sub %eax,0xffff4bcc(%ebp)
8c94: 8b 85 dc 4b ff ff mov 0xffff4bdc(%ebp),%eax 8cc0: 8b 85 cc 4b ff ff mov 0xffff4bcc(%ebp),%eax
} 8cc6: 83 c4 10 add $0x10,%esp
8c9a: 8d 65 f8 lea 0xfffffff8(%ebp),%esp }
8c9d: 5e pop %esi 8cc9: 8d 65 f4 lea 0xfffffff4(%ebp),%esp
8c9e: 5f pop %edi 8ccc: 5b pop %ebx
8c9f: 5d pop %ebp 8ccd: 5e pop %esi
8ca0: c3 ret 8cce: 5f pop %edi
8ca1: 8d 76 00 lea 0x0(%esi),%esi 8ccf: 5d pop %ebp
8cd0: c3 ret
8cd1: 8d 76 00 lea 0x0(%esi),%esi

Results


                                                                                     besttime1 is 4384, besttime2 is 8676
C routine takes 3904 cycles                                                          C routine takes 4292 cycles
                                                                                     besttime1 is 942, besttime2 is 1724
MMX routine takes 889 cycles                                                         MMX routine takes 782 cycles
                                                                                     besttime1 is 777, besttime2 is 1486
SSE routine takes 710 cycles                                                         SSE routine takes 709 cycles