Why does my MMX code run slower than the equivalent C++ code (shown in green)? The results are the same; the only difference is speed.
/// Applies a 4-point integer butterfly to every row of a 4x4 block, in place.
///
/// `btr` is read as 16 contiguous 16-bit signed values laid out as a
/// row-major 4x4 block. For each input row (x0, x1, x2, x3) this computes
///
///     y0 = (x0 + x3) + (x1 + x2)
///     y1 = (x1 - x2) + 2 * (x0 - x3)
///     y2 = (x0 + x3) - (x1 + x2)
///     y3 = (x0 - x3) - 2 * (x1 - x2)
///
/// using wrapping (non-saturating) 16-bit arithmetic, and writes the result
/// back transposed: output row k holds y_k for input rows 0..3, in that lane
/// order.
///
/// NOTE(review): the butterfly matches one pass of an H.264-style 4x4 forward
/// core transform -- confirm against the caller before relying on that.
///
/// @param btr Pointer to the 16 `short` values (32 bytes) to transform.
///            Must be non-null; it is dereferenced unconditionally.
void tom::add(void* btr)
{
    __m64* rows = static_cast<__m64*>(btr);

    // Load the whole block up front (four 64-bit loads) so that every input
    // is read before any output is stored.
    const __m64 r0 = rows[0];
    const __m64 r1 = rows[1];
    const __m64 r2 = rows[2];
    const __m64 r3 = rows[3];

    // Transpose 4x4 in registers so that colN holds element N of each row.
    // Interleaving with unpack instructions avoids gathering the sixteen
    // values one by one through scalar loads and _mm_set_pi16, which is
    // far more expensive than the arithmetic that follows.
    const __m64 lo01 = _mm_unpacklo_pi16(r0, r1);   // r0[0] r1[0] r0[1] r1[1]
    const __m64 hi01 = _mm_unpackhi_pi16(r0, r1);   // r0[2] r1[2] r0[3] r1[3]
    const __m64 lo23 = _mm_unpacklo_pi16(r2, r3);   // r2[0] r3[0] r2[1] r3[1]
    const __m64 hi23 = _mm_unpackhi_pi16(r2, r3);   // r2[2] r3[2] r2[3] r3[3]

    const __m64 col0 = _mm_unpacklo_pi32(lo01, lo23);
    const __m64 col1 = _mm_unpackhi_pi32(lo01, lo23);
    const __m64 col2 = _mm_unpacklo_pi32(hi01, hi23);
    const __m64 col3 = _mm_unpackhi_pi32(hi01, hi23);

    // First butterfly stage: outer pair (0,3) and inner pair (1,2).
    const __m64 outerSum  = _mm_add_pi16(col0, col3);
    const __m64 outerDiff = _mm_sub_pi16(col0, col3);
    const __m64 innerSum  = _mm_add_pi16(col1, col2);
    const __m64 innerDiff = _mm_sub_pi16(col1, col2);

    // Second stage; the shift by one is the multiplication by two.
    rows[0] = _mm_add_pi16(outerSum, innerSum);
    rows[1] = _mm_add_pi16(innerDiff, _mm_slli_pi16(outerDiff, 1));
    rows[2] = _mm_sub_pi16(outerSum, innerSum);
    rows[3] = _mm_sub_pi16(outerDiff, _mm_slli_pi16(innerDiff, 1));

    // MMX registers alias the x87 stack; clear the MMX state so later
    // floating-point code works correctly.
    _mm_empty();
}