There are three bugs in your code:
- You have declared `B1` and `B2` as matrices of `int` (so each element is 32 bits in size), but your assembler routine works on 16-bit values, so you should declare your matrices as `short`.
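- You read and write the matrices at addresses `B`, `B+8`, `B+12` and `B+16`; the right byte offsets are 0, 8, 16 and 24, because each row of four 16-bit values occupies 8 bytes.
- After the first step you should load the matrix `B1` again into the `mm1`, `mm2`, `mm3` and `mm4` registers, because the first step overwrites the registers it unpacks into.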
See the code below, now it works properly:
#include <emmintrin.h>
#include <iostream>
#include <iomanip>
using namespace std;
// Prints a 4x4 matrix of 16-bit values: one row per line, every element
// followed by a single space.
static void print_matrix(const short (&m)[4][4])
{
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
            std::cout << m[i][j] << " ";
        std::cout << '\n';
    }
}

// Demo program: fills the 4x4 matrix B1 with 0..15 (row-major), transposes it
// into B2 with MMX unpack instructions, then prints B1, a blank line, and B2.
//
// The elements are `short` because the punpck*wd instructions operate on
// 16-bit words; one row (4 x 16 bit) is exactly one 64-bit MMX register, so
// row k of a matrix lives at byte offset 8*k.
//
// NOTE(review): `_tmain`/`_TCHAR` are declared in <tchar.h>; this file does not
// include it directly, so it presumably arrives via a precompiled header --
// confirm. The `__asm` block is MSVC inline assembly (x86 targets only).
//
// argc/argv are unused. Always returns 0.
int _tmain(int argc, _TCHAR* argv[])
{
    short B1[4][4];
    short B2[4][4];

    // B1[i][j] = 4*i + j, i.e. 0..15 in row-major order.
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            B1[i][j] = static_cast<short>(i * 4 + j);

    // Notation below: a/b/c/d are rows 0/1/2/3 of B1, words listed from the
    // lowest to the highest 16-bit lane of the register.
    __asm {
        // ---- Step 1: columns 0 and 1 of B1 -> rows 0 and 1 of B2 ----
        movq mm1, B1            // mm1 = a0 a1 a2 a3
        movq mm2, B1+8          // mm2 = b0 b1 b2 b3
        movq mm3, B1+16         // mm3 = c0 c1 c2 c3
        movq mm4, B1+24         // mm4 = d0 d1 d2 d3
        punpcklwd mm1, mm2      // mm1 = a0 b0 a1 b1
        punpcklwd mm3, mm4      // mm3 = c0 d0 c1 d1
        movq mm5, mm1           // keep a copy; the next unpack overwrites mm1
        punpckldq mm1, mm3      // mm1 = a0 b0 c0 d0  (column 0)
        punpckhdq mm5, mm3      // mm5 = a1 b1 c1 d1  (column 1)
        movq B2, mm1
        movq B2+8, mm5

        // ---- Step 2: columns 2 and 3 of B1 -> rows 2 and 3 of B2 ----
        // mm1 and mm3 were overwritten in step 1, so the rows are loaded again.
        movq mm1, B1
        movq mm2, B1+8
        movq mm3, B1+16
        movq mm4, B1+24
        punpckhwd mm1, mm2      // mm1 = a2 b2 a3 b3
        punpckhwd mm3, mm4      // mm3 = c2 d2 c3 d3
        movq mm5, mm1
        punpckldq mm1, mm3      // mm1 = a2 b2 c2 d2  (column 2)
        punpckhdq mm5, mm3      // mm5 = a3 b3 c3 d3  (column 3)
        movq B2+16, mm1
        movq B2+24, mm5

        // Leave MMX state so later x87 floating-point code works correctly.
        emms
    }

    print_matrix(B1);
    std::cout << '\n';
    print_matrix(B2);
    return 0;
}