Click here to Skip to main content
15,914,066 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
See more:
This is a 4x4 matrix transpose using the MMX unpack instructions (punpckl*, punpckh*).
Why is the output B2[4][4] wrong? Please suggest an improvement.
C++
#include "stdafx.h"
#include "emmintrin.h"
#include <iostream>
#include <iomanip>
using namespace std;


// Original (non-working) attempt from the question: fills a 4x4 matrix B1 with
// 0..15, tries to transpose it into B2 with MMX unpack instructions in an MSVC
// inline-assembly block, then prints B2. The NOTE(review) comments below mark
// the places where the code does not do what a transpose requires.
int _tmain(int argc, _TCHAR* argv[])
{
	// NOTE(review): elements are int, but the punpck?wd instructions below
	// interleave 16-bit words, so element size and instruction width disagree
	// (assuming 32-bit int, as on MSVC x86).
	int B1[4][4]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};// matrix to be transposed
	int B2[4][4];// transposed matrix
	int n=0;
	// Overwrites the initializer above: B1 ends up holding 0..15 in row-major order.
	for (int i=0;i<4;i++)
	  for (int j=0;j<4;j++)
		{
			B1[i][j]=n; n++;
		}
__asm{
// Each movq loads 8 bytes starting at the given byte offset from B1.
// NOTE(review): the offsets 0, 8, 12, 16 are not evenly spaced; the 8-byte
// loads at +8 and +12 overlap, and the loads at +12 and +16 overlap as well.
movq mm1, B1
movq mm2, B1+8
movq mm3, B1+12
movq mm4, B1+16
//step one
// Interleave the low 16-bit words of each register pair, then the low/high
// 32-bit halves of the two interleaved results.
punpcklwd mm1, mm2
punpcklwd mm3, mm4
movq mm5, mm1// copy mm1 into mm5
punpckldq mm1, mm3
punpckhdq mm5, mm3
// Move result to B2
movq B2, mm1
// NOTE(review): mm0 is never written anywhere in this block; the second
// result of step one was computed in mm5.
movq B2+8, mm0
//step two
// NOTE(review): mm1 and mm3 were overwritten by step one, so these unpacks
// no longer operate on the data originally loaded from B1.
punpckhwd mm1, mm2
punpckhwd mm3, mm4
movq mm5, mm1// copy mm1 into mm5
punpckldq mm1, mm3
punpckhdq mm5, mm3
// move result to B2
movq B2+12, mm1
// NOTE(review): same as above - stores mm0, which is never written; the
// computed value is in mm5. The store at +12 also overlaps the store at +8.
movq B2+16, mm0
// Leave MMX state so the x87 FPU can be used again.
emms
}
	// Print B2 one row per line, elements separated by spaces.
	for(int i = 0; i<4; i++){
		for(int j = 0; j<4; j++) cout << B2[i][j] << " ";
			cout << endl;
	}
return 0;
}
Posted
Updated 9-Jul-10 0:08am
v4
Comments
Sandeep Mewara 9-Jul-10 6:08am    
Don't type in full caps, it's considered shouting and looks rude.
SMART LUBOBYA 9-Jul-10 8:15am    
sorry, did not know.

Why not just do B2[i][j] = B1[j][i]?
 
Share this answer
 
There are four bugs in your code:


  1. you have declared B1 and B2 as matrices of int (so each element is 32 bits in size), but your assembler routine works on 16-bit values, so you should declare your matrices as short
  2. you read and write the matrices at byte offsets 0, 8, 12 and 16 (B1, B1+8, B1+12, B1+16, and likewise for B2); with 8-byte rows the right offsets are 0, 8, 16 and 24
  3. after the first step you should reload the matrix B1 into the mm1, mm2, mm3 and mm4 registers, because step one overwrites mm1 and mm3
  4. the second store of each step writes mm0, which is never loaded; the second result row of each step is in mm5


See the code below, now it works properly:

C++
#include <emmintrin.h>
#include <iostream>
#include <iomanip>

using namespace std;

// Transposes a 4x4 matrix of 16-bit integers with MMX unpack instructions
// (MSVC x86 inline assembly) and prints the source matrix followed by the
// transposed one.
//
// A row of a short[4][4] is exactly 8 bytes, i.e. one MMX register, so the
// four rows sit at byte offsets 0, 8, 16 and 24 from the start of the array.
// In the register comments below a, b, c, d denote rows 0..3 of B1 and the
// digit is the column index; words are listed from lowest address upwards.
//
// argc and argv are unused. Always returns 0.
int _tmain(int argc, _TCHAR* argv[])
{
   short B1[4][4]; // matrix to be transposed
   short B2[4][4]; // transposed matrix

   // Fill B1 in row-major order with 0..15.
   int n = 0;
   for (int i = 0; i < 4; i++)
      for (int j = 0; j < 4; j++, n++)
         B1[i][j] = static_cast<short>(n); // values 0..15 always fit in a short

   __asm{
      // Load the four rows of B1.
      movq mm1, B1      // mm1 = a0 a1 a2 a3
      movq mm2, B1+8    // mm2 = b0 b1 b2 b3
      movq mm3, B1+16   // mm3 = c0 c1 c2 c3
      movq mm4, B1+24   // mm4 = d0 d1 d2 d3

      // Step one: columns 0 and 1 (built from the low halves of the rows)
      punpcklwd mm1, mm2   // mm1 = a0 b0 a1 b1
      punpcklwd mm3, mm4   // mm3 = c0 d0 c1 d1
      movq mm5, mm1        // keep a copy; the next unpack overwrites mm1
      punpckldq mm1, mm3   // mm1 = a0 b0 c0 d0
      punpckhdq mm5, mm3   // mm5 = a1 b1 c1 d1

      // Move result to B2 (first two rows)
      movq B2, mm1
      movq B2+8, mm5

      // Step one overwrote mm1 and mm3, so reload the rows of B1.
      movq mm1, B1
      movq mm2, B1+8
      movq mm3, B1+16
      movq mm4, B1+24

      // Step two: columns 2 and 3 (built from the high halves of the rows)
      punpckhwd mm1, mm2   // mm1 = a2 b2 a3 b3
      punpckhwd mm3, mm4   // mm3 = c2 d2 c3 d3
      movq mm5, mm1        // keep a copy; the next unpack overwrites mm1
      punpckldq mm1, mm3   // mm1 = a2 b2 c2 d2
      punpckhdq mm5, mm3   // mm5 = a3 b3 c3 d3

      // Move result to B2 (last two rows)
      movq B2+16, mm1
      movq B2+24, mm5

      // Leave MMX state so the x87 FPU can be used again.
      emms
   }


   // Print the source matrix, one row per line.
   for (int i = 0; i < 4; i++)
   {
      for (int j = 0; j < 4; j++)
         cout << B1[i][j] << " ";
      cout << endl;
   }

   cout << endl;

   // Print the transposed matrix, one row per line.
   for (int i = 0; i < 4; i++)
   {
      for (int j = 0; j < 4; j++)
         cout << B2[i][j] << " ";
      cout << endl;
   }

   return 0;
}
 
Share this answer
 
Comments
SMART LUBOBYA 9-Jul-10 18:37pm    
thanks sauro, good answer really
SMART LUBOBYA 9-Jul-10 18:51pm    
Reason for my vote of 5
Detailed answer. Showed where I went wrong. Very educational.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900