|
|
Comments and Discussions
|
|
 |

|
I make some changes in the DataTransferOptimised function, delete some instructions and reorder the instructions to SEMI-avoid RAW hazzard's. Here is the time for some vector's. I put the last DataTransferOptimised function. Sorry about my bad english.
IntSize C O1 O2 O3
1100000 34990 34660 34399 34380
1200000 38655 38185 37865 37844
1300000 43092 42551 42161 42141
int DataTransferOptimised3(int* piDst, int* piSrc, unsigned long SizeInBytes)
{
_asm
{
pusha;
begin3:
mov ecx,SizeInBytes;
mov edi,piDst;
mov esi,piSrc;
mov ebx,0;
begina3:
body3:
movdqa xmm1,[esi+ebx];
cmp ecx,ebx;
jz end3;
movdqa [edi+ebx],xmm1;
add ebx,16;
bodya3:
jmp begina3;
end3:
popa;
}
return 0;
}
|
|
|
|

|
I think you could further improve the optimized version if you made it so that there is only one branch per loop iteration, instead of two.
Put the test (the "does ecx == 0?" code) at the bottom of the body instead of the top, and just put a jump to the test at the beginning. Then you only have one compare and jump per loop iteration.
Something like this:
_asm
{
// remember for cleanup
pusha;
begin:
// init counter to SizeInBytes
mov ecx,SizeInBytes;
// get destination pointer
mov edi,piDst;
// get source pointer
mov esi,piSrc;
// jump to the test the first time
jmp test;
body:
// calculate offset
mov ebx,SizeInBytes;
sub ebx,ecx;
// copy source's content to 128 bits registers
movdqa xmm1,[esi+ebx];
// copy 128 bits registers to destination
movdqa [edi+ebx],xmm1;
// we've done "1 packed == 4 * sizeof(int)" already.
sub ecx,16;
test:
// check if counter is 0, yes end loop.
cmp ecx,0;
jnz body; // I have no idea if JNZ is right, but you get the idea
end:
// cleanup
popa;
}
--kbrafford
-- modified at 17:30 Friday 9th November, 2007
|
|
|
|
|

|
I think this optimization is not correct. On my Pentium 4 3Ghz it shows
following perfomance:
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 31 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 32 msec
Rerun? (y/n)
|
|
|
|

|
first of all, do u install processor pack?
secondly i would like say is, if u turn on opmization in modern compiler, it will help u to optimise the typical transfer code too. because the code is too simple to optimise.
if u modify to do the code slightly, u may see the different.
other way is, turn off the optimisation during ur release compile option.
then u will see the different too.
the article fail to provide you to show the absolute different between typical and optimise before of a few factors, one of them is modern compile do the job for you since the sample code is too simple. or we have to add some complication to the code, for example add some encrytion and formula to avoid automatic optimised by compiler.
have a nice day.
from,
-= aLbert =-
|
|
|
|

|
Using the movntdq instruction instead of movdqa for writing back to main memory gives a significant speed improvement.
movntdq writes directly back to main memory bypassing the cache.
The code below with movntdq moves data at about 91% of the theoretical maximum on my P4 2.4GHz Celeron with DDR333 RAM.
Note that using movntdq is slower than movdqa when the entire array will fit in the cache but about 40% faster when the array is much larger than the cache.
The code when using movdqa runs at about the same speed as the memcpy function.
int DataTransferOptimised(int* piDst, int* piSrc, unsigned long SizeInBytes)
{
unsigned long dwNumElements = SizeInBytes / sizeof(int);
unsigned long dwNumPacks = dwNumElements / (128/(sizeof(int)*8));
_asm
{
pusha;
begin:
mov ecx,SizeInBytes;
mov edi,piDst;
mov esi,piSrc;
begina:
cmp ecx,0;
jz end;
body:
movdqa xmm1, [esi]; //load from memory.
add esi, 16;
movntdq [edi], xmm1; //write back.
add edi, 16;
/* movdqa [edi], xmm1;
add edi, 16;*/
bodya:
sub ecx,16; //decrement counter.
jmp begina;
end:
popa;
}
return 0;
}
|
|
|
|

|
I have applied SSE memory copy to 800*600*sizeof(short) memory block.
(because..nowadays I have abosbed in 800*600(16BIT) resolution PC game programing.)
My test results are defferent from above sample.
and other memory block copy tests has same result.
Above sample results are related to CPU cache.
If source & destination pointer pointed by DataTransferOptimised func" are chaged then
SSE memory copy has no cache gains.
My test program here.
//---------------------------------------------------------------------------
#include < stdio.h >
#include < stdlib.h >
#include < conio.h >
#include < malloc.h >
#ifndef __STANDARD_TYPEDEFS__
#define __STANDARD_TYPEDEFS__
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned int U32;
#endif
//MEMCPY32 is not satefy in miss-aligned memory.
void __fastcall MEMCPY32(U16 *lpiDst, U16 *lpiSrc, int nSize)
{
_asm
{
mov ecx, nSize
mov edi, lpiDst
shr ecx, 2
mov esi, lpiSrc
rep movsd
}
}
unsigned long g_dwCLOCK_HI;
unsigned long g_dwCLOCK_LO;
//reset clock counter.
void StartClockCounter(void)
{
_asm
{
rdtsc
mov g_dwCLOCK_HI, EDX
mov g_dwCLOCK_LO, EAX
}
}
//get clock counter.
unsigned long lGetClockCounter(void)
{
_asm
{
rdtsc
sub EDX, g_dwCLOCK_HI
sub EAX, g_dwCLOCK_LO
mov g_dwCLOCK_HI, EDX
mov g_dwCLOCK_LO, EAX
}
return g_dwCLOCK_LO;
}
int DataTransferTypical(int* piDst, int* piSrc, unsigned long SizeInBytes)
{
unsigned long dwNumElements = SizeInBytes / sizeof(int);
for(int i = 0; i < dwNumElements; i++)
{
// i is offset.
*(piDst + i) = *(piSrc + i);
}
return 0;
}
int DataTransferOptimised(int* piDst, int* piSrc, unsigned long SizeInBytes)
{
unsigned long dwNumElements = SizeInBytes / sizeof(int);
// not really using it, just for debuging. it keeps number of looping.
// it also means number of packed data.
unsigned long dwNumPacks = dwNumElements / (128/(sizeof(int)*8));
_asm
{
// remember for cleanup
pusha;
begin:
// init counter to SizeInBytes
mov ecx,SizeInBytes;
// get destination pointer
mov edi,piDst;
// get source pointer
mov esi,piSrc;
begina:
// check if counter is 0, yes end loop.
cmp ecx,0;
jz end;
body:
// calculate offset
mov ebx,SizeInBytes;
sub ebx,ecx;
// copy source's content to 128 bits registers
movdqa xmm1,[esi+ebx];
// copy 128 bits registers to destination
movdqa [edi+ebx],xmm1;
bodya:
// we've done "1 packed == 4 * sizeof(int)" already.
sub ecx,16;
jmp begina;
end:
// cleanup
popa;
}
return 0;
}
void main(void)
{
int n, nLoop;
int nBufWidth, nBufHeight;
int nDataSize;
char chKey;
unsigned long Time1, Time2;
U16 *lp1, *lp2;
U16 *lp1_aligned, *lp2_aligned;
nBufWidth = 800;
nBufHeight = 600;
nDataSize = nBufWidth*nBufHeight*sizeof(U16); //make 800*600 (16BIT) Buffer.
lp1_aligned = (U16 *)_aligned_malloc(nDataSize, 16);
lp2_aligned = (U16 *)_aligned_malloc(nDataSize, 16);
nLoop = nBufHeight;
do
{
lp1 = lp1_aligned;
lp2 = lp2_aligned;
StartClockCounter();
for(n=0; n < nLoop; n++)
{
MEMCPY32(lp1, lp2, nBufWidth*2);
//32BIT block copy by using MOVSD.
lp1 += nBufWidth;
lp2 += nBufWidth;
}
Time1 = lGetClockCounter();
printf("Elapsed Time1:%d\n", Time1);
lp1 = lp1_aligned;
lp2 = lp2_aligned;
StartClockCounter();
for(n=0; n < nLoop; n++)
{
DataTransferOptimised((int *)lp1, (int *)lp2, nBufWidth*2);
lp1 += nBufWidth;
lp2 += nBufWidth;
}
Time2 = lGetClockCounter();
printf("Elapsed Time2:%d\n", Time2);
if (Time2 < Time1)
printf("%.2f%% Faster.\n",
(float)((float)Time1/(float)Time2)*100-100.);
else
printf("%.2f%% Slower.\n",
(float)((float)Time2/(float)Time1)*100-100.);
printf("More?(y/n)");
chKey = getche();
printf("\n\n");
}while(chKey == 'y');
_aligned_free(lp1_aligned);
_aligned_free(lp2_aligned);
}
//---------------------------------------------------------------------------
|
|
|
|

|
yes unicon..
practically, if u wish to applied this to production code. u may need to rewrite those function to cater for its header and trailer separately.. =) then all the body u can transfer with 128bit registers...
from,
-= aLbert =-
|
|
|
|

|
You can optimize it more by using all MMX registers for the data transfer and also use non-temporal moves to bypass the cache.
See this link:
http://www.joryanick.com/memcpy.htm[^]
SGI had a really good article about memcpy that i can't find anymore...
|
|
|
|

|
interesting site! =) thanks for sharing!
anyway, i have cut and paste one of their code to try out.
here's the result of profiling...
MMXMemoryCopy is using 64bits register (mm0.. mm7)
DataTransferOptimised is using 128bits register (xmm0.. xmm7)
Profile: Function timing, sorted by time
Date: Fri Oct 08 00:40:16 2004
Program Statistics
------------------
Command line at 2004 Oct 08 00:40: "E:\j2\net\code project\InLocal\Fast Data Transfer\Release\Fast Data Transfer"
Total time: 3099.147 millisecond
Time outside of functions: 14.947 millisecond
Call depth: 2
Total functions: 4
Total hits: 61
Function coverage: 100.0%
Overhead Calculated 24
Overhead Average 24
Module Statistics for fast data transfer.exe
--------------------------------------------
Time in module: 3084.200 millisecond
Percent of time in module: 100.0%
Functions in module: 4
Hits in module: 61
Module function coverage: 100.0%
Func Func+Child Hit
Time % Time % Count Function
---------------------------------------------------------
2270.111 73.6 3084.200 100.0 1 _main (data)
277.546 9.0 277.546 9.0 20 DataTransferTypical(int *,int *,unsigned long) (data)
269.881 8.8 269.881 8.8 20 MMXMemoryCopy(void *,void *,unsigned long) (data)
266.663 8.6 266.663 8.6 20 DataTransferOptimised(int *,int *,unsigned long) (data)
from,
-= aLbert =-
|
|
|
|

|
Nice to see that some people are still coding asm now and then. I've optimized the code some more, removing a jump in the process and simplified the offset calculation.
int DataTransferOptimised(void* piDst, void* piSrc, unsigned long SizeInBytes)
{
_asm
{
// remember for cleanup
pusha;
// init counter to SizeInBytes
mov ecx,SizeInBytes;
// get destination pointer
mov edi,piDst;
// get source pointer
mov esi,piSrc;
xor ebx,ebx
// check if counter is 0, yes end loop.
or ecx,ecx
jz end;
body:
// copy source's content to 128 bits registers
movdqa xmm1,[esi+ebx];
// copy 128 bits registers to destination
movdqa [edi+ebx],xmm1;
add ebx,16 // next offset
// we've done "1 packed == 4 * sizeof(int)" already.
sub ecx,16
jnz body;
end:
// cleanup
popa;
}
return 0;
}
|
|
|
|

|
nice try.. u also can try this:
add esi,16
add edi,16
hehe...
from,
-= aLbert =-
|
|
|
|

|
Yeah tought about that, but not sure that is faster then the esi+ebx
|
|
|
|

|
should be.. esi+ebx doesn't have the absolute addr yet, addition is needed.
what do u think? if u not sure, i not sure, then i have to ask my friend who work in intel... haha..
from,
-= aLbert =-
|
|
|
|

|
Well then this would be even faster. But I think it is interresting to know if using edi+ebx is slower or not.
int DataTransferOptimised(void* piDst, void* piSrc, unsigned long SizeInBytes)
{
_asm
{
// remember for cleanup
pusha;
begin:
// init counter to SizeInBytes
mov ecx,SizeInBytes;
// get destination pointer
mov edi,piDst;
// get source pointer
mov esi,piSrc;
mov eax,16
begina:
// check if counter is 0, yes end loop.
or ecx,ecx
jz end;
body:
// copy source's content to 128 bits registers
movdqa xmm1,[esi];
add esi,eax // next offset
// copy 128 bits registers to destination
movdqa [edi],xmm1;
add edi,eax
bodya:
// we've done "1 packed == 4 * sizeof(int)" already.
sub ecx,eax
jnz body;
end:
// cleanup
popa;
}
return 0;
}
|
|
|
|

|
ohya.. "add esi,eax" will faster than "add esi,16" (should be)
but how much clock cycle is needed for register to register and memory to register? may be same?
anyway, can we make this run faster than memcpy? please refer to below feedback.. why the SIMD is so sucks up? because of its lantecy? as been said, movdqa took 6 clock cycle to complete. Zzz..
nice try my friend... =)
|
|
|
|

|
From register it's faster. because the opcode is much smaller. for add edi,eax only one or two bytes are fetched. (have to look it up if it's one or two bytes). The add edi,16 is atleast 5 bytes to fetch.
I didn't test if the smd is slower.. I will check it. Maybe it has a big opcode?
|
|
|
|

|
Instead of using a for loop to copy data, try using memcpy and see what performance that gives. I'd be interested to see the results.
Regards,
Simon Hughes
|
|
|
|

|
have u try to debug into memcpy.asm?
i wonder how it does the copying... =) i dun think it is using xmm registers...
from,
-= aLbert =-
|
|
|
|

|
If I recall correctly, in debug builds memcpy is implemented as a loop (though a bit more optimized than the one in the article). In release builds with intrinsic functions enabled (/Oi switch), the memcpy call is replaced by a few assembly instructions using 'rep stosd/b' to copy data in 32-bit blocks (and then any trailing bytes). Note that the actual copying is done internally by the CPU within the 'rep stosd/b' instruction, and can thus be expected to be about as fast as you get on the regular instruction set.
That having been said, I have not benchmarked it against MMX or SSE. Since the real strength of MMX/SSE is the SIMD features, faster memory access would merely be a secondary bonus. Has anyone benchmarked against intrinsic memcpy, and learned which method is faster for straight data transfer?
|
|
|
|

|
Hi!
I just added the following few lines to the code:
// initialize
memset(piSrc, 1, dwDataSizeInBytes);
memset(piDst, 0, dwDataSizeInBytes);
dwTimeStart = clock();
for(i = 0; i < ITERATION; i++)
memcpy(piDst, piSrc, dwDataSizeInBytes);
dwTimeEnd = clock();
printf("== Simple memcpy of %d * %d times of %d bytes data ==\nTime Elapsed = %d msec\n\n", ITERATION, DATA_SIZE, sizeof(int), dwTimeEnd - dwTimeStart);
Compiled under VS.NET 2003 on a Mobile Pentium 4, 2.66 GHz.
Here are my results (ran both versions 3 times each):
***** DEBUG - VERSION *****
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 141 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 62 msec
== Simple memcpy of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 62 msec
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 140 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 63 msec
== Simple memcpy of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 63 msec
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 125 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 78 msec
== Simple memcpy of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 79 msec
***************************
***** RELEASE - VERSION *****
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 78 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 94 msec
== Simple memcpy of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 78 msec
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 78 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 62 msec
== Simple memcpy of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 78 msec
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 62 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 78 msec
== Simple memcpy of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 62 msec
*****************************
Interesting how the for-loop (Typical Transfer) is optimized in release-version by the VC7.1 compiler.
Also interesting the poor results of the "optimized" code compared to a simple memcpy...
Regards,
Martin
|
|
|
|
|

|
The standard "memcpy" call will run as so fast as DataTransferOptimised.
The memcpy source code is not secret - it is included into MS VC++ as memcpy.asm file.
AG
|
|
|
|

|
Hello,
if I press ReRun 4 times Then the result after 4 run is this
----------------------------------------------------------------------------
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 140 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 141 msec
Rerun? (y/n)
--------------------------------------------------------------------------
So where is the optimization...
---------------COMPLETE TEST-----------------------------------------
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 156 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 125 msec
Rerun? (y/n) y
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 141 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 125 msec
Rerun? (y/n) y
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 156 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 140 msec
Rerun? (y/n) y
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 140 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 140 msec
Rerun? (y/n) y
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 141 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 141 msec
Rerun? (y/n) y
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 141 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 125 msec
Rerun? (y/n) y
== Typical Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 140 msec
== Optimised Transfer of 10 * 1048576 times of 4 bytes data ==
Time Elapsed = 141 msec
Rerun? (y/n)
------------------ENDS HERE-----------------------------------------
cheers
Balkrishna Talele
|
|
|
|

|
what machine u are using? as stated in the article, it should be intel p3 or newer.
when u do debuging, please try to step through both function, u will notice the difference.
typical one it will copy one int per loop ( sizeof(int)=4bytes )
where optimised one it will copy four int per loop ( 4*sizeof(int)=16bytes )
in your watch window, watch "piDst,101". then u will see how it is changing...
u must bear in mind that this optimisation method is machine dependant, which means that if your hardware not support, you won't able to see the different.
from,
-= aLbert =-
|
|
|
|
 |
|
|
General News Suggestion Question Bug Answer Joke Rant Admin
|
A beginner's introduction to one of the optimization methods.
| Type | Article |
| Licence | |
| First Posted | 3 Oct 2004 |
| Views | 91,437 |
| Bookmarked | 31 times |
|
|