Add your own alternative version
Stats
55.2K views 284 downloads 28 bookmarked
Posted
11 Feb 2008

Comments and Discussions



Hello,
here is a source for the modified bubble sort method:
<br />
#include <stdio.h><br />
#include <stdlib.h><br />
#include <time.h><br />
#include <windows.h><br />
<br />
<br />
#define MAX_ELEMENTS 65536<br />
#define SUBARRAY_SIZE 256<br />
<br />
#define _min(a,b) ((a) < (b) ? (a) : (b))<br />
<br />
<br />
int bs(int* lpSrc, int iNumberElements);<br />
int mbs(int* lpSrc, int iNumberElements, int iSubArraySize);<br />
void output(char* lpszFile, int* lpSrc, int iNumberElements);<br />
void InitTimer();<br />
double GetTime();<br />
<br />
<br />
double g_fFrequency = 0.0;<br />
<br />
<br />
int main()<br />
{<br />
InitTimer();<br />
<br />
srand((unsigned)time(NULL));<br />
int* lpData = new int[MAX_ELEMENTS];<br />
int* lpTempData = new int[MAX_ELEMENTS];<br />
for (int i=0; i<MAX_ELEMENTS; i++)<br />
{<br />
lpData[i] = rand() % MAX_ELEMENTS;<br />
lpTempData[i] = lpData[i];<br />
}<br />
<br />
output("original.txt", lpData, MAX_ELEMENTS);<br />
<br />
double bsStartTime = GetTime();<br />
bs(lpTempData, MAX_ELEMENTS);<br />
double bsEndTime = GetTime();<br />
printf("BubbleSort total time: %.2f [ms]\r\n", (bsEndTimebsStartTime));<br />
output("sorted_bs.txt", lpTempData, MAX_ELEMENTS);<br />
<br />
double mbsStartTime = GetTime();<br />
mbs(lpData, MAX_ELEMENTS, SUBARRAY_SIZE);<br />
double mbsEndTime = GetTime();<br />
printf("ModifiedBubbleSort total time: %.2f [ms]\r\nPress ENTER to continue...", (mbsEndTimembsStartTime));<br />
output("sorted_mbs.txt", lpData, MAX_ELEMENTS);<br />
<br />
delete lpData;<br />
delete lpTempData;<br />
<br />
getchar();<br />
<br />
return 0;<br />
}<br />
<br />
int bs(int* lpSrc, int iNumberElements)<br />
{<br />
int iResult = 0;<br />
<br />
if (lpSrc != NULL)<br />
{<br />
for (int i=0; i<iNumberElements1; i++)<br />
{<br />
for (int j=i+1; j<iNumberElements; j++)<br />
{<br />
if (lpSrc[i] > lpSrc[j])<br />
{<br />
int iTemp = lpSrc[i];<br />
lpSrc[i] = lpSrc[j];<br />
lpSrc[j] = iTemp;<br />
}<br />
}<br />
}<br />
<br />
return iResult;<br />
}<br />
else<br />
{<br />
iResult = 1;<br />
}<br />
<br />
return iResult;<br />
}<br />
<br />
int mbs(int* lpSrc, int iNumberElements, int iSubArraySize)<br />
{<br />
int iResult = 0;<br />
<br />
int iMaxSubArraySize = iNumberElements / 2;<br />
if ((lpSrc != NULL) && (iSubArraySize > 0) && (iSubArraySize <= iMaxSubArraySize))<br />
{<br />
int iStart = 0;<br />
int iEnd = _min(iNumberElements, iStart + iSubArraySize);<br />
bool bFirstRun = true;<br />
while (bFirstRun)<br />
{<br />
for (int i=iStart; i<iEnd1; i++)<br />
{<br />
for (int j=i+1; j<iEnd; j++)<br />
{<br />
if (lpSrc[i] > lpSrc[j])<br />
{<br />
int iTemp = lpSrc[i];<br />
lpSrc[i] = lpSrc[j];<br />
lpSrc[j] = iTemp;<br />
}<br />
}<br />
}<br />
iStart = iEnd;<br />
iEnd = _min(iNumberElements, iStart + iSubArraySize);<br />
if (iStart >= iNumberElements)<br />
{<br />
bFirstRun = false;<br />
}<br />
}<br />
<br />
int* lpTemp = new int[iNumberElements];<br />
<br />
int* lpSrc1 = lpSrc;<br />
int* lpSrc2 = lpSrc;<br />
iStart = iSubArraySize;<br />
iEnd = _min(iNumberElements, iStart + iSubArraySize);<br />
bool bSecondRun = true;<br />
while (bSecondRun)<br />
{<br />
int k = 0;<br />
int i = 0;<br />
int j = iStart;<br />
while (true)<br />
{<br />
if (i < iStart)<br />
{<br />
if (j < iEnd)<br />
{<br />
if (lpSrc1[i] <= lpSrc2[j])<br />
{<br />
lpTemp[k++] = lpSrc1[i];<br />
i++;<br />
}<br />
else<br />
{<br />
lpTemp[k++] = lpSrc2[j];<br />
j++;<br />
}<br />
}<br />
else<br />
{<br />
lpTemp[k++] = lpSrc1[i];<br />
i++;<br />
}<br />
}<br />
else if (j < iEnd)<br />
{<br />
if (i < iStart)<br />
{<br />
if (lpSrc2[j] <= lpSrc1[i])<br />
{<br />
lpTemp[k++] = lpSrc2[j];<br />
j++;<br />
}<br />
else<br />
{<br />
lpTemp[k++] = lpSrc1[i];<br />
i++;<br />
}<br />
}<br />
else<br />
{<br />
lpTemp[k++] = lpSrc2[j];<br />
j++;<br />
}<br />
}<br />
if (k >= iEnd)<br />
{<br />
break;<br />
}<br />
}<br />
for (int l=0; l<k; l++)<br />
{<br />
lpSrc[l] = lpTemp[l];<br />
}<br />
iStart = iEnd;<br />
iEnd = _min(iNumberElements, iStart + iSubArraySize);<br />
if (iStart >= iNumberElements)<br />
{<br />
bSecondRun = false;<br />
}<br />
}<br />
delete lpTemp;<br />
}<br />
else<br />
{<br />
iResult = 1;<br />
}<br />
<br />
return iResult;<br />
}<br />
<br />
void output(char* lpszFile, int* lpSrc, int iNumberElements)<br />
{<br />
FILE* f = fopen(lpszFile, "wb");<br />
if (f != NULL)<br />
{<br />
char szText[10];<br />
for (int i=0; i<iNumberElements; i++)<br />
{<br />
sprintf(szText, "%d ", lpSrc[i]);<br />
fwrite(szText, sizeof(char), strlen(szText), f);<br />
}<br />
<br />
fclose(f);<br />
}<br />
}<br />
<br />
void InitTimer()<br />
{<br />
unsigned __int64 frequency;<br />
QueryPerformanceFrequency((LARGE_INTEGER*)&frequency);<br />
g_fFrequency = 1000.0 / (double)frequency;<br />
}<br />
<br />
double GetTime()<br />
{<br />
unsigned __int64 current;<br />
QueryPerformanceCounter((LARGE_INTEGER*)¤t);<br />
return (current * g_fFrequency);<br />
}<br />
The speed is not an issue here (MBS  Modified Bubble Sort runs 150 times faster then the original BS  Bubble Sort), although this is not an optimized version, just experimental.
Best regards,
Darkoman
"Avaritia est radix omnium malorum..."





Hello,
I was playing with different sorting methods some time ago.
The bubblesort, although it is the slowest, it is not the worst to be used sometime.
I have tried one trick to speed it up a bit:
1. I have splitted the original array (ie. 65536 items to 64 arrays of 1024 items).
2. I have sorted each subarray using bubblesort method.
3. I have joined again these subarrays in order to get the final sorted array.
I got timings that are 10 or more times better then the original bubblesort.
Also, I have used arrays with 1000000 elements and sorted it in a second or two.
So, although, the bubblesort method is a slow one it for sure can gain some speed from a good implementation.
PS.
I am looking for the implementation code.
I'll upload it here when I find it.
Best regards,
Darkoman
"Avaritia est radix omnium malorum..."





Nice article!
Well, I'm playing with SSE2 and sorting and found your article. Something that took me a night to figure out properly is how to compare and swap efficiently (read low register usage, parallel, and no branch prediction issues).
The instruction(s) you want is (are) PCMPGTD.
PCMPGTD xmm1, xmm2/m128
Compare packed doublewords in xmm1 and xmm2/m128 for greater than.
To compare 8 32bit integers, you can use 2 vectors of 4 integers each, total 8 integers to sort oneonone (1st stage of comparison.) So the code in GCC Vector Extensions would be something like:
void
cmpvec(void) {
v4_u32 x = {4,1,8,7}, y = {3,2,0,5}, r;
r = __builtin_ia32_pcmpgtd128 (x, y);
x ^= y;
y ^= x & r;
x ^= y;
}
The register swap on mask technique comes from "Exchanging Corresponding Fields of Registers" in Hacker's Delight.
This snippet will roughly compile to:
gcc g O3 simd.c o simd && gdb simd
[...]
<cmpvec+84>: callq 0x100001c6c <dyld_stub_memset>
<cmpvec+89>: movl $0x3,0x200(%rbp)
<cmpvec+99>: movl $0x2,0x1fc(%rbp)
<cmpvec+109>: movl $0x5,0x1f4(%rbp)
<cmpvec+119>: movdqa 0x200(%rbp),%xmm2
<cmpvec+127>: movdqa 0x100(%rbp),%xmm1
<cmpvec+135>: movdqa %xmm1,%xmm0
<cmpvec+139>: pcmpgtd %xmm2,%xmm0
<cmpvec+143>: pxor %xmm2,%xmm1
<cmpvec+147>: pand %xmm1,%xmm0
<cmpvec+151>: pxor %xmm2,%xmm0
<cmpvec+155>: movdqa %xmm0,0x200(%rbp)
<cmpvec+163>: pxor %xmm0,%xmm1
<cmpvec+167>: movdqa %xmm1,0x100(%rbp)
[...]
Note the generated code uses only 3 xmm registers. Next step would be doing packed shuffle to rotate the values and do it again. If you're careful the compiler can do loopunrolling and use all 16 registers almost in parallel (it could be hinted on mixing shuffle as move operations on some steps to spread out the work across the execution units and improve the pipeline).
Of course, all this would be much easier to implement with a merge sort!
Cheers.
Alecco Locco





Nice trick! How about another? Just putting ints in the right order is of pretty limited value; practical sorts need both a key and datum. A double has room for an int32, or for 7 bytes, plus a record index in the low bits. This makes a useful microsort  the final step for small partitions in a quicksort. As a plus, you can directly use MINPD and MAXPD, which is where this article started.
Bubble sort and insertion sort require branching; not great for speed. I come from the GPGPU world (boy, are the XMM registers a tiny sandbox to play in!) where the grandaddy sort algorithm is bitonic sort, Bitonic sort was ideal for the earliest GPU's used nongraphically: it requires no branching. If you use all eight XMM registers, you get a ridiculously fast sort of two sets of eight numbers; merging the two sorted sets finishes the job. The GCC/ASM code for that is here, and GCC code without explicit ASM here. Have fun, and keep those bits flipping!
"Dreams come true, not free"  S.Sondheim
modified 25Dec11 14:50pm.





Wow, did I really last comment on this almost two years ago?
If you want to see the "C" source code for a complete bitonic sort, check out
this post. If someone wants to branch the code to do an AVX version (16 YMM registers of 256 bits) I'd appreciate it.
"Dreams come true, not free"  S.Sondheim





It appears that the images in the results section are identical





Hey, good point. I just emailed the correction  it should be "result2.jpg" instead of "result1.jpg".
Actually, it should be .png, but it must have gotten converted somehow!





Don't say bubble sort is useless. It's easier to implement than other algorithms and it may not be the performance bottle neck. What's really is useless is optimising something without check first whether it's the performance bottleneck.
Anyways, nice artical. SSE is cool!





got my 5!





Nice article, I enjoyed following your exploration of SSE2. Your style is very upfront and readable, and I think we have some similarities when it comes to how we approach programming (I too harbour a secret lust to pack everything into registers... )
Now here is the tip:
Modern CPUs HATE conditional jumps, especially ones that cannot be accurately predicted in advance. That is because internally they are heavily pipelined, meaning that successive instructions begin executing before the current instruction completes, and when the CPU encounters a conditional jump it usually does not know which sequence of instructions to begin executing next because the result of the test is not "in yet". (A P4 has around 20 instructions on the go at a time!) The CPU makes a best guess at whether the jump will be taken and begins executing those instructions "speculatively"... But if the guess turns out to be wrong, the results of all that work must be discarded! So when I saw those CMP EAX,2 / JE ... sequences in your second code snippet I winced a little  each of those jumps will occur almost exactly 50% of the time, in a completely random ( = unpredictable) fashion, so you are incurring 0.5 pipeline stalls per jump.
Fortunately, there exists a much better way to do the type of thing you want to do here, without using ANY conditional jumps. Suppose you have a pair of values L and H in the low and high quadwords of xmm0 . You do the comparison using cmpltpd as usual, then what you do is create two temporary register values, one containing L in both its quadwords, the other containing H in both its quadwords. Then, by using the mask value produced by cmpltpd , you can use pand on the first value to effectively "place" L in its rightful place  the resulting value will contain L in either its low quadword or its high quadword, depending on whether L was less or greater than H, with the other quadword containing all 0bits. pandn can be used to "place" H in the opposite quadword of another register. Finally the two register values can be combined with por to produce a value containing the lesser of L and H in its low quadword, and the greater of L and H in its high quadword. This technique of "selecting out" and "combining" with bitmasks produced by comparisons is quite general.
So here is the code I propose:
movapd xmm4,xmm0
movapd xmm5,xmm0
shufpd xmm4,xmm4,1
cmpltpd xmm0,xmm4
movapd xmm6,xmm0
shufpd xmm4,xmm4,0
shufpd xmm5,xmm5,0
pand xmm0,xmm4
pandn xmm6,xmm5
por xmm0,xmm6
Although there are 10 instructions here, I'm almost certain they will execute faster than the 7 in your comparable code segment shown below, due to absence of unpredictable code flows:
order_min0:
movapd xmm4, xmm0;
shufpd xmm4, xmm4, 1;
cmpltpd xmm4, xmm0;
movmskpd eax, xmm4;
cmp eax, 2;
je order_min1;
shufpd xmm0, xmm0, 1;
order_min1:
Hope this helps!
Tim





Well, it's comments like this which pretty much make the whole effort of writing an article worthwhile.
The only thing I know knew about branch prediction is was that it exists and how to spell it.
Now I have some concrete material to go on...
I certainly will explore the great lead you've given me here, and am much obliged for it.
One thing I'm not clear on yet is how is it that the CPU doesn't just go ahead and execute both branches simultaneously? I am really ignorant about how and where a CPU stores the results of branch prediction, but it would seem to me that it could just as easily grind out both execution paths when they are this short, and reassemble the correct one into the flow after the cmp instruction.





codekaizen wrote: One thing I'm not clear on yet is how is it that the CPU doesn't just go ahead and execute both branches simultaneously? I am really ignorant about how and where a CPU stores the results of branch prediction, but it would seem to me that it could just as easily grind out both execution paths when they are this short, and reassemble the correct one into the flow after the cmp instruction.
The problem is:
1. The CPU doesn't have the resources to run both side of branches
2. It doesn't know which side of the branch to start loading into the pipeline until the compare is done
3. Therefore, it needs to start loading instructions up to 30 instructions before it finishes executing the compare
So it is impossible to know which bit of code to run, and you can't run both, so the processor has to sit doing nothing.
This is where the prediction part comes in: since the processor has probably been through this patch of code before, it can guess which way the branch will go and start taking instructions from there and running them. If it guesses right, no problem. If it's wrong, this is no problem as it would have been stalled doing nothing anyway, so it backs up to where it was.
(Note, in the Pentium 4 and other advanced processors, sometimes what you suggest does happen  both parts of the branch are run. The Pentium 4 had to do this because it's pipeline was too long and Intel were essentially wrong about processor design. This is why the Core architecture is much closer to the Pentium 3 and Pentium M  the Pentium 4 was an evolutionary deadend. Note also, the Pentium 4 had a very long pipeline to enable it to run at high clock speeds  orginially it was aimed to reach at least 5GHz, but it didn't work out, so they had to change tac.)





RobinMessage: I didn't know that the Pentium 4 sometimes takes both paths. But it makes sense in a case like this. Can't imagine how complicated the core of that Pentium 4 CPU must be  reordering instructions, checking for dependencies, renaming registers on the fly!
codekaizen, I'm glad you found my post helpful. I enjoy exploring things like this and I've picked up quite a number of "nifty" performance tricks from various places over time. Hope you continue to post about your code travels!
One more thing: looking back at the code snippet I posted, I can eliminate one movapd instruction and one register (xmm6 ):
movapd xmm4,xmm0
movapd xmm5,xmm0
shufpd xmm4,xmm4,1
cmpltpd xmm0,xmm4
shufpd xmm4,xmm4,0
shufpd xmm5,xmm5,0
andpd xmm4,xmm0
andnpd xmm0,xmm5
orpd xmm0,xmm4
I also changed pand , pandn and por to andpd , andnpd and orpd respectively  these do exactly the same thing, but apparently are faster when working with doubleprecision FP values.
I've got the optimization bug!
Tim





I check your benchmark and it's seems to deadly slow for a highlevel assembly code implementation.
65000 items take 35 seconds to sort?? That's doesn't make sense to me at all.
I'm wondering about the code or the profiling technique.





Yes, it is very slow. That is because I am using a Bubble Sort, which is almost, but not quite, the slowest sort which you can possibly imagine (only the BogoSort is worse, but it is arguably not even a sorting algorithm).
Since it is O(n^2) performance, you are doing ~ 2^16 * 2^16, or 2^32 (4.2 billion) comparisons in 20 seconds (not 35, that is the pointer table you are referring to). That is extremely fast.
It is the algorithm that is slow, not the implementation.





Just as an experiment, can you try implementing a quicksort or an insertion sort and show us how much faster it is?





It's in the works now, however it won't be done soon.
You'll be able to track the progress at http://www.codeplex.com/CodeRule as I get time to get around to it.
Even more fun will be had when more than one core can be taken advantage of. Since quicksort, for example, is a binary partitioning algorithm, you could place each partition on a separate core and watch it fly.
Of course, it really isn't very portable, and it is certainly not verifiable, but it seems it would be useful to have some of these performanceoriented primitives for the .Net platform.





i would love to see it with insertion
i have a file of over 5000000 i need to insert and sort







General News Suggestion Question Bug Answer Joke Praise Rant Admin Use Ctrl+Left/Right to switch messages, Ctrl+Up/Down to switch threads, Ctrl+Shift+Left/Right to switch pages.

