|
Hey. Nice stuff, found it useful.
I decided to try my hand at the assembler, poked around the link listed by Ante above. My result is below. In my case, I "init" the CRC in a separate call, this routine works a series of buffers as they're received until I reach the end. My table is a global variable, not a static member of the object, but otherwise it's like yours. I noted that on the site Ante links to, they claimed 158 Mbytes per second on a T'bird 1.4 Ghz. On my 1.5Ghz AMD XP I get 275 Mbytes per second, not sure if the T'bird should be that far behind or not. For me, the punch came from loading a quad word at a time, not a byte at a time.
__asm
{
mov eax, this // Load 'this'
mov ecx, [eax]this.CurrentCRC // Load running CRC from 'this'
mov edi, offset Crc32Table // Load the CRC32 table
mov esi, buf // Load buffer
xor ebx, ebx // zero ebx, used to process bytes, forming
// index into crc table
mov eax, len // get length
mov edx, eax
and eax, 3 // calc remainder after division by 4
push eax // preserve the remainder for later
shr edx, 2 // div by 4, calculating total quadwords
jz crc32tail // if zero, prepare for a tiny bit of work
crc32loop:
mov eax, [esi] // grab a quadword from buf
mov bl, al // form index entry, starting with a byte from buf
// part one of 4 in the quadword
xor bl, cl // xor against current CRC
shr ecx, 8 // shift CRC
xor ecx, [edi + ebx * 4] // xor CRC with the table's entry
// part two of 4 in the quadword
mov bl, ah // grab another byte of buf
xor bl, cl // xor against current CRC
shr ecx, 8 // shift CRC
xor ecx, [edi + ebx * 4] // xor CRC with table's entry
shr eax, 16 // shift the buf data two bytes down
// part three of 4 in the quadword
mov bl, al // grab another byte of buf
xor bl, cl // xor against current CRC
shr ecx, 8 // shift CRC
xor ecx, [edi + ebx * 4] // xor CRC with table's entry
// part four of 4 in the quadword
mov bl, ah // grab another byte of buf
xor bl, cl // xor against current CRC
shr ecx, 8 // shift CRC
xor ecx, [edi + ebx * 4] // xor CRC with table's entry
add esi, 4 // Advance the source pointer one quadword
dec edx // counting quadwords
jnz crc32loop // if more quadwords, loop
crc32tail:
pop edx // retreive the remainder of quadwords
cmp edx, 0 // check to see if it's zero
je crc32end
crc32tinyloop:
mov bl, byte ptr [esi] // grab one byte from buf
xor bl, cl // xor against current crc
shr ecx, 8 // shift crc
xor ecx, [edi + ebx * 4] // xor crc with table's entry
inc esi // increment buf pointer
dec edx // dec count
jnz crc32tinyloop // loop if not zero
crc32end:
mov eax, this
mov [eax]this.CurrentCRC, ecx // write to CurrentCRC
|
|
|
|
|
Hahaha
I've been reading way to much on the Opteron!
Sorry guys.
My post kept refering to quadwords in that should have read dwords.
Same theory, just half the size.
Now, on the Opteron - once I get one - this should really zing right through it.
|
|
|
|
|
#pragma once
#include <iostream>
#include <tchar.h>
#include <windows.h>
----------------------------------------------
<pre>
#include "stdafx.h"
#define POLY 0xEDB88320
#define SEED 0xFFFFFFFF
DWORD CRCTable[256];
int _tmain(int argc, _TCHAR* argv[])
{ DWORD crc32;
__asm ; CRCTable filling.
{ xor ebx, ebx ; bl - CRCTable entry index. Init with 0.
mov ecx, POLY ; Load polynom into ecx for speed
CRCTableLoop: ; Head of CRC table calculation main loop.
mov eax, ebx ; Load index into eax.
xor edx, edx ; edx = 0
shr eax, 1 ; Carry Falg = LSB of eax.
cmovc edx, ecx ; edx = (eax & 1) ? POLY : 0
xor eax, edx ; eax = (eax & 1) ? eax ^ POLY : eax
; The same with other bits
xor edx, edx
shr eax, 1
cmovc edx, ecx
xor eax, edx
xor edx, edx
shr eax, 1
cmovc edx, ecx
xor eax, edx
xor edx, edx
shr eax, 1
cmovc edx, ecx
xor eax, edx
xor edx, edx
shr eax, 1
cmovc edx, ecx
xor eax, edx
xor edx, edx
shr eax, 1
cmovc edx, ecx
xor eax, edx
xor edx, edx
shr eax, 1
cmovc edx, ecx
xor eax, edx
xor edx, edx
shr eax, 1
cmovc edx, ecx
xor eax, edx
mov CRCTable[4*ebx], eax ; Fill the current table entry.
inc bl ; Move to the next table entry.
jnz CRCTableLoop ; If index < 256 Then continue CRCTable values
; Else Table is full.
}
printf("CRC32\t\tFileSize\tFileName\n");
for(int i = 1; i < argc; ++i)
{ HANDLE hFile = CreateFile(argv[i], GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL);
if(hFile == INVALID_HANDLE_VALUE)
{ printf("Error opening\t%s\n", argv[i]);
continue;
}
HANDLE hMap = CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
if(hMap == NULL)
{ CloseHandle(hFile);
printf("Error creating map\t%s\n", argv[i]);
continue;
}
LPVOID pBuffer = MapViewOfFile(hMap, FILE_MAP_READ, 0, 0, 0);
if(pBuffer == NULL)
{ CloseHandle(hMap);
CloseHandle(hFile);
printf("Error MapViewOfFile\t%s\n", argv[i]);
continue;
}
DWORD dwSize = GetFileSize(hFile, NULL);
__asm
{ mov esi, pBuffer ; esi = buffer pointer.
mov ecx, dwSize ; ecs = buffer size.
mov eax, SEED ; Init CRC.
CalcCRC: ; Head of buffer CRC calculation main loop.
movzx ebx, byte ptr [esi] ; bl = next char, other ebx bits = 0
xor bl, al ; /Calculate
shr eax, 8 ; | current
mov ebx, CRCTable[4*ebx] ; \CRC value.
xor eax, ebx ; eax = current CRC value.
inc esi ; Move to the next char.
dec ecx ; Decrement of remaining bytes counter.
jnz CalcCRC ; IF counter > 0 THEN continue buffer CRC calculation.
not eax ; eax ^= 0xFFFFFFFF
mov crc32, eax ; save CRC
}
DWORD t3 = GetTickCount();
printf("0x%-08X\t%-8d\t%s\n", crc32, dwSize, argv[i]);
UnmapViewOfFile(pBuffer);
CloseHandle(hMap);
CloseHandle(hFile);
}
return 0;
}
|
|
|
|
|
Here's a loop I came up with:
#pragma warning(push)
#pragma warning(disable : 4035)
unsigned long Crc32_Asm(char *pBuf, unsigned nBytes,
unsigned *pCrcTbl, unsigned nCrc)
{
__asm {
mov edi,pBuf
mov ecx,nBytes
mov eax,nCrc
mov ebx,pCrcTbl
add edi,ecx
neg ecx
mov edx,eax
again:
and edx,0x000000ff
shr eax,8
movzx esi,byte ptr [edi+ecx]
xor edx,esi
mov esi,[ebx+edx*4]
xor eax,esi
inc ecx
mov edx,eax
jnz again
}
}
#pragma warning(pop)
|
|
|
|
|
Hey there,
I'm using your awesome crc generator. I have a question though: is it possible that a crc would be returns as 0, and how probable is this? I am assuming that I don't have a crc if it is equal to 0, so I'm wondering how safe is this of an assumption.
Cheers,
swinefeaster
Check out Aephid Photokeeper, the powerful digital
photo album solution at www.aephid.com.
|
|
|
|
|
A CRC of 0 is entirely possible. The most likely case for a CRC of 0 is in the case of a zero byte file. But it is also possible for a non-zero byte file to have a CRC of 0. Basically all CRC32s from 0 to 0xFFFFFFFF are possible. Thus you cannot rely on the CRC return value to determine if the CRC was successful. That's way all of my CRC functions take the CRC as a parameter and return an error code from the function. You must use that error code to check for errors, not the CRC value.
|
|
|
|
|
Thanks!
Check out Aephid Photokeeper, the powerful digital
photo album solution at www.aephid.com.
|
|
|
|
|
What if I were to wrap your crc generator such that a crc or 0 is never returned? As in, if for example:
<br />
if(errorCode == NO_ERROR)<br />
{<br />
<br />
if(Crc == 0)<br />
{<br />
Crc = 1;<br />
}<br />
}<br />
else<br />
{<br />
Crc = 0;<br />
}<br />
Would this be a fairly reasonable thing to do? Obviously, any files with the real crc of 0 or 1 would be regarded as having the same crc, but this would still be a very small likelyhood, no?
What do yo think?
swine
Check out Aephid Photokeeper, the powerful digital
photo album solution at www.aephid.com.
|
|
|
|
|
why would you wanna do that ?
besides it wouldnt be a real crc32 checksum then.
|
|
|
|
|
The original strength of the checksum is 1 in 4294967296. If you removed one of the possibilities, it would increase to 1 in 4294967295.
4294967295 / 4294967296 is 99.99999998% its original effectiveness.
BUT, why would you do that? How do you "not have a checksum"? Are you trying to cheat and steal a value as a "failure code". DON'T do that. Do this:
int SomeFunc(int someParam, unsigned *pRetCRC)
{
if (errorCode != NO_ERROR)
return SOME_ERROR_VALUE;
*pRetCRC = nTheCrc;
return SOME_SUCCESS_VALUE;
}
and call it like this:
unsigned nMyCrc;
if (SomeFunc(nSomeVar, &nMyCrc) == SOME_ERROR_VALUE)
return SOME_ERROR_VALUE;
|
|
|
|
|
swinefeaster wrote:
is it possible that a crc would be returns as 0, and how probable is this?
Quite possible.
There is a 1 in 2**32 chance
or
1 in 4,294,967,296
chance of getting a zero crc. Four billion is NOT that big.
|
|
|
|
|
What's special with the crc32 algo used in the png files? Can somebody show me som code for this, which doesn't use a LUT?!?
|
|
|
|
|
What's special with the crc32 algo used in the png files?
???
Special? There's nothing special with the CRC32 usind in PNG, it's just the standard CRC-32 polynomial. What made you think there was anything special about it?
Can somebody show me som code for this, which doesn't use a LUT?!?
You almost always use a lookup table. The difference might be that some implementations provide it explicitly while others generate it "on the fly". For an example of the latter you might want to Google just one before asking. Try "CRC-32 generate".
|
|
|
|
|
On NT platforms it may be possible to increase the performance very slightly by using 2 buffers & overlapped IO, so that the OS is loading your next buffer whilst you are calculating the CRC of the current buffer. Does anyone with a good grasp of overlapped IO care to try it?
Thanks for the absolutely excellent sample Brian!
|
|
|
|
|
Thanks for the idea. I don't know why I didn't use overlapped IO in the first place. The results of my tests are overlapped IO is and isn't faster. Let me explain. I created several new CRC32 functions that were identical to the assembly functions, except the file reading was done using overlapped IO. When I ran the tests sometimes overlapped IO is faster, but most of the time it's not. The results were all over the place, especially in the "static" class. Sometimes the overlapped IO function was faster than assembly function, but most of the time it was slower. The results in the "dynamic" class were more consistent. The assembly function was always faster than the overlapped IO. To be honest, I don't even know why there is a speed difference between the dynamic and static classes in the first place. However, one thing is true from my testing, the dynamic assembly function is still the fastest. Faster than overlapped IO and definitely faster than filemaps.
Again, these test results come as a surprise to me. I originally expected the filemap function to be the fastest, but that proved wrong do to the sequential versus random file access. But I would have expected overlapped IO to be faster. The only reason a programmer would want to use overlapped IO is because there is a lot of processing that could be done while the file is being read. My guess right now is the assembly CRC code is so efficient that it completes the CRC'ing of the previous buffer so fast, that it ends up waiting for the next buffer just as if it were synchronous IO. But the extra overhead associated with overlapped IO (creation of multiple buffers and copying the data from one buffer to the other) is enough to cause the code to run slower.
Nonetheless, I'll try and get my overlapped IO code posted just in case someone else can tweak the code so that it runs faster.
|
|
|
|
|
Hi,
I am building an SFV-checker in MFC and found a small ASM routine to calculate the CRC32 just like yours. I am trying to get overlapped I/O but havent got that working correctly as beeing faster. Would you mind pasting your code. Maybe I can do some improvments. The code I have today is fairly fast Intel but very slow on AMD. Still I/O seems to be the bottleneck rather than the CPU.
|
|
|
|
|
What if I want to run a check sum on one of my internal resource files? To add a bit of protection to my own software? How could i use this to do that?
|
|
|
|
|
Thankx!
Check out Aephid Photokeeper, the powerful digital
photo album solution at www.aephid.com.
|
|
|
|
|
I found a great resource at: http://www.ross.net/crc/ See the "Painless Guide". Math explanation given in simplified form. C code included.
Vernon Sauder
|
|
|
|
|
Could you elaborate a bit on what was done in the assembly version of the algorithm ? I am kind of curious.
The math is actually rather interesting. It involves an ex-or for each byte followed by an and, shift, and another ex-or for each bit of each byte. This makes it very computationally expensive. This is one reason that lookup tables were generated for it. I have worked with the CRC-16 algorithm but not CRC-32. I suppose that they are not significantly different. Oddly enough, I have run across a few PLC communication protocols that rely on CRC-16 for error checking.
Another odd story: back in the early 80s DEC VAXs were popular for factory automation applications. We developed a driver for the RTU protocol that utilized CRC-16 for error checking. This would absolutely swamp a VAX-750. We had a service rep come out and modify the wire-wrapped motherboard and install new firmware to support a new CRC-16 instruction performed by the CPU hardware. This sped things up quite a bit but I was shocked by what he had to go through. It was something else seeing a CPU that occupied a motherboard approximately 50x50cm and it was wire wrapped.
Those were the old days. Not necessarily good, just old.
|
|
|
|
|
He just used an assembler version of the table lookup.
And yes, 32 bit CRCs are just like 16 bit CRCs excluding the size of the polynomial and thus the results.
Tim Smith
Descartes Systems Sciences, Inc.
|
|
|
|
|
The number EX-ORed with each bit shift and mask is 0xA001 for CRC16. What is for CRC32 ?
|
|
|
|
|
Well, all CRCs are based off of the same theory of polynomial arithmetic (most of which I don't understand.) The size of the polynomial is where the name comes from. However, early on people started calling very specific CRCs based on the polynomial size. Thus we have CRCs called CRC16 and CRC32. These names don't even come close to describing all the elements of a CRC.
Here is all the different parameters for a CRC:
1. What is the polynomial?
2. Are the input bytes reflected?
3. What is the initial value of the CRC?
4. Is the final value XORed?
5. Is the polynomial reversed?
1. The polynomial specifies the mask of the CRC and the size. A polynomial such as x^16 + x^12 + x^5 + 1 has a mask of 0x1021. The X^16 isn't included in the mask. A 32 bit CRC would have an x^32 element.
2. When each bit is processed in a CRC, you can either start from the high bit and move to the low bit, or the low bit and move to the high. The latter case is know as reflected. Now, the slow way of doing a reflected CRC is to reflect the input values as they are being processed. The fast way is to reflected the whole bloody algorithm including the polynomial mask. Thus making the reflected CRC just as fast as the non-reflected. In my experience, it is more common to find reflected algorithms than non-reflected. If your CRC is being shifted left, that is a non-reflected CRC. If your CRC is being shifted to the right, that is a reflected CRC.
3. The initial value of a CRC is very important. This value is usually 0xFFFF for a 16 bit crc and 0xFFFFFFFF for a 32 bit crc. Some CRCs use a 0 initial value which isn't a very good idea since you can't tell the difference between a stream of 11 bytes of 0 and 12 bytes of zero.
4. Many algorithms XOR the final value with 0xFFFF for a 16 bit crc and 0xFFFFFFFF for a 32 bit crc. This is purely up to the whim of the programmer.
5. Well, some moron had to do it. They used a non-reversed polynomial mask in a reflected algorithm or they used a reversed polynomial mask in a non-reflected algorithm. The good news is that a reversed poly seems to work just as well.
So, to answer your question, which CRC32 are you talking about? I have seen two or three.
Tim Smith
Descartes Systems Sciences, Inc.
|
|
|
|
|
FYI:
A 6 bit CRC or 67 bit CRC is perfectly valid.
Tim Smith
Descartes Systems Sciences, Inc.
|
|
|
|
|
Thanks, that is some interesting info. The specific CRC16 I worked with was used as a checksum in the RTU communications protocol. I have seen this same protocol used with several PLCs and RTUs over the years. Regarding the issues you mentioned :
- it uses 0xFFFF as an initial value.
- it does right shifting so it is reflected.
- if the low order bit is one prior to the shift, the value is xor'ed with 0xA001
- there is no final xor done.
I had no specific CRC32 in mind. I was just curious about what the value of the xor mask was. I looked through the ziplib code that I have and I noticed that they use a lookup table also for their CRC32 so the xor mask is built in.
Anyway, thanks again.
|
|
|
|
|