|
#define _rotl_KAZE(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define HaystackThresholdSekireiTchittoGritto 961 // Quadruplet works up to this value, if bigger then BMH2 takes over.
#define NeedleThreshold2vs4TchittoGritto 22 // Should be bigger than 8. BMH2 works up to this value (inclusive), if bigger then BMH4 takes over.
#define NeedleThresholdBIGSekireiTchittoGritto 12+700 // Should be bigger than 'HasherezadeOrder'. BMH2 works up to this value (inclusive).
#define HashTableSizeSekireiTchittoGritto 17-1 // In fact the real size is -3, because it is BITwise, when 17-3=14 it means 16KB, (17-1)-3=13 it means 8KB.
char * Railgun_Sekireigan_Shriekeroo (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern)
{
char * pbTargetMax = pbTarget + cbTarget;
register uint32_t ulHashPattern;
register uint32_t ulHashTarget;
signed long count;
signed long countSTATIC;
unsigned char SINGLET;
uint32_t Quadruplet2nd;
uint32_t Quadruplet3rd;
uint32_t Quadruplet4th;
uint32_t AdvanceHopperGrass;
long i; //BMH needed
int a, j;
unsigned char bm_Horspool_Order2[256*256]; // BMHSS(Elsiane) needed, 'char' limits patterns to 255, if 'long' then table becomes 256KB, grrr.
uint32_t Gulliver; // or unsigned char or unsigned short
unsigned char bm_Hasherezade_HASH[1<<(HashTableSizeSekireiTchittoGritto-3)];
uint32_t hash32;
uint32_t hash32B;
uint32_t hash32C;
if (cbPattern > cbTarget) return(NULL);
if ( cbPattern<4 ) {
pbTarget = pbTarget+cbPattern;
ulHashPattern = ( (*(char *)(pbPattern))<<8 ) + *(pbPattern+(cbPattern-1));
if ( cbPattern==3 ) {
for ( ;; ) {
if ( ulHashPattern == ( (*(char *)(pbTarget-3))<<8 ) + *(pbTarget-1) ) {
if ( *(char *)(pbPattern+1) == *(char *)(pbTarget-2) ) return((pbTarget-3));
}
if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) {
pbTarget++;
if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) pbTarget++;
}
pbTarget++;
if (pbTarget > pbTargetMax) return(NULL);
}
} else {
}
for ( ;; ) {
if ( ulHashPattern == ( (*(char *)(pbTarget-2))<<8 ) + *(pbTarget-1) ) return((pbTarget-2));
if ( (char)(ulHashPattern>>8) != *(pbTarget-1) ) pbTarget++;
pbTarget++;
if (pbTarget > pbTargetMax) return(NULL);
}
} else {
if (cbTarget<HaystackThresholdSekireiTchittoGritto) { // This value is arbitrary (don't know how exactly), it ensures (at least must) better performance than 'Boyer_Moore_Horspool'.
pbTarget = pbTarget+cbPattern;
ulHashPattern = *(uint32_t *)(pbPattern);
SINGLET = ulHashPattern & 0xFF;
Quadruplet2nd = SINGLET<<8;
Quadruplet3rd = SINGLET<<16;
Quadruplet4th = SINGLET<<24;
for ( ;; ) {
AdvanceHopperGrass = 0;
ulHashTarget = *(uint32_t *)(pbTarget-cbPattern);
if ( ulHashPattern == ulHashTarget ) { // Three unnecessary comparisons here, but 'AdvanceHopperGrass' must be calculated - it has a higher priority.
count = cbPattern-1;
while ( count && *(char *)(pbPattern+(cbPattern-count)) == *(char *)(pbTarget-count) ) {
if ( cbPattern-1==AdvanceHopperGrass+count && SINGLET != *(char *)(pbTarget-count) ) AdvanceHopperGrass++;
count--;
}
if ( count == 0) return((pbTarget-cbPattern));
} else { // The goal here: to avoid memory accesses by stressing the registers.
if ( Quadruplet2nd != (ulHashTarget & 0x0000FF00) ) {
AdvanceHopperGrass++;
if ( Quadruplet3rd != (ulHashTarget & 0x00FF0000) ) {
AdvanceHopperGrass++;
if ( Quadruplet4th != (ulHashTarget & 0xFF000000) ) AdvanceHopperGrass++;
}
}
}
AdvanceHopperGrass++;
pbTarget = pbTarget + AdvanceHopperGrass;
if (pbTarget > pbTargetMax) return(NULL);
}
} else { //if (cbTarget<HaystackThresholdSekireiTchittoGritto)
if ( cbPattern<=NeedleThresholdBIGSekireiTchittoGritto ) {
// BMH order 2:
if ( cbPattern<=NeedleThreshold2vs4TchittoGritto ) {
countSTATIC = cbPattern-2-2;
ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
i=0;
//for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized
//for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=j; // Rightmost appearance/position is needed
for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]=0;}
for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=1;
while (i <= cbTarget-cbPattern) {
Gulliver = 1;
if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) {
//if ( Gulliver == 1 ) { // Means the Building-Block order 2 is found somewhere i.e. NO MAXIMUM SKIP
if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2; else {
if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
count = countSTATIC; // Last two chars already matched, to be fixed with -2
while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
count--;
if ( count == 0) return(pbTarget+i);
}
}
//}
// Trying to strengthen the Skip performance... here ONLY one additional lookup, for better/longer jumps more such lookups, unrolled to be added.
//if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2;
// Dummy me, the above line should play two roles (by putting it before the comparison cycle - which is nastily slow) instead of one:
// 1] If it is 0 that discards the need for further comparing - they cannot be equal.
// 2] Strengthen the skip.
} else Gulliver = cbPattern-(2-1);
i = i + Gulliver;
//GlobalI++; // Comment it, it is only for stats.
}
return(NULL);
// BMH order 4, needle should be >=8:
} else { //if ( cbPattern<=NeedleThreshold2vs4TchittoGritto )
countSTATIC = cbPattern-2-2;
ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
i=0;
//for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized
//for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=j; // Rightmost appearance/position is needed
for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]=0;}
// In line below we "hash" 4bytes to 2bytes i.e. 16bit table, how to compute TOTAL number of BBs, 'cbPattern - Order + 1' is the number of BBs for text 'cbPattern' bytes long, for example, for cbPattern=11 'fastest fox' and Order=4 we have BBs = 11-4+1=8:
//"fast"
//"aste"
//"stes"
//"test"
//"est "
//"st f"
//"t fo"
//" fox"
//for (j=0; j < cbPattern-4+1; j++) bm_Horspool_Order2[( *(unsigned short *)(pbPattern+j+0) + *(unsigned short *)(pbPattern+j+2) ) & ( (1<<16)-1 )]=1;
for (j=0; j < cbPattern-4+1; j++) bm_Horspool_Order2[( (*(uint32_t *)(pbPattern+j+0)>>16)+(*(uint32_t *)(pbPattern+j+0)&0xFFFF) ) & ( (1<<16)-1 )]=1;
while (i <= cbTarget-cbPattern) {
Gulliver = 1;
//if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) {
//if ( bm_Horspool_Order2[( *(unsigned short *)&pbTarget[i+cbPattern-1-1-2] + *(unsigned short *)&pbTarget[i+cbPattern-1-1] ) & ( (1<<16)-1 )] != 0 ) {
// 001d2 0f b7 6c 01 fc movzx ebp, WORD PTR [-4+ecx+eax]
// 001d7 0f b7 5c 01 fe movzx ebx, WORD PTR [-2+ecx+eax]
// 001dc 03 eb add ebp, ebx
// TO-DO: The above line should be with one only memory access i.e. to fetch 4 bytes at once and then to mix the low and high part.
if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]&0xFFFF) ) & ( (1<<16)-1 )] != 0 ) {
//if ( Gulliver == 1 ) { // Means the Building-Block order 2 is found somewhere i.e. NO MAXIMUM SKIP
if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] == 0 ) Gulliver = cbPattern-(2-1)-2-4; else {
if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
count = countSTATIC; // Last two chars already matched, to be fixed with -2
while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
count--;
if ( count == 0) return(pbTarget+i);
}
}
//}
// Trying to strengthen the Skip performance... here ONLY one additional lookup, for better/longer jumps more such lookups, unrolled to be added.
// Since 'Piri' revision not trying anymore, instead trying to reduce main loop size while going to order 4.
//if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2;
//if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] == 0 ) Gulliver = cbPattern-(2-1)-2-4;
// Dummy me, the above line should play two roles (by putting it before the comparison cycle - which is nastily slow) instead of one:
// 1] If it is 0 that discards the need for further comparing - they cannot be equal.
// 2] Strengthen the skip.
} else Gulliver = cbPattern-(2-1)-2; // -2 because we check the 4 rightmost bytes not 2.
i = i + Gulliver;
//GlobalI++; // Comment it, it is only for stats.
}
return(NULL);
} //if ( cbPattern<=NeedleThreshold2vs4TchittoGritto )
} else { // if ( cbPattern<=NeedleThresholdBIGSekireiTchittoGritto )
// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() [
countSTATIC = cbPattern-2-2;
ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
i=0;
for (a=0; a < 1<<(HashTableSizeSekireiTchittoGritto-3); a++) {bm_Hasherezade_HASH[a]= 0;} // to-do: 'memset' if not optimized
// cbPattern - Order + 1 i.e. number of BBs for 11 'fastest fox' 11-8+1=4: 'fastest ', 'astest f', 'stest fo', 'test fox'
for (j=0; j < cbPattern-12+1; j++) {
hash32 = (2166136261 ^ *(uint32_t *)(pbPattern+j+0)) * 709607;
hash32B = (2166136261 ^ *(uint32_t *)(pbPattern+j+4)) * 709607;
hash32C = (2166136261 ^ *(uint32_t *)(pbPattern+j+8)) * 709607;
hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;
hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;
hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekireiTchittoGritto))-1 );
bm_Hasherezade_HASH[hash32>>3]= bm_Hasherezade_HASH[hash32>>3] | (1<<(hash32&0x7));
}
while (i <= cbTarget-cbPattern) {
Gulliver = 1; // Assume minimal jump as initial value.
// The goal: to jump when the rightmost 8bytes (Order 8 Horspool) of window do not look like any of Needle prefixes i.e. are not to be found. This maximum jump equals cbPattern-(Order-1) or 11-(8-1)=4 for 'fastest fox' - a small one but for Needle 31 bytes the jump equals 31-(8-1)=24
//GlobalHashSectionExecution++; // Comment it, it is only for stats.
hash32 = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+0)) * 709607;
hash32B = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+4)) * 709607;
hash32C = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+8)) * 709607;
hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;
hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;
hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekireiTchittoGritto))-1 );
if ( (bm_Hasherezade_HASH[hash32>>3] & (1<<(hash32&0x7))) ==0 )
Gulliver = cbPattern-(12-1);
if ( Gulliver == 1 ) { // Means the Building-Block order 8/12 is found somewhere i.e. NO MAXIMUM SKIP
if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
count = countSTATIC;
while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
count--;
if ( count == 0) return(pbTarget+i);
}
}
i = i + Gulliver;
//GlobalI++; // Comment it, it is only for stats.
} // while (i <= cbTarget-cbPattern)
return(NULL);
// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() ]
} // if ( cbPattern<=NeedleThresholdBIGSekireiTchittoGritto )
} //if (cbTarget<HaystackThresholdSekireiTchittoGritto)
} //if ( cbPattern<4 )
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.