#define _rotl_KAZE(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define HaystackThresholdSekireiPiri 961 // Quadruplet works up to this value, if bigger then BMH2 takes over.
#define NeedleThresholdBIGSekireiPiri 12+180 // Should be bigger than 'HasherezadeOrder'. BMH2 works up to this value (inclusive).
#define HashTableSizeSekireiPiri 17-1 // In fact the real size is -3, because it is BITwise, when 17-3=14 it means 16KB, (17-1)-3=13 it means 8KB.
char * Railgun_Sekireigan_Piri (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern)
{
char * pbTargetMax = pbTarget + cbTarget;
register uint32_t ulHashPattern;
register uint32_t ulHashTarget;
signed long count;
signed long countSTATIC;
unsigned char SINGLET;
uint32_t Quadruplet2nd;
uint32_t Quadruplet3rd;
uint32_t Quadruplet4th;
uint32_t AdvanceHopperGrass;
long i; //BMH needed
int a, j;
unsigned char bm_Horspool_Order2[256*256]; // BMHSS(Elsiane) needed, 'char' limits patterns to 255, if 'long' then table becomes 256KB, grrr.
uint32_t Gulliver; // or unsigned char or unsigned short
unsigned char bm_Hasherezade_HASH[1<<(HashTableSizeSekireiPiri-3)];
uint32_t hash32;
uint32_t hash32B;
uint32_t hash32C;
if (cbPattern > cbTarget) return(NULL);
if ( cbPattern<4 ) {
pbTarget = pbTarget+cbPattern;
ulHashPattern = ( (*(char *)(pbPattern))<<8 ) + *(pbPattern+(cbPattern-1));
if ( cbPattern==3 ) {
for ( ;; ) {
if ( ulHashPattern == ( (*(char *)(pbTarget-3))<<8 ) + *(pbTarget-1) ) {
if ( *(char *)(pbPattern+1) == *(char *)(pbTarget-2) ) return((pbTarget-3));
}
if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) {
pbTarget++;
if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) pbTarget++;
}
pbTarget++;
if (pbTarget > pbTargetMax) return(NULL);
}
} else {
}
for ( ;; ) {
if ( ulHashPattern == ( (*(char *)(pbTarget-2))<<8 ) + *(pbTarget-1) ) return((pbTarget-2));
if ( (char)(ulHashPattern>>8) != *(pbTarget-1) ) pbTarget++;
pbTarget++;
if (pbTarget > pbTargetMax) return(NULL);
}
} else {
if (cbTarget<HaystackThresholdSekireiPiri) { // This value is arbitrary (don't know how exactly), it ensures (at least must) better performance than 'Boyer_Moore_Horspool'.
pbTarget = pbTarget+cbPattern;
ulHashPattern = *(uint32_t *)(pbPattern);
SINGLET = ulHashPattern & 0xFF;
Quadruplet2nd = SINGLET<<8;
Quadruplet3rd = SINGLET<<16;
Quadruplet4th = SINGLET<<24;
for ( ;; ) {
AdvanceHopperGrass = 0;
ulHashTarget = *(uint32_t *)(pbTarget-cbPattern);
if ( ulHashPattern == ulHashTarget ) { // Three unnecessary comparisons here, but 'AdvanceHopperGrass' must be calculated - it has a higher priority.
count = cbPattern-1;
while ( count && *(char *)(pbPattern+(cbPattern-count)) == *(char *)(pbTarget-count) ) {
if ( cbPattern-1==AdvanceHopperGrass+count && SINGLET != *(char *)(pbTarget-count) ) AdvanceHopperGrass++;
count--;
}
if ( count == 0) return((pbTarget-cbPattern));
} else { // The goal here: to avoid memory accesses by stressing the registers.
if ( Quadruplet2nd != (ulHashTarget & 0x0000FF00) ) {
AdvanceHopperGrass++;
if ( Quadruplet3rd != (ulHashTarget & 0x00FF0000) ) {
AdvanceHopperGrass++;
if ( Quadruplet4th != (ulHashTarget & 0xFF000000) ) AdvanceHopperGrass++;
}
}
}
AdvanceHopperGrass++;
pbTarget = pbTarget + AdvanceHopperGrass;
if (pbTarget > pbTargetMax) return(NULL);
}
} else { //if (cbTarget<HaystackThresholdSekireiPiri)
if ( cbPattern<=NeedleThresholdBIGSekireiPiri ) {
countSTATIC = cbPattern-2-2;
ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
i=0;
//for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized
//for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=j; // Rightmost appearance/position is needed
for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]=0;}
// In line below we "hash" 4bytes to 2bytes i.e. 16bit table, how to compute TOTAL number of BBs, 'cbPattern - Order + 1' is the number of BBs for text 'cbPattern' bytes long, for example, for cbPattern=11 'fastest fox' and Order=4 we have BBs = 11-4+1=8:
//"fast"
//"aste"
//"stes"
//"test"
//"est "
//"st f"
//"t fo"
//" fox"
//for (j=0; j < cbPattern-4+1; j++) bm_Horspool_Order2[( *(unsigned short *)(pbPattern+j+0) + *(unsigned short *)(pbPattern+j+2) ) & ( (1<<16)-1 )]=1;
for (j=0; j < cbPattern-4+1; j++) bm_Horspool_Order2[( (*(uint32_t *)(pbPattern+j+0)>>16)+(*(uint32_t *)(pbPattern+j+0)&0xFFFFF) ) & ( (1<<16)-1 )]=1;
while (i <= cbTarget-cbPattern) {
Gulliver = 1;
//if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) {
//if ( bm_Horspool_Order2[( *(unsigned short *)&pbTarget[i+cbPattern-1-1-2] + *(unsigned short *)&pbTarget[i+cbPattern-1-1] ) & ( (1<<16)-1 )] != 0 ) {
// 001d2 0f b7 6c 01 fc movzx ebp, WORD PTR [-4+ecx+eax]
// 001d7 0f b7 5c 01 fe movzx ebx, WORD PTR [-2+ecx+eax]
// 001dc 03 eb add ebp, ebx
// TO-DO: The above line should be with one only memory access i.e. to fetch 4 bytes at once and then to mix the low and high part.
if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]&0xFFFFF) ) & ( (1<<16)-1 )] != 0 ) {
//if ( Gulliver == 1 ) { // Means the Building-Block order 2 is found somewhere i.e. NO MAXIMUM SKIP
if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
count = countSTATIC; // Last two chars already matched, to be fixed with -2
while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
count--;
if ( count == 0) return(pbTarget+i);
}
//}
// Trying to strengthen the Skip performance... here ONLY one additional lookup, for better/longer jumps more such lookups, unrolled to be added.
// Since 'Piri' revision not trying anymore, instead trying to reduce main loop size while going to order 4.
//if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2;
} else Gulliver = cbPattern-(2-1)-2; // -2 because we check the 4 rightmost bytes not 2.
i = i + Gulliver;
//GlobalI++; // Comment it, it is only for stats.
}
return(NULL);
// The above loop in Assembly:
/*
; mark_description "Intel(R) C++ Compiler XE for applications running on IA-32, Version 12.1.1.258 Build 20111011";
; mark_description "-O3 -FAcs -Festrstr_SHORT-SHOWDOWN_LV_WIKI_32bit_IntelV12";
.B2.29:
001d2 8b 5c 01 fc mov ebx, DWORD PTR [-4+ecx+eax]
001d6 8b eb mov ebp, ebx
001d8 c1 ed 10 shr ebp, 16
001db 03 eb add ebp, ebx
001dd 0f b7 fd movzx edi, bp
001e0 0f b6 1c 3c movzx ebx, BYTE PTR [esp+edi]
001e4 85 db test ebx, ebx
001e6 74 52 je .B2.38 ; Prob 50%
.B2.30:
001e8 3b 14 06 cmp edx, DWORD PTR [esi+eax]
001eb 0f 85 3c 04 00
00 jne .B2.83
.B2.31:
001f1 8b 9c 24 08 00
01 00 mov ebx, DWORD PTR [65544+esp]
001f8 8b eb mov ebp, ebx
001fa f7 dd neg ebp
001fc 85 db test ebx, ebx
001fe 0f 84 21 04 00
00 je .B2.82
.B2.32:
00204 8b bc 24 04 00
01 00 mov edi, DWORD PTR [65540+esp]
0020b 8b b4 24 00 00
01 00 mov esi, DWORD PTR [65536+esp]
00212 03 f8 add edi, eax
.B2.33:
00214 8a 54 35 04 mov dl, BYTE PTR [4+ebp+esi]
00218 3a 54 3d 04 cmp dl, BYTE PTR [4+ebp+edi]
0021c 0f 85 ee 03 00
00 jne .B2.81
.B2.34:
00222 45 inc ebp
00223 4b dec ebx
00224 75 ee jne .B2.33
.B2.35:
00226 8b b4 24 2c 00
01 00 mov esi, DWORD PTR [65580+esp]
.B2.36:
0022d 03 c6 add eax, esi
0022f 81 c4 18 00 01
00 add esp, 65560
00235 5d pop ebp
00236 5b pop ebx
00237 5f pop edi
00238 5e pop esi
00239 c3 ret
.B2.38:
0023a 8b 9c 24 10 00
01 00 mov ebx, DWORD PTR [65552+esp]
.B2.39:
00241 03 c3 add eax, ebx
00243 3b 84 24 14 00
01 00 cmp eax, DWORD PTR [65556+esp]
0024a 76 86 jbe .B2.29
.B2.40:
0024c 33 c0 xor eax, eax
0024e 81 c4 18 00 01
00 add esp, 65560
00254 5d pop ebp
00255 5b pop ebx
00256 5f pop edi
00257 5e pop esi
00258 c3 ret
.B2.41:
...
.B2.81:
00610 89 b4 24 00 00
01 00 mov DWORD PTR [65536+esp], esi
00617 8b 94 24 0c 00
01 00 mov edx, DWORD PTR [65548+esp]
0061e 8b b4 24 2c 00
01 00 mov esi, DWORD PTR [65580+esp]
.B2.82:
00625 85 db test ebx, ebx
00627 0f 84 00 fc ff
ff je .B2.36
.B2.83:
0062d bb 01 00 00 00 mov ebx, 1
00632 e9 0a fc ff ff jmp .B2.39
; The whole section (main loop plus the exit) is 00258-001d2+1 + 00632-00610+5 = 135+39 bytes long.
*/
} else { // if ( cbPattern<=NeedleThresholdBIGSekireiPiri )
// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() [
countSTATIC = cbPattern-2-2;
ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
i=0;
for (a=0; a < 1<<(HashTableSizeSekireiPiri-3); a++) {bm_Hasherezade_HASH[a]= 0;} // to-do: 'memset' if not optimized
// cbPattern - Order + 1 i.e. number of BBs for 11 'fastest fox' 11-8+1=4: 'fastest ', 'astest f', 'stest fo', 'test fox'
for (j=0; j < cbPattern-12+1; j++) {
hash32 = (2166136261 ^ *(uint32_t *)(pbPattern+j+0)) * 709607;
hash32B = (2166136261 ^ *(uint32_t *)(pbPattern+j+4)) * 709607;
hash32C = (2166136261 ^ *(uint32_t *)(pbPattern+j+8)) * 709607;
hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;
hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;
hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekireiPiri))-1 );
bm_Hasherezade_HASH[hash32>>3]= bm_Hasherezade_HASH[hash32>>3] | (1<<(hash32&0x7));
}
while (i <= cbTarget-cbPattern) {
Gulliver = 1; // Assume minimal jump as initial value.
// The goal: to jump when the rightmost 8bytes (Order 8 Horspool) of window do not look like any of Needle prefixes i.e. are not to be found. This maximum jump equals cbPattern-(Order-1) or 11-(8-1)=4 for 'fastest fox' - a small one but for Needle 31 bytes the jump equals 31-(8-1)=24
//GlobalHashSectionExecution++; // Comment it, it is only for stats.
hash32 = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+0)) * 709607;
hash32B = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+4)) * 709607;
hash32C = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+8)) * 709607;
hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;
hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;
hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekireiPiri))-1 );
if ( (bm_Hasherezade_HASH[hash32>>3] & (1<<(hash32&0x7))) ==0 )
Gulliver = cbPattern-(12-1);
if ( Gulliver == 1 ) { // Means the Building-Block order 8/12 is found somewhere i.e. NO MAXIMUM SKIP
if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
count = countSTATIC;
while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
count--;
if ( count == 0) return(pbTarget+i);
}
}
i = i + Gulliver;
//GlobalI++; // Comment it, it is only for stats.
} // while (i <= cbTarget-cbPattern)
return(NULL);
// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() ]
} // if ( cbPattern<=NeedleThresholdBIGSekireiPiri )
} //if (cbTarget<HaystackThresholdSekireiPiri)
} //if ( cbPattern<4 )
}