// Scheherezade -> Hasherezade
// [Italian: buffa, feminine of buffo, comic.]
/*
First off: I am heavily disappointed from Speed-Performance of 'Hasherezade' and comparing Skip-Performance of 'Gulliver' and 'Hasherezade' doesn't make me happy, either.
Pattern: fastest fox
Doing Search for Pattern(11bytes) into String(206908949bytes) as-one-line ...
Railgun_Quadruplet_7sun performance: 1,683KB/clock / 00,775%, 26,672,940 skips/iterations
Railgun_Quadruplet_7 performance: 1,642KB/clock / 00,669%, 30,925,578 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 0,944KB/clock / 00,912%, 22,663,583 skips/iterations
Railgun_Quadruplet_7deuce performance: 0,801KB/clock / 00,797%, 25,945,709 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 0,704KB/clock / 00,931%, 22,213,101 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 2,104KB/clock / 00,977%, 21,166,516 skips/iterations
Railgun_Quadruplet_7Hasherezade performance: 1,496KB/clock / 00,980%, 21,112,657 skips/iterations 18bit HashTable
Railgun_Quadruplet_7Hasherezade performance: 1,642KB/clock / 00,980%, 21,112,646 skips/iterations 14bit HashTable
Railgun_Quadruplet_7Hasherezade performance: 1,642KB/clock / 00,980%, 21,112,735 skips/iterations 10bit HashTable
Boyer_Moore_Flensburg performance: 1,057KB/clock / 00,669%, 30,925,578 skips/iterations
Pattern: and every day a continuous cleaning goes on
Doing Search for Pattern(43bytes) into String(206908949bytes) as-one-line ...
Railgun_Quadruplet_7sun performance: 2,557KB/clock / 01,495%, 13,832,201 skips/iterations
Railgun_Quadruplet_7 performance: 2,590KB/clock / 01,404%, 14,731,326 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 1,888KB/clock / 02,222%, 09,308,136 skips/iterations
Railgun_Quadruplet_7deuce performance: 1,741KB/clock / 01,971%, 10,494,460 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 1,642KB/clock / 02,229%, 09,279,871 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 3,157KB/clock / 03,795%, 05,450,890 skips/iterations
Railgun_Quadruplet_7Hasherezade performance: 2,322KB/clock / 04,100%, 05,046,049 skips/iterations 18bit HashTable
Railgun_Quadruplet_7Hasherezade performance: 2,971KB/clock / 04,100%, 05,046,445 skips/iterations 14bit HashTable
Railgun_Quadruplet_7Hasherezade performance: 3,015KB/clock / 04,090%, 05,058,667 skips/iterations 10bit HashTable
Boyer_Moore_Flensburg performance: 1,683KB/clock / 01,447%, 14,294,963 skips/iterations
Pattern: And let this be your very fundamental insight... about everything. Just for one year, don't choose.
Doing Search for Pattern(99bytes) into String(206908949bytes) as-one-line ...
Railgun_Quadruplet_7sun performance: 2,845KB/clock / 02,248%, 09,200,917 skips/iterations
Railgun_Quadruplet_7 performance: 2,928KB/clock / 02,105%, 09,827,389 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 2,270KB/clock / 03,283%, 06,302,096 skips/iterations
Railgun_Quadruplet_7deuce performance: 2,149KB/clock / 03,196%, 06,472,407 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 2,172KB/clock / 03,282%, 06,303,154 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 2,928KB/clock / 07,820%, 02,645,814 skips/iterations
Railgun_Quadruplet_7Hasherezade performance: 2,494KB/clock / 09,575%, 02,160,890 skips/iterations 18bit HashTable
Railgun_Quadruplet_7Hasherezade performance: 3,015KB/clock / 09,568%, 02,162,493 skips/iterations 14bit HashTable
Railgun_Quadruplet_7Hasherezade performance: 3,015KB/clock / 09,438%, 02,192,204 skips/iterations 10bit HashTable
Boyer_Moore_Flensburg performance: 2,349KB/clock / 02,135%, 09,689,008 skips/iterations
Pattern: Then, singing among the savage branches, it impales itself upon the longest, sharpest spine. And, dying, it rises above its own agony to outcarol the lark and the nightingale.
Doing Search for Pattern(175bytes) into String(206908949bytes) as-one-line ...
Railgun_Quadruplet_7sun performance: 2,658KB/clock / 03,142%, 06,583,682 skips/iterations
Railgun_Quadruplet_7 performance: 2,658KB/clock / 03,095%, 06,685,179 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 2,377KB/clock / 04,776%, 04,331,865 skips/iterations
Railgun_Quadruplet_7deuce performance: 2,349KB/clock / 04,728%, 04,376,097 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 2,525KB/clock / 04,778%, 04,330,200 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 2,245KB/clock / 12,024%, 01,720,711 skips/iterations
Railgun_Quadruplet_7Hasherezade performance: 2,434KB/clock / 17,127%, 01,208,072 skips/iterations 18bit HashTable
Railgun_Quadruplet_7Hasherezade performance: 2,590KB/clock / 17,084%, 01,211,064 skips/iterations 14bit HashTable
Railgun_Quadruplet_7Hasherezade performance: 2,590KB/clock / 16,379%, 01,263,238 skips/iterations 10bit HashTable
Boyer_Moore_Flensburg performance: 2,525KB/clock / 03,198%, 06,469,280 skips/iterations Five times more main-cycles and faster than 'Hasherezade', pshaw!
*/
#define ROL(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define HashTableSize 18
// Revision: 1, 2012-Feb-01, the main disadvantage: the preprocessing overhead PLUS a hasher.
// Caution: For better speed the case 'if (cbPattern==1)' was removed, so Pattern must be longer than 1 char.
char * Railgun_Quadruplet_7Hasherezade_count_hits (char * pbTarget, char * pbPattern, unsigned long cbTarget, unsigned long cbPattern)
{
char * pbTargetMax = pbTarget + cbTarget;
register unsigned long ulHashPattern;
register unsigned long ulHashTarget;
signed long count;
signed long countSTATIC;
unsigned char SINGLET;
unsigned long Quadruplet2nd;
unsigned long Quadruplet3rd;
unsigned long Quadruplet4th;
unsigned long AdvanceHopperGrass;
long i; //BMH needed
int a, j;
unsigned int bm_bc[256]; //BMH needed
unsigned int bm_bc2nd[256]; //BMS needed
unsigned char bm_Horspool_Order2[256*256]; //BMHSS(Elsiane) needed, 'char' limits patterns to 255, if 'long' then table becomes 256KB, grrr.
unsigned char bm_Hasherezade_HASH[1<<(HashTableSize)]; // Jesteressing 8bytes (Horspool Order 8) for fast lookup, should be bitwise (i.e. 8times smaller) since it says yes/no for presence.
uint32_t hash32;
unsigned long Gulliver; // or unsigned char or unsigned short
if (cbPattern > cbTarget)
return(NULL);
if ( cbPattern<4) {
pbTarget = pbTarget+cbPattern;
ulHashPattern = ( (*(char *)(pbPattern))<<8 ) + *(pbPattern+(cbPattern-1));
if ( cbPattern==3) {
for ( ;; ) {
if ( ulHashPattern == ( (*(char *)(pbTarget-3))<<8 ) + *(pbTarget-1) ) {
if ( *(char *)(pbPattern+1) == *(char *)(pbTarget-2) ) Railgunhits++; //return((pbTarget-3));
}
if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) pbTarget++;
pbTarget++;
if (pbTarget > pbTargetMax)
return(NULL);
}
} else {
}
for ( ;; ) {
if ( ulHashPattern == ( (*(char *)(pbTarget-2))<<8 ) + *(pbTarget-1) )
Railgunhits++; //return((pbTarget-2));
if ( (char)(ulHashPattern>>8) != *(pbTarget-1) ) pbTarget++;
pbTarget++;
if (pbTarget > pbTargetMax)
return(NULL);
}
} else { //if ( cbPattern<4)
if (cbTarget<961) { // This value is arbitrary(don't know how exactly), it ensures(at least must) better performance than 'Boyer_Moore_Horspool'.
/*
// A better strstr, with no asm code
// Written by Mischa Sandberg
// http://mischasan.wordpress.com
// static char const *
// scanstrm(char const *tgt, char const *pat, int len)
// {
// uint32_t head = MSBF32(pat), wind = 0, next;
//
// pat += 4, len -= 4;
// while ((next = *(uint8_t const*)tgt++)) {
// wind = ( wind << 8 ) + next;
// if (wind == head && !memcmp(tgt, pat, len))
// return tgt - 4;
// }
// return NULL;
//}
ulHashPattern = 0;
ulHashPattern = ( ulHashPattern << 8 ) + *(uint8_t const*)pbPattern++;
ulHashPattern = ( ulHashPattern << 8 ) + *(uint8_t const*)pbPattern++;
ulHashPattern = ( ulHashPattern << 8 ) + *(uint8_t const*)pbPattern++;
ulHashPattern = ( ulHashPattern << 8 ) + *(uint8_t const*)pbPattern++;
AdvanceHopperGrass = 0;
cbPattern -= 4;
while ((ulHashTarget = *(uint8_t const*)pbTarget++)) {
AdvanceHopperGrass = ( AdvanceHopperGrass << 8 ) + ulHashTarget;
if (AdvanceHopperGrass == ulHashPattern && !memcmp(pbTarget, pbPattern, cbPattern))
Railgunhits++; //return pbTarget - 4;
}
return NULL;
*/
pbTarget = pbTarget+cbPattern;
ulHashPattern = *(unsigned long *)(pbPattern);
SINGLET = ulHashPattern & 0xFF;
Quadruplet2nd = SINGLET<<8;
Quadruplet3rd = SINGLET<<16;
Quadruplet4th = SINGLET<<24;
for ( ;; ) {
AdvanceHopperGrass = 0;
ulHashTarget = *(unsigned long *)(pbTarget-cbPattern);
if ( ulHashPattern == ulHashTarget ) { // Three unnecessary comparisons here, but 'AdvanceHopperGrass' must be calculated - it has a higher priority.
count = cbPattern-1;
while ( count && *(char *)(pbPattern+(cbPattern-count)) == *(char *)(pbTarget-count) ) {
if ( cbPattern-1==AdvanceHopperGrass+count && SINGLET != *(char *)(pbTarget-count) ) AdvanceHopperGrass++;
count--;
}
if ( count == 0) Railgunhits++; //return((pbTarget-cbPattern));
} else { // The goal here: to avoid memory accesses by stressing the registers.
if ( Quadruplet2nd != (ulHashTarget & 0x0000FF00) ) {
AdvanceHopperGrass++;
if ( Quadruplet3rd != (ulHashTarget & 0x00FF0000) ) {
AdvanceHopperGrass++;
if ( Quadruplet4th != (ulHashTarget & 0xFF000000) ) AdvanceHopperGrass++;
}
}
}
AdvanceHopperGrass++;
pbTarget = pbTarget + AdvanceHopperGrass;
if (pbTarget > pbTargetMax)
return(NULL);
}
} else { //if (cbTarget<961)
countSTATIC = cbPattern-2-2;
for (a=0; a < 256; a++) {bm_bc[a]=cbPattern; bm_bc2nd[a]=cbPattern+1;}
for (j=0; j < cbPattern-1; j++) bm_bc[pbPattern[j]]=cbPattern-j-1;
for (j=0; j < cbPattern; j++) bm_bc2nd[pbPattern[j]]=cbPattern-j;
ulHashPattern = *(unsigned long *)(pbPattern); // First four bytes
//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
AdvanceHopperGrass = 0;
i=0;
// Elsiane r.2 [
for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized
// alfalfa 7 long 6 BBs (al lf fa al lf fa) 3 distinct BBs (al lf fa)
// fast 4 0-1-2 fa as st
for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=j; // Rightmost appearance/position is needed
// Elsiane r.2 ]
if ( cbPattern>10) {
// Hasherezade r.1 [
// OSHO.TXT has 00,046,486 03bytes distinct BBs
// OSHO.TXT has 00,248,019 04bytes distinct BBs
// OSHO.TXT has 00,855,682 05bytes distinct BBs
// OSHO.TXT has 02,236,138 06bytes distinct BBs
// OSHO.TXT has 04,803,152 07bytes distinct BBs
// OSHO.TXT has 08,956,496 08bytes distinct BBs to be hashed in 18bit i.e. 256KB i.e. 262,144 slots i.e. 34 vs 1.
// OSHO.TXT has 15,006,172 09bytes distinct BBs
// OSHO.TXT has 22,992,127 10bytes distinct BBs
// Note: BB stands for Building-Block (also suffix)
for (a=0; a < 1<<(HashTableSize); a++) {bm_Hasherezade_HASH[a]= 0;} // to-do: bit to replace byte; 'memset' if not optimized
// cbPattern - Order + 1 i.e. number of BBs for 11 'fastest fox' 11-8+1=4: 'fastest ', 'astest f', 'stest fo', 'test fox'
for (j=0; j < cbPattern-8+1; j++) {
hash32 = (2166136261 ^ (ROL(*(uint32_t *)(pbPattern+j),5)^*(uint32_t *)(pbPattern+j+4))) * 709607;
bm_Hasherezade_HASH[( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSize))-1 )]=1;
/*
for (a=0; a<8; a++)
printf("%c",*(char *)(pbPattern+j+a) );
printf(" %lu\n",( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSize))-1 ));
//Input Pattern(up to 19+2000 chars): and every day a continuous cleaning goes on
//Doing Search for Pattern(43bytes) into String(206908949bytes) as-one-line ...
BBs Slot(HashCode for 18bit HashTable)
and ever 117013
nd every 108604
d every 155516
every d 170959
every da 115291
very day 73191
ery day 97042
ry day a 83793
y day a 11244
day a c 115855
day a co 101797
ay a con 222568
y a cont 29130
a conti 20978
a contin 258405
continu 252691
continuo 123607
ontinuou 56546
ntinuous 135857
tinuous 15332
inuous c 250584
nuous cl 48224
uous cle 106616
ous clea 137020
us clean 35751
s cleani 178989
cleanin 213855
cleaning 63337
leaning 97138
eaning g 62366
aning go 247590
ning goe 36571
ing goes 41142
ng goes 228365
g goes o 229696
goes on 176852
*/
}
// Hasherezade r.1 ]
while (i <= cbTarget-cbPattern-1) { // -1 because Sunday is used
Gulliver = bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]];
if ( Gulliver == cbPattern-2 ) { // CASE #1: means the pair (char order 2) is found
if ( *(unsigned long *)&pbTarget[i] == ulHashPattern) {
count = countSTATIC; // Last two chars already matched, to be fixed with -2
while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
count--;
if ( count == 0) Railgunhits++; //return(pbTarget+i);
}
//i = i + 1; // r.1, obviuosly this is the worst skip so turning to 'SunHorse': lines below
if ( bm_bc[(unsigned char)pbTarget[i+cbPattern-1]] < bm_bc2nd[(unsigned char)pbTarget[i+(cbPattern)]] )
Gulliver = bm_bc2nd[(unsigned char)pbTarget[i+(cbPattern)]];
else
Gulliver = bm_bc[(unsigned char)pbTarget[i+cbPattern-1]];
} else if ( Gulliver != cbPattern-1 ) // CASE #2: if equal means the pair (char order 2) is not found i.e. Gulliver remains intact, skip the whole pattern and fall back (Order-1) chars i.e. one char for Order 2
Gulliver = cbPattern - Gulliver - 2; // CASE #3: the pair is found and not as suffix i.e. rightmost position
// The goal: to jump when the rightmost 8bytes (Order 8 Horspool) of window do not look like any of Needle prefixes i.e. are not to be found. This maximum jump equals cbPattern-(Order-1) or 11-(8-1)=4 for 'fastest fox' - a small one but for Needle 31 bytes the jump equals 31-(8-1)=24
if (Gulliver < cbPattern-(8-1)) {
hash32 = (2166136261 ^ (ROL(*(uint32_t *)(pbTarget+i+cbPattern-8),5)^*(uint32_t *)(pbTarget+i+cbPattern-8+4))) * 709607;
if ( bm_Hasherezade_HASH[( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSize))-1 )]==0 )
Gulliver = cbPattern-(8-1);
}
i = i + Gulliver;
AdvanceHopperGrass++;
/*
; 4155 : // The goal: to jump when the rightmost 8bytes (Order 8 Horspool) of window do not look like any of Needle prefixes i.e. are not to be found. This maximum jump equals cbPattern-(Order-1) or 11-(8-1)=4 for 'fastest fox' - a small one but for Needle 31 bytes the jump equals 31-(8-1)=24
; 4156 : if (Gulliver < cbPattern-(8-1)) {
01f16 8d 43 f9 lea eax, DWORD PTR [ebx-7]
01f19 3b c8 cmp ecx, eax
01f1b 73 30 jae SHORT $LN18@Railgun_Qu@8
; 4157 : hash32 = (2166136261 ^ (ROL(*(uint32_t *)(pbTarget+i+cbPattern-8),5)^*(uint32_t *)(pbTarget+i+cbPattern-8+4))) * 709607;
01f1d 8b 44 32 f8 mov eax, DWORD PTR [edx+esi-8]
01f21 c1 c0 05 rol eax, 5
01f24 33 44 32 fc xor eax, DWORD PTR [edx+esi-4]
01f28 35 c5 9d 1c 81 xor eax, -2128831035 ; 811c9dc5H
01f2d 69 c0 e7 d3 0a
00 imul eax, 709607 ; 000ad3e7H
; 4158 : if ( bm_Hasherezade_HASH[( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSize))-1 )]==0 )
01f33 8b f8 mov edi, eax
01f35 c1 ef 10 shr edi, 16 ; 00000010H
01f38 33 f8 xor edi, eax
01f3a 81 e7 ff ff 03
00 and edi, 262143 ; 0003ffffH
01f40 80 bc 3c 28 08
01 00 00 cmp BYTE PTR _bm_Hasherezade_HASH$[esp+edi+329776], 0
01f48 75 03 jne SHORT $LN18@Railgun_Qu@8
; 4159 : Gulliver = cbPattern-(8-1);
01f4a 8d 4b f9 lea ecx, DWORD PTR [ebx-7]
$LN18@Railgun_Qu@8:
; 4160 : }
; 4161 : i = i + Gulliver;
; 4162 : AdvanceHopperGrass++;
*/
}
} else {
while (i <= cbTarget-cbPattern-1) { // -1 because Sunday is used
Gulliver = bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]];
if ( Gulliver == cbPattern-2 ) { // CASE #1: means the pair (char order 2) is found
if ( *(unsigned long *)&pbTarget[i] == ulHashPattern) {
count = countSTATIC; // Last two chars already matched, to be fixed with -2
while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
count--;
if ( count == 0) Railgunhits++; //return(pbTarget+i);
}
//i = i + 1; // r.1, obviuosly this is the worst skip so turning to 'SunHorse': lines below
if ( bm_bc[(unsigned char)pbTarget[i+cbPattern-1]] < bm_bc2nd[(unsigned char)pbTarget[i+(cbPattern)]] )
Gulliver = bm_bc2nd[(unsigned char)pbTarget[i+(cbPattern)]];
else
Gulliver = bm_bc[(unsigned char)pbTarget[i+cbPattern-1]];
} else if ( Gulliver != cbPattern-1 ) // CASE #2: if equal means the pair (char order 2) is not found i.e. Gulliver remains intact, skip the whole pattern and fall back (Order-1) chars i.e. one char for Order 2
Gulliver = cbPattern - Gulliver - 2; // CASE #3: the pair is found and not as suffix i.e. rightmost position
i = i + Gulliver;
// 32323218 Order 1 Horspool Skip-table A
// 01234568 Order 1 Horspool Skip-table B
// fa af fa af fa as st Order 2 Horspool Skip-table B
// 0 1 2 3 4 5 6
// HIKARIfast
// fafafast
// fafafast +2 Order 1 'a' vs 't'
// fafafast +2 = (cbPattern-SkipB-Order = 8-5-1 = 2) Order 1 'a' vs 't'
// fafafast +2 = (cbPattern-SkipB-Order = 8-4-2 = 2) Order 2 'fa' vs 'st' i.e. CASE #3
// 76543218 Order 1 Horspool
// lo on ng gp pa ac ce Order 2 Horspool
// 0 1 2 3 4 5 6
// HIKARIfast
// longpace
// longpace +2 Order 1 'a' vs 'e'
// longpace +7 = (cbPattern-(Order-1) = 8-(2-1) = 7) Order 2 'fa' vs 'ce' i.e. CASE #2
AdvanceHopperGrass++;
}
} //if ( cbPattern>10) {
if (i == cbTarget-cbPattern) {
if ( *(unsigned long *)&pbTarget[i] == ulHashPattern) {
count = countSTATIC;
while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
count--;
if ( count == 0) Railgunhits++; //return(pbTarget+i);
}
AdvanceHopperGrass++;
}
GlobalSP += (int)((double)cbTarget/AdvanceHopperGrass*100);
GlobalI += AdvanceHopperGrass;
printf("Skip-Performance(bigger-the-better): %d%%, %d skips/iterations\n",(int)((double)cbTarget/AdvanceHopperGrass*100), AdvanceHopperGrass);
return(NULL);
} //if (cbTarget<961)
} //if ( cbPattern<4)
}
// ### Mix(2in1) of Karp-Rabin & Boyer-Moore-Sunday-Horspool algorithm ]