// ### Mix(2in1) of Karp-Rabin & Boyer-Moore-Sunday-Horspool algorithm [
/*
Tuning continues but the skeleton is built, I see 'Gulliver' as a really High-Performance etude.
And not to be empty-handed here the Gulliver's swiftness is benchmarked on String(206,908,949bytes) as-one-line:
Pattern: fast
Railgun_Quadruplet_7sun performance: 1057KB/clock / 456%, 45330622 skips/iterations
Railgun_Quadruplet_7 performance: 0976KB/clock / 377%, 54788054 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 0649KB/clock / 480%, 43103056 skips/iterations
Railgun_Quadruplet_7deuce performance: 0564KB/clock / 389%, 53138919 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 0505KB/clock / 551%, 37541955 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 0780KB/clock / 300%, 68943184 skips/iterations
Boyer_Moore_Flensburg performance: 0486KB/clock / 377%, 54788139 skips/iterations
Pattern: faster
Railgun_Quadruplet_7sun performance: 1356KB/clock / 591%, 34996936 skips/iterations
Railgun_Quadruplet_7 performance: 1320KB/clock / 514%, 40194194 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 0771KB/clock / 656%, 31504148 skips/iterations
Railgun_Quadruplet_7deuce performance: 0651KB/clock / 567%, 36434006 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 0535KB/clock / 710%, 29101626 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 1195KB/clock / 498%, 41544613 skips/iterations
Boyer_Moore_Flensburg performance: 0684KB/clock / 514%, 40194282 skips/iterations
Pattern: fastest
Railgun_Quadruplet_7sun performance: 1554KB/clock / 687%, 30084306 skips/iterations
Railgun_Quadruplet_7 performance: 1519KB/clock / 599%, 34540430 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 0918KB/clock / 761%, 27188853 skips/iterations
Railgun_Quadruplet_7deuce performance: 0771KB/clock / 663%, 31175827 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 0627KB/clock / 818%, 25281493 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 1453KB/clock / 595%, 34744153 skips/iterations
Boyer_Moore_Flensburg performance: 0792KB/clock / 621%, 33278240 skips/iterations
Pattern: fastest fox
Railgun_Quadruplet_7sun performance: 1712KB/clock / 775%, 26672940 skips/iterations
Railgun_Quadruplet_7 performance: 1669KB/clock / 669%, 30925578 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 0962KB/clock / 912%, 22663583 skips/iterations
Railgun_Quadruplet_7deuce performance: 0808KB/clock / 797%, 25945709 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 0719KB/clock / 931%, 22213101 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 2126KB/clock / 977%, 21166516 skips/iterations
Boyer_Moore_Flensburg performance: 1074KB/clock / 669%, 30925649 skips/iterations
Pattern: fastest fox with biggest strides
Railgun_Quadruplet_7sun performance: 2658KB/clock / 1584%, 13060463 skips/iterations
Railgun_Quadruplet_7 performance: 2767KB/clock / 1511%, 13689243 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 1820KB/clock / 2138%, 09677267 skips/iterations
Railgun_Quadruplet_7deuce performance: 1669KB/clock / 2053%, 10075650 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 1554KB/clock / 2143%, 09652548 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 3157KB/clock / 2924%, 07074287 skips/iterations Stratosphere-borne!
Boyer_Moore_Flensburg performance: 1836KB/clock / 1554%, 13307181 skips/iterations
Pattern: fastest fox with biggest strides known to me
Railgun_Quadruplet_7sun performance: 2590KB/clock / 1548%, 13363356 skips/iterations
Railgun_Quadruplet_7 performance: 2694KB/clock / 1447%, 14292419 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 1924KB/clock / 2234%, 09259505 skips/iterations
Railgun_Quadruplet_7deuce performance: 1741KB/clock / 2011%, 10287584 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 1683KB/clock / 2240%, 09236188 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 3157KB/clock / 3832%, 05399116 skips/iterations Mesosphere-borne!
Boyer_Moore_Flensburg performance: 1741KB/clock / 1540%, 13431751 skips/iterations
Pattern: fastest fox with biggest strides known to me up to 2012 January 26 namely 'Gulliver'
Railgun_Quadruplet_7sun performance: 3108KB/clock / 2890%, 07159321 skips/iterations
Railgun_Quadruplet_7 performance: 3108KB/clock / 2742%, 07545141 skips/iterations
Railgun_Quadruplet_7sunhorse performance: 2590KB/clock / 4138%, 04999777 skips/iterations
Railgun_Quadruplet_7deuce performance: 2464KB/clock / 4029%, 05135444 skips/iterations
Railgun_Quadruplet_7Elsiane performance: 2557KB/clock / 4141%, 04995463 skips/iterations
Railgun_Quadruplet_7Gulliver performance: 3157KB/clock / 7218%, 02866192 skips/iterations Vacuum-borne!
Boyer_Moore_Flensburg performance: 2767KB/clock / 2745%, 07536097 skips/iterations
*/
//
// Revision: 2, 2012-Jan-30, the main disadvantage: the preprocessing overhead.
// Caution: For better speed the case 'if (cbPattern==1)' was removed, so Pattern must be longer than 1 char.
//
char * Railgun_Quadruplet_7Gulliver_count_hits (char * pbTarget, char * pbPattern, unsigned long cbTarget, unsigned long cbPattern)
{
char * pbTargetMax = pbTarget + cbTarget;
register unsigned long ulHashPattern;
register unsigned long ulHashTarget;
signed long count;
signed long countSTATIC;
unsigned char SINGLET;
unsigned long Quadruplet2nd;
unsigned long Quadruplet3rd;
unsigned long Quadruplet4th;
unsigned long AdvanceHopperGrass;
long i; //BMH needed
int a, j;
unsigned int bm_bc[256]; //BMH needed
unsigned int bm_bc2nd[256]; //BMS needed
unsigned char bm_Horspool_Order2[256*256]; //BMHSS(Elsiane) needed, 'char' limits patterns to 255, if 'long' then table becomes 256KB, grrr.
unsigned long Gulliver; // or unsigned char or unsigned short
if (cbPattern > cbTarget)
return(NULL);
if ( cbPattern<4) {
pbTarget = pbTarget+cbPattern;
ulHashPattern = ( (*(char *)(pbPattern))<<8 ) + *(pbPattern+(cbPattern-1));
if ( cbPattern==3) {
for ( ;; ) {
if ( ulHashPattern == ( (*(char *)(pbTarget-3))<<8 ) + *(pbTarget-1) ) {
if ( *(char *)(pbPattern+1) == *(char *)(pbTarget-2) ) Railgunhits++; //return((pbTarget-3));
}
if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) pbTarget++;
pbTarget++;
if (pbTarget > pbTargetMax)
return(NULL);
}
} else {
}
for ( ;; ) {
if ( ulHashPattern == ( (*(char *)(pbTarget-2))<<8 ) + *(pbTarget-1) )
Railgunhits++; //return((pbTarget-2));
if ( (char)(ulHashPattern>>8) != *(pbTarget-1) ) pbTarget++;
pbTarget++;
if (pbTarget > pbTargetMax)
return(NULL);
}
} else { //if ( cbPattern<4)
if (cbTarget<961) { // This value is arbitrary(don't know how exactly), it ensures(at least must) better performance than 'Boyer_Moore_Horspool'.
/*
// A better strstr, with no asm code
// Written by Mischa Sandberg
// http://mischasan.wordpress.com
// static char const *
// scanstrm(char const *tgt, char const *pat, int len)
// {
// uint32_t head = MSBF32(pat), wind = 0, next;
//
// pat += 4, len -= 4;
// while ((next = *(uint8_t const*)tgt++)) {
// wind = ( wind << 8 ) + next;
// if (wind == head && !memcmp(tgt, pat, len))
// return tgt - 4;
// }
// return NULL;
//}
ulHashPattern = 0;
ulHashPattern = ( ulHashPattern << 8 ) + *(uint8_t const*)pbPattern++;
ulHashPattern = ( ulHashPattern << 8 ) + *(uint8_t const*)pbPattern++;
ulHashPattern = ( ulHashPattern << 8 ) + *(uint8_t const*)pbPattern++;
ulHashPattern = ( ulHashPattern << 8 ) + *(uint8_t const*)pbPattern++;
AdvanceHopperGrass = 0;
cbPattern -= 4;
while ((ulHashTarget = *(uint8_t const*)pbTarget++)) {
AdvanceHopperGrass = ( AdvanceHopperGrass << 8 ) + ulHashTarget;
if (AdvanceHopperGrass == ulHashPattern && !memcmp(pbTarget, pbPattern, cbPattern))
Railgunhits++; //return pbTarget - 4;
}
return NULL;
*/
pbTarget = pbTarget+cbPattern;
ulHashPattern = *(unsigned long *)(pbPattern);
SINGLET = ulHashPattern & 0xFF;
Quadruplet2nd = SINGLET<<8;
Quadruplet3rd = SINGLET<<16;
Quadruplet4th = SINGLET<<24;
for ( ;; ) {
AdvanceHopperGrass = 0;
ulHashTarget = *(unsigned long *)(pbTarget-cbPattern);
if ( ulHashPattern == ulHashTarget ) { // Three unnecessary comparisons here, but 'AdvanceHopperGrass' must be calculated - it has a higher priority.
count = cbPattern-1;
while ( count && *(char *)(pbPattern+(cbPattern-count)) == *(char *)(pbTarget-count) ) {
if ( cbPattern-1==AdvanceHopperGrass+count && SINGLET != *(char *)(pbTarget-count) ) AdvanceHopperGrass++;
count--;
}
if ( count == 0) Railgunhits++; //return((pbTarget-cbPattern));
} else { // The goal here: to avoid memory accesses by stressing the registers.
if ( Quadruplet2nd != (ulHashTarget & 0x0000FF00) ) {
AdvanceHopperGrass++;
if ( Quadruplet3rd != (ulHashTarget & 0x00FF0000) ) {
AdvanceHopperGrass++;
if ( Quadruplet4th != (ulHashTarget & 0xFF000000) ) AdvanceHopperGrass++;
}
}
}
AdvanceHopperGrass++;
pbTarget = pbTarget + AdvanceHopperGrass;
if (pbTarget > pbTargetMax)
return(NULL);
}
} else { //if (cbTarget<961)
countSTATIC = cbPattern-2-2;
for (a=0; a < 256; a++) {bm_bc[a]=cbPattern; bm_bc2nd[a]=cbPattern+1;}
for (j=0; j < cbPattern-1; j++) bm_bc[pbPattern[j]]=cbPattern-j-1;
for (j=0; j < cbPattern; j++) bm_bc2nd[pbPattern[j]]=cbPattern-j;
ulHashPattern = *(unsigned long *)(pbPattern); // First four bytes
//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
AdvanceHopperGrass = 0;
i=0;
// Elsiane r.2 [
for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized
// alfalfa 7 long 6 BBs (al lf fa al lf fa) 3 distinct BBs (al lf fa)
// fast 4 0-1-2 fa as st
for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=j; // Rightmost appearance/position is needed
// Elsiane r.2 ]
while (i <= cbTarget-cbPattern-1) { // -1 because Sunday is used
Gulliver = bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]];
if ( Gulliver == cbPattern-2 ) { // CASE #1: means the pair (char order 2) is found
if ( *(unsigned long *)&pbTarget[i] == ulHashPattern) {
count = countSTATIC; // Last two chars already matched, to be fixed with -2
while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
count--;
if ( count == 0) Railgunhits++; //return(pbTarget+i);
}
//i = i + 1; // r.1, obviuosly this is the worst skip so turning to 'SunHorse': lines below
if ( bm_bc[(unsigned char)pbTarget[i+cbPattern-1]] < bm_bc2nd[(unsigned char)pbTarget[i+(cbPattern)]] )
Gulliver = bm_bc2nd[(unsigned char)pbTarget[i+(cbPattern)]];
else
Gulliver = bm_bc[(unsigned char)pbTarget[i+cbPattern-1]];
} else if ( Gulliver != cbPattern-1 ) // CASE #2: if equal means the pair (char order 2) is not found i.e. Gulliver remains intact, skip the whole pattern and fall back (Order-1) chars i.e. one char for Order 2
Gulliver = cbPattern - Gulliver - 2; // CASE #3: the pair is found and not as suffix i.e. rightmost position
i = i + Gulliver;
// 32323218 Order 1 Horspool Skip-table A
// 01234568 Order 1 Horspool Skip-table B
// fa af fa af fa as st Order 2 Horspool Skip-table B
// 0 1 2 3 4 5 6
// HIKARIfast
// fafafast
// fafafast +2 Order 1 'a' vs 't'
// fafafast +2 = (cbPattern-SkipB-Order = 8-5-1 = 2) Order 1 'a' vs 't'
// fafafast +2 = (cbPattern-SkipB-Order = 8-4-2 = 2) Order 2 'fa' vs 'st' i.e. CASE #3
// 76543218 Order 1 Horspool
// lo on ng gp pa ac ce Order 2 Horspool
// 0 1 2 3 4 5 6
// HIKARIfast
// longpace
// longpace +2 Order 1 'a' vs 'e'
// longpace +7 = (cbPattern-(Order-1) = 8-(2-1) = 7) Order 2 'fa' vs 'ce' i.e. CASE #2
AdvanceHopperGrass++;
}
if (i == cbTarget-cbPattern) {
if ( *(unsigned long *)&pbTarget[i] == ulHashPattern) {
count = countSTATIC;
while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
count--;
if ( count == 0) Railgunhits++; //return(pbTarget+i);
}
AdvanceHopperGrass++;
}
GlobalSP += (int)((double)cbTarget/AdvanceHopperGrass*100);
GlobalI += AdvanceHopperGrass;
printf("Skip-Performance(bigger-the-better): %d%%, %d skips/iterations\n",(int)((double)cbTarget/AdvanceHopperGrass*100), AdvanceHopperGrass);
return(NULL);
} //if (cbTarget<961)
} //if ( cbPattern<4)
}
// ### Mix(2in1) of Karp-Rabin & Boyer-Moore-Sunday-Horspool algorithm ]