Click here to Skip to main content
15,893,663 members
Articles / General Programming / Optimization

Fastest strstr-like function in C!?

Rate me:
Please Sign up or sign in to vote.
4.81/5 (39 votes)
22 Aug 2016CPOL38 min read 273.2K   3.5K   70  
Tuned function for searching a needle in a haystack
#define _rotl_KAZE(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define HaystackThresholdSekireiPiri 961 // Quadruplet works up to this value, if bigger then BMH2 takes over.
#define NeedleThresholdBIGSekireiPiri 12+180 // Should be bigger than 'HasherezadeOrder'. BMH2 works up to this value (inclusive).
#define HashTableSizeSekireiPiri 17-1 // In fact the real size is -3, because it is BITwise, when 17-3=14 it means 16KB, (17-1)-3=13 it means 8KB.
char * Railgun_Sekireigan_Piri (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern)
{
	char * pbTargetMax = pbTarget + cbTarget;
	register uint32_t ulHashPattern;
	register uint32_t ulHashTarget;
	signed long count;
	signed long countSTATIC;

	unsigned char SINGLET;
	uint32_t Quadruplet2nd;
	uint32_t Quadruplet3rd;
	uint32_t Quadruplet4th;

	uint32_t AdvanceHopperGrass;

	long i; //BMH needed
	int a, j;
	unsigned char bm_Horspool_Order2[256*256]; // BMHSS(Elsiane) needed, 'char' limits patterns to 255, if 'long' then table becomes 256KB, grrr.
	uint32_t Gulliver; // or unsigned char or unsigned short

	unsigned char bm_Hasherezade_HASH[1<<(HashTableSizeSekireiPiri-3)];
	uint32_t hash32;
	uint32_t hash32B;
	uint32_t hash32C;

	if (cbPattern > cbTarget) return(NULL);

	if ( cbPattern<4 ) { 

        	pbTarget = pbTarget+cbPattern;
		ulHashPattern = ( (*(char *)(pbPattern))<<8 ) + *(pbPattern+(cbPattern-1));
		if ( cbPattern==3 ) {
			for ( ;; ) {
				if ( ulHashPattern == ( (*(char *)(pbTarget-3))<<8 ) + *(pbTarget-1) ) {
					if ( *(char *)(pbPattern+1) == *(char *)(pbTarget-2) ) return((pbTarget-3));
				}
				if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) { 
					pbTarget++;
					if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) pbTarget++;
				}
				pbTarget++;
				if (pbTarget > pbTargetMax) return(NULL);
			}
		} else {
		}
		for ( ;; ) {
			if ( ulHashPattern == ( (*(char *)(pbTarget-2))<<8 ) + *(pbTarget-1) ) return((pbTarget-2));
			if ( (char)(ulHashPattern>>8) != *(pbTarget-1) ) pbTarget++;
			pbTarget++;
			if (pbTarget > pbTargetMax) return(NULL);
		}

	} else {
		if (cbTarget<HaystackThresholdSekireiPiri) { // This value is arbitrary (don't know how exactly), it ensures (at least must) better performance than 'Boyer_Moore_Horspool'.

		pbTarget = pbTarget+cbPattern;
		ulHashPattern = *(uint32_t *)(pbPattern);
		SINGLET = ulHashPattern & 0xFF;
		Quadruplet2nd = SINGLET<<8;
		Quadruplet3rd = SINGLET<<16;
		Quadruplet4th = SINGLET<<24;
		for ( ;; ) {
			AdvanceHopperGrass = 0;
			ulHashTarget = *(uint32_t *)(pbTarget-cbPattern);
			if ( ulHashPattern == ulHashTarget ) { // Three unnecessary comparisons here, but 'AdvanceHopperGrass' must be calculated - it has a higher priority.
				count = cbPattern-1;
				while ( count && *(char *)(pbPattern+(cbPattern-count)) == *(char *)(pbTarget-count) ) {
					if ( cbPattern-1==AdvanceHopperGrass+count && SINGLET != *(char *)(pbTarget-count) ) AdvanceHopperGrass++;
					count--;
				}
				if ( count == 0) return((pbTarget-cbPattern));
			} else { // The goal here: to avoid memory accesses by stressing the registers.
				if ( Quadruplet2nd != (ulHashTarget & 0x0000FF00) ) {
					AdvanceHopperGrass++;
					if ( Quadruplet3rd != (ulHashTarget & 0x00FF0000) ) {
						AdvanceHopperGrass++;
						if ( Quadruplet4th != (ulHashTarget & 0xFF000000) ) AdvanceHopperGrass++;
					}
				}
			}
			AdvanceHopperGrass++;
			pbTarget = pbTarget + AdvanceHopperGrass;
			if (pbTarget > pbTargetMax) return(NULL);
		}

		} else { //if (cbTarget<HaystackThresholdSekireiPiri)

	if ( cbPattern<=NeedleThresholdBIGSekireiPiri ) { 

			countSTATIC = cbPattern-2-2;
			ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
			//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
			i=0;
			//for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized
			//for (j=0; j < cbPattern-1; j++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+j)]=j; // Rightmost appearance/position is needed
			for (a=0; a < 256*256; a++) {bm_Horspool_Order2[a]=0;}
			// In line below we "hash" 4bytes to 2bytes i.e. 16bit table, how to compute TOTAL number of BBs, 'cbPattern - Order + 1' is the number of BBs for text 'cbPattern' bytes long, for example, for cbPattern=11 'fastest fox' and Order=4 we have BBs = 11-4+1=8:
			//"fast"
			//"aste"
			//"stes"
			//"test"
			//"est "
			//"st f"
			//"t fo"
			//" fox"
			//for (j=0; j < cbPattern-4+1; j++) bm_Horspool_Order2[( *(unsigned short *)(pbPattern+j+0) + *(unsigned short *)(pbPattern+j+2) ) & ( (1<<16)-1 )]=1;
			for (j=0; j < cbPattern-4+1; j++) bm_Horspool_Order2[( (*(uint32_t *)(pbPattern+j+0)>>16)+(*(uint32_t *)(pbPattern+j+0)&0xFFFFF) ) & ( (1<<16)-1 )]=1;
			while (i <= cbTarget-cbPattern) {
				Gulliver = 1;
				//if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) {
				//if ( bm_Horspool_Order2[( *(unsigned short *)&pbTarget[i+cbPattern-1-1-2] + *(unsigned short *)&pbTarget[i+cbPattern-1-1] ) & ( (1<<16)-1 )] != 0 ) {
				//  001d2 0f b7 6c 01 fc   movzx ebp, WORD PTR [-4+ecx+eax]       
				//  001d7 0f b7 5c 01 fe   movzx ebx, WORD PTR [-2+ecx+eax]       
				//  001dc 03 eb            add ebp, ebx                           
				// TO-DO: The above line should be with one only memory access i.e. to fetch 4 bytes at once and then to mix the low and high part.
				if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]&0xFFFFF) ) & ( (1<<16)-1 )] != 0 ) {
					//if ( Gulliver == 1 ) { // Means the Building-Block order 2 is found somewhere i.e. NO MAXIMUM SKIP
						if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
							count = countSTATIC; // Last two chars already matched, to be fixed with -2
							while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
								count--;
							if ( count == 0) return(pbTarget+i);
						}
					//}
					// Trying to strengthen the Skip performance... here ONLY one additional lookup, for better/longer jumps more such lookups, unrolled to be added. 
					// Since 'Piri' revision not trying anymore, instead trying to reduce main loop size while going to order 4.
					//if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2;
				} else Gulliver = cbPattern-(2-1)-2; // -2 because we check the 4 rightmost bytes not 2.
				i = i + Gulliver;
				//GlobalI++; // Comment it, it is only for stats.
			}
			return(NULL);
// The above loop in Assembly:
/*
; mark_description "Intel(R) C++ Compiler XE for applications running on IA-32, Version 12.1.1.258 Build 20111011";
; mark_description "-O3 -FAcs -Festrstr_SHORT-SHOWDOWN_LV_WIKI_32bit_IntelV12";
.B2.29:                         
  001d2 8b 5c 01 fc      mov ebx, DWORD PTR [-4+ecx+eax]        
  001d6 8b eb            mov ebp, ebx                           
  001d8 c1 ed 10         shr ebp, 16                            
  001db 03 eb            add ebp, ebx                           
  001dd 0f b7 fd         movzx edi, bp                          
  001e0 0f b6 1c 3c      movzx ebx, BYTE PTR [esp+edi]          
  001e4 85 db            test ebx, ebx                          
  001e6 74 52            je .B2.38 ; Prob 50%                   
.B2.30:                         
  001e8 3b 14 06         cmp edx, DWORD PTR [esi+eax]           
  001eb 0f 85 3c 04 00 
        00               jne .B2.83 
.B2.31:                         
  001f1 8b 9c 24 08 00 
        01 00            mov ebx, DWORD PTR [65544+esp]         
  001f8 8b eb            mov ebp, ebx                           
  001fa f7 dd            neg ebp                                
  001fc 85 db            test ebx, ebx                          
  001fe 0f 84 21 04 00 
        00               je .B2.82 
.B2.32:                         
  00204 8b bc 24 04 00 
        01 00            mov edi, DWORD PTR [65540+esp]         
  0020b 8b b4 24 00 00 
        01 00            mov esi, DWORD PTR [65536+esp]         
  00212 03 f8            add edi, eax                           
.B2.33:                         
  00214 8a 54 35 04      mov dl, BYTE PTR [4+ebp+esi]           
  00218 3a 54 3d 04      cmp dl, BYTE PTR [4+ebp+edi]           
  0021c 0f 85 ee 03 00 
        00               jne .B2.81 
.B2.34:                         
  00222 45               inc ebp                                
  00223 4b               dec ebx                                
  00224 75 ee            jne .B2.33 
.B2.35:                         
  00226 8b b4 24 2c 00 
        01 00            mov esi, DWORD PTR [65580+esp]         
.B2.36:                         
  0022d 03 c6            add eax, esi                           
  0022f 81 c4 18 00 01 
        00               add esp, 65560                         
  00235 5d               pop ebp                                
  00236 5b               pop ebx                                
  00237 5f               pop edi                                
  00238 5e               pop esi                                
  00239 c3               ret                                    
.B2.38:                         
  0023a 8b 9c 24 10 00 
        01 00            mov ebx, DWORD PTR [65552+esp]         
.B2.39:                         
  00241 03 c3            add eax, ebx                           
  00243 3b 84 24 14 00 
        01 00            cmp eax, DWORD PTR [65556+esp]         
  0024a 76 86            jbe .B2.29 
.B2.40:                         
  0024c 33 c0            xor eax, eax                           
  0024e 81 c4 18 00 01 
        00               add esp, 65560                         
  00254 5d               pop ebp                                
  00255 5b               pop ebx                                
  00256 5f               pop edi                                
  00257 5e               pop esi                                
  00258 c3               ret                                    
.B2.41:                         
...
.B2.81:                         
  00610 89 b4 24 00 00 
        01 00            mov DWORD PTR [65536+esp], esi         
  00617 8b 94 24 0c 00 
        01 00            mov edx, DWORD PTR [65548+esp]         
  0061e 8b b4 24 2c 00 
        01 00            mov esi, DWORD PTR [65580+esp]         
.B2.82:                         
  00625 85 db            test ebx, ebx                          
  00627 0f 84 00 fc ff 
        ff               je .B2.36 
.B2.83:                         
  0062d bb 01 00 00 00   mov ebx, 1                             
  00632 e9 0a fc ff ff   jmp .B2.39 
; The whole section (main loop plus the exit) is 00258-001d2+1 + 00632-00610+5 = 135+39 bytes long.
*/
	} else { // if ( cbPattern<=NeedleThresholdBIGSekireiPiri )
// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() [
			countSTATIC = cbPattern-2-2;

			ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
			//ulHashTarget = *(unsigned short *)(pbPattern+cbPattern-1-1); // Last two bytes
		
			i=0;

			for (a=0; a < 1<<(HashTableSizeSekireiPiri-3); a++) {bm_Hasherezade_HASH[a]= 0;} // to-do: 'memset' if not optimized
			// cbPattern - Order + 1 i.e. number of BBs for 11 'fastest fox' 11-8+1=4: 'fastest ', 'astest f', 'stest fo', 'test fox'
			for (j=0; j < cbPattern-12+1; j++) {
				hash32 = (2166136261 ^ *(uint32_t *)(pbPattern+j+0)) * 709607;        
				hash32B = (2166136261 ^ *(uint32_t *)(pbPattern+j+4)) * 709607;        
				hash32C = (2166136261 ^ *(uint32_t *)(pbPattern+j+8)) * 709607;        
				hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;
				hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;
				hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekireiPiri))-1 );
				bm_Hasherezade_HASH[hash32>>3]= bm_Hasherezade_HASH[hash32>>3] | (1<<(hash32&0x7));
			}

			while (i <= cbTarget-cbPattern) {
				Gulliver = 1; // Assume minimal jump as initial value.
				// The goal: to jump when the rightmost 8bytes (Order 8 Horspool) of window do not look like any of Needle prefixes i.e. are not to be found. This maximum jump equals cbPattern-(Order-1) or 11-(8-1)=4 for 'fastest fox' - a small one but for Needle 31 bytes the jump equals 31-(8-1)=24
				//GlobalHashSectionExecution++; // Comment it, it is only for stats.
					hash32 = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+0)) * 709607;        
					hash32B = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+4)) * 709607;        
					hash32C = (2166136261 ^ *(uint32_t *)(pbTarget+i+cbPattern-12+8)) * 709607;        
					hash32 = (hash32 ^ _rotl_KAZE(hash32C,5) ) * 709607;
					hash32 = (hash32 ^ _rotl_KAZE(hash32B,5) ) * 709607;
					hash32 = ( hash32 ^ (hash32 >> 16) ) & ( (1<<(HashTableSizeSekireiPiri))-1 );
					if ( (bm_Hasherezade_HASH[hash32>>3] & (1<<(hash32&0x7))) ==0 )
						Gulliver = cbPattern-(12-1);

				if ( Gulliver == 1 ) { // Means the Building-Block order 8/12 is found somewhere i.e. NO MAXIMUM SKIP
					if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
						count = countSTATIC;
						while ( count !=0 && *(char *)(pbPattern+(countSTATIC-count)+4) == *(char *)(&pbTarget[i]+(countSTATIC-count)+4) )
							count--;
						if ( count == 0) return(pbTarget+i);
					}
				}
					i = i + Gulliver;
				//GlobalI++; // Comment it, it is only for stats.
			} // while (i <= cbTarget-cbPattern)
			return(NULL);
// MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() MEMMEM() ]
	} // if ( cbPattern<=NeedleThresholdBIGSekireiPiri )
		} //if (cbTarget<HaystackThresholdSekireiPiri)
	} //if ( cbPattern<4 )
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Other
Bulgaria Bulgaria
A Bulgarian old boy interested in search techniques, nothing special.

Comments and Discussions