Hi Amir, it is good to see fellow member wrestling with texts, I myself still struggle with text searches.
As you know every approach/etude has its merits, currently you use malloc() and strcpy(), consider to get rid of them - they are speed breaks.
You may try your current code versus the plain strstr() one, compilers use SSE2, so searches for short needles (2,3 in particular) are superfast.
I expect the naive variant using memmem(), in fact strstr() is not useful, to be much faster than yours.
Have you tried simplified (only deletion no replace and no insertion) Levenshtein Distance approach?
Below you can use my old code to write such searcher:
#define _WIN32_ENVIRONMENT_
#define MaxLineLength 128
#define KAZE_tolower(c) ( (((c) >= 'A') && ((c) <= 'Z')) ? ((c) - 'A' + 'a') : (c) )
#define KAZE_toupper(c) ( (((c) >= 'a') && ((c) <= 'z')) ? ((c) - 'a' + 'A') : (c) )
#if defined(_WIN32ASM_)
unsigned int abs_AF (int n) {
__asm {
mov eax, n
cdq
xor eax, edx
sub eax, edx
} }
unsigned int min_AF (int a, int b, int c) {
__asm {
mov eax, a
mov ebx, b
sub eax, ebx
sbb edx, edx
and edx, eax
add ebx, edx
mov eax, c
sub eax, ebx
sbb edx, edx
and edx, eax
add ebx, edx
mov eax, ebx
}
}
#endif
long KAZE_strlenLF (const char * str)
{
const char *eos = str;
char LFa[1];
LFa[0] = 10; while( *eos++ != LFa[0] ) ;
return( (int)(eos - str - 1) );
}
void x64toaKAZE (
unsigned long long val,
char *buf,
unsigned radix,
int is_neg
)
{
char *p;
char *firstdig;
char temp;
unsigned digval;
p = buf;
if ( is_neg )
{
*p++ = '-';
val = (unsigned long long)(-(long long)val);
}
firstdig = p;
do {
digval = (unsigned) (val % radix);
val /= radix;
if (digval > 9)
*p++ = (char) (digval - 10 + 'a');
else
*p++ = (char) (digval + '0');
} while (val > 0);
*p-- = '\0';
do {
temp = *p;
*p = *firstdig;
*firstdig = temp;
--p;
++firstdig;
} while (firstdig < p);
}
char * _i64toaKAZE (
long long val,
char *buf,
int radix
)
{
x64toaKAZE((unsigned long long)val, buf, radix, (radix == 10 && val < 0));
return buf;
}
char * _ui64toaKAZE (
unsigned long long val,
char *buf,
int radix
)
{
x64toaKAZE(val, buf, radix, 0);
return buf;
}
char * _ui64toaKAZEzerocomma (
unsigned long long val,
char *buf,
int radix
)
{
char *p;
char temp;
int txpman;
int pxnman;
x64toaKAZE(val, buf, radix, 0);
p = buf;
do {
} while (*++p != '\0');
p--; buf[26] = 0;
txpman = 1;
pxnman = 0;
do
{ if (buf <= p)
{ temp = *p;
buf[26-txpman] = temp; pxnman++;
p--;
if (pxnman % 3 == 0)
{ txpman++;
buf[26-txpman] = (char) (',');
}
}
else
{ buf[26-txpman] = (char) ('0'); pxnman++;
if (pxnman % 3 == 0)
{ txpman++;
buf[26-txpman] = (char) (',');
}
}
txpman++;
} while (txpman <= 26);
return buf;
}
char * _ui64toaKAZEcomma (
unsigned long long val,
char *buf,
int radix
)
{
char *p;
char temp;
int txpman;
int pxnman;
x64toaKAZE(val, buf, radix, 0);
p = buf;
do {
} while (*++p != '\0');
p--; buf[26] = 0;
txpman = 1;
pxnman = 0;
while (buf <= p)
{ temp = *p;
buf[26-txpman] = temp; pxnman++;
p--;
if (pxnman % 3 == 0 && buf <= p)
{ txpman++;
buf[26-txpman] = (char) (',');
}
txpman++;
}
return buf+26-(txpman-1);
}
#if defined(_WIN32_ENVIRONMENT_)
#include <io.h> // needed for Windows' 'lseeki64' and 'telli64'
#else
#endif /* defined(_WIN32_ENVIRONMENT_) */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
int Levenshtein[MaxLineLength+1][MaxLineLength+1]; #define MIN(x,y) ((x) < (y) ? (x) : (y))
#define MAX(x,y) ((x) > (y) ? (x) : (y))
int main(int argc, char **argv) {
FILE *fp_inLINE;
FILE *fp_outLINE;
unsigned char workK[1024*32];
signed int workKoffset = -1;
unsigned long long FilesLEN;
unsigned long long k, k2, k3;
unsigned int LINE10len, wrdlen;
#if defined(_WIN32_ENVIRONMENT_)
unsigned long long size_inLINESIXFOUR;
#else
size_t size_inLINESIXFOUR;
#endif /* defined(_WIN32_ENVIRONMENT_) */
unsigned char wrd[MaxLineLength+1+1]; unsigned char wrdCACHED[MaxLineLength+1+1]; unsigned char workbyte;
time_t t1, t2, t3;
unsigned long long TotalLines=0;
unsigned long long WordsChecked=0;
unsigned long long DumpedLines=0;
unsigned int AtMostLevenshteinDistance;
unsigned int SkipHeuristic;
unsigned int StartingPosition;
char llTOaDigits[27]; char llTOaDigits2[27]; char llTOaDigits3[27];
int i,j,m,n;
char s[] = "sitting";
char t[] = "kitten";
m = strlen(s);
n = strlen(t);
printf("Galadriel, an x-gram suggesteress using Wagner-Fischer Levenshtein Distance, revision 1+++, copyleft Sanmayce 2013-Jan-21.\n");
if (argc != 4) {
printf("Usage: Galadriel AtMostLevenshteinDistance xgram xgramfile\n");
printf("Note: Longest xgram could be %d chars.\n", MaxLineLength);
printf("Example1: E:\\>Galadriel 0 ramjet MASAKARI_General-Purpose_Grade_English_Wordlist_r3_316423_words.wrd\n");
printf("Example2: E:\\>Galadriel 3 psychedlicize MASAKARI_General-Purpose_Grade_English_Wordlist_r3_316423_words.wrd\n");
} else {
(void) time(&t1);
AtMostLevenshteinDistance = atoi(argv[1]);
n = strlen(argv[2]);
if (n>MaxLineLength)
{ printf( "Galadriel: Incoming xgram exceeding the limit.\n" ); return( 1 ); }
memset (wrdCACHED, 0, MaxLineLength+1+1);
for(i=0;i<=MaxLineLength;i++)
Levenshtein[i][0] = i;
for(j=0;j<=MaxLineLength;j++)
Levenshtein[0][j] = j;
if( ( fp_inLINE = fopen( argv[3], "rb" ) ) == NULL )
{ printf( "Galadriel: Can't open %s file.\n", argv[3] ); return( 1 ); }
if( ( fp_outLINE = fopen( "Galadriel.txt", "wb" ) ) == NULL )
{ printf( "Galadriel: Can't open Galadriel.txt file.\n" ); return( 1 ); }
#if defined(_WIN32_ENVIRONMENT_)
_lseeki64( fileno(fp_inLINE), 0L, SEEK_END );
size_inLINESIXFOUR = _telli64( fileno(fp_inLINE) );
_lseeki64( fileno(fp_inLINE), 0L, SEEK_SET );
#else
fseeko( fp_inLINE, 0L, SEEK_END );
size_inLINESIXFOUR = ftello( fp_inLINE );
fseeko( fp_inLINE, 0L, SEEK_SET );
#endif /* defined(_WIN32_ENVIRONMENT_) */
FilesLEN = size_inLINESIXFOUR;
wrdlen = 0;
for( k = 0; k < size_inLINESIXFOUR; k++ )
{
if (workKoffset == -1) {
if (k + 1024*32 < size_inLINESIXFOUR) {
fread( &workK[0], 1, 1024*32, fp_inLINE );
workKoffset = 0;
workbyte = workK[workKoffset];
} else
fread( &workbyte, 1, 1, fp_inLINE );
} else {
workKoffset++;
workbyte = workK[workKoffset];
if (workKoffset == 1024*32 - 1) workKoffset = -1;
}
if( wrdlen < MaxLineLength +1+1)
{ wrd[ wrdlen ] = workbyte; } if (workbyte == 10) {
TotalLines++;
if ( 0 < wrdlen && wrdlen < MaxLineLength +1+1)
{
wrd[ wrdlen ] = 0;
if ( wrd[ wrdlen-1 ] == 13 ) wrd[ --wrdlen ] = 0;
m = wrdlen; if (m>MaxLineLength)
{ printf( "Galadriel: Incoming xgram exceeding the limit.\n" ); return( 1 ); }
#if defined(_WIN32ASM_)
if (AtMostLevenshteinDistance >= abs_AF(m-n))
#else
if (AtMostLevenshteinDistance >= MAX(m,n)-MIN(m,n)) #endif
{
StartingPosition = 1;
while ( wrdCACHED[StartingPosition-1] && wrdCACHED[StartingPosition-1]==wrd[StartingPosition-1]) StartingPosition++;
StartingPosition = MIN(StartingPosition, i);
WordsChecked++;
SkipHeuristic=0;
for(i=StartingPosition;i<=m;i++) { for (j=1;j<=n;j++) {
if(wrd[i-1] == argv[2][j-1])
Levenshtein[i][j] = Levenshtein[i-1][j-1];
else
#if defined(_WIN32ASM_)
Levenshtein[i][j] = min_AF(Levenshtein[i-1][j]+1, Levenshtein[i][j-1]+1, Levenshtein[i-1][j-1]+1); #else
Levenshtein[i][j] = MIN(MIN((Levenshtein[i-1][j]+1),(Levenshtein[i][j-1]+1)),(Levenshtein[i-1][j-1]+1)); #endif
}
if ( Levenshtein[i][n] - (m-i) >= 0 && AtMostLevenshteinDistance < Levenshtein[i][n] - (m-i) ) {SkipHeuristic=1; break;} }
if (SkipHeuristic==0 && AtMostLevenshteinDistance >= Levenshtein[m][n]) {
fprintf( fp_outLINE, "%s\n", wrd); DumpedLines++;
if ((DumpedLines & 0xff) == 0xff)
fflush(fp_outLINE); }
memcpy(wrdCACHED, wrd, wrdlen+1); }
}
wrdlen = 0;
}
else wrdlen++;
}
fclose(fp_inLINE);
fclose(fp_outLINE);
(void) time(&t3);
if (t3 <= t1) {t3 = t1; t3++;}
printf( "Galadriel: Total/Checked/Dumped xgrams: %s/%s/%s\n", _ui64toaKAZEcomma(TotalLines, llTOaDigits3, 10), _ui64toaKAZEcomma(WordsChecked, llTOaDigits2, 10), _ui64toaKAZEcomma(DumpedLines, llTOaDigits, 10) );
printf( "Galadriel: Performance: %s xgrams/s\n", _ui64toaKAZEcomma((TotalLines)/((int) t3-t1), llTOaDigits, 10) );
}
exit (0);
}
Hope it can be of use to your attempts.
Edit:
In case of not having memmem(), Windows lacks it, here is mine:
char * Railgun_Doublet (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern)
{
char * pbTargetMax = pbTarget + cbTarget;
register uint32_t ulHashPattern;
uint32_t ulHashTarget, count, countSTATIC;
if (cbPattern > cbTarget) return(NULL);
countSTATIC = cbPattern-2;
pbTarget = pbTarget+cbPattern;
ulHashPattern = (*(uint16_t *)(pbPattern));
for ( ;; ) {
if ( ulHashPattern == (*(uint16_t *)(pbTarget-cbPattern)) ) {
count = countSTATIC;
while ( count && *(char *)(pbPattern+2+(countSTATIC-count)) == *(char *)(pbTarget-cbPattern+2+(countSTATIC-count)) ) {
count--;
}
if ( count == 0 ) return((pbTarget-cbPattern));
}
pbTarget++;
if (pbTarget > pbTargetMax) return(NULL);
}
}
modified 23-Feb-15 14:50pm.
|