/* Program for Bad Character Heuristic of Boyer Moore String Matching Algorithm */ # include <limits.h> # include <string.h> # include <vector> #include <stdio.h> #include <iostream> #include <fstream> #include <omp.h> # define NO_OF_CHARS 256 using namespace std; // An unsigned char can store 1 Bytes (8bits) of data (0-255) typedef unsigned char BYTE; // A utility function to get maximum of two integers int max (int a, int b) { return (a > b)? a: b; } // The preprocessing function for Boyer Moore's bad character heuristic void badCharHeuristic( char *str, int size, int badchar[NO_OF_CHARS]) { int i; // Initialize all occurrences as -1 for (i = 0; i < NO_OF_CHARS; i++) badchar[i] = -1; // Fill the actual value of last occurrence of a character for (i = 0; i < size; i++) badchar[(int) str[i]] = i; } /* A pattern searching function that uses Bad Character Heuristic of Boyer Moore Algorithm */ void search( char *txt, string *Keywords,long long Pos,int &KeyWordCount,vector< vector<long long> > &Offsets) { char *pat; for (int i=0;i<KeyWordCount;i++) { pat=(char*)Keywords[i].c_str(); int m = strlen(pat); int n = strlen(txt); //cout<<"\n"<<pat<<"\n"; int badchar[NO_OF_CHARS]; /* Fill the bad character array by calling the preprocessing function badCharHeuristic() for given pattern */ badCharHeuristic(pat, m, badchar); int s = 0; // s is shift of the pattern with respect to text while(s <= (n - m)) { int j = m-1; /* Keep reducing index j of pattern while characters of pattern and text are matching at this shift s */ while(j >= 0 && pat[j] == txt[s+j]) j--; /* If the pattern is present at current shift, then index j will become -1 after the above loop */ if (j < 0) { //printf("\nFound @ Offset %d", Pos+s); if(!Offsets[i].empty()) //if Any Offset Exist For Keyword if(Offsets[i].back()==(Pos+s)) //Check it with last offset for duplication Offsets[i].pop_back(); //If duplicates, pop out Offsets[i].push_back((long long)(Pos+s)); //Push the offset //cout<<"\n '"<<pat<<"' Found @ Offset:"<<(Pos+s); /* Shift the pattern so that the next character in text aligns with the last occurrence of it in pattern. The condition s+m < n is necessary for the case when pattern occurs at the end of text */ s += (s+m < n)? m-badchar[txt[s+m]] : 1; } else /* Shift the pattern so that the bad character in text aligns with the last occurrence of it in pattern. The max function is used to make sure that we get a positive shift. We may get a negative shift if the last occurrence of bad character in pattern is on the right side of the current character. */ s += max(1, j - badchar[txt[s+j]]); } } } long long FileSize(const char* sFileName) { std::ifstream f; f.open(sFileName, std::ios_base::binary | std::ios_base::in); if (!f.good() || f.eof() || !f.is_open()) { return 0; } f.seekg(0, std::ios_base::beg); std::ifstream::pos_type begin_pos = f.tellg(); f.seekg(0, std::ios_base::end); return static_cast<long long>(f.tellg() - begin_pos); } /* The Program read the File and Search For Multiple Keyword. During Searching the occurrence of each Keyword is Stored in a Seperate Vector The Program Currently Displays the Hit Count of each Keyword using its Vector Size */ int main() { /*---------------------------------*/ /* Arguments to the Function, these values need to be updated */ //Give the File Location const char *filePath = "xyz.raw"; // int KeyWordCount=2; //Define the Searching Keywords; string *Keywords=new string[KeyWordCount]; Keywords[2]="gmail"; Keywords[0]="facebook"; int MaxKeyLen=0; //Define a Vector For Storing Offsets of Keyword occurrence vector< vector<long long> > Offsets; for(int i=0;i<KeyWordCount;i++) { Offsets.push_back(vector<long long>()); //Initialise Vector if(Keywords[i].size()> MaxKeyLen) MaxKeyLen=Keywords[i].size(); } cout<<"Max Length Of Keyword: "<<MaxKeyLen<<"\n"; /* --------------------------------------------*/ FILE *file = NULL; // File pointer int ChunkSize=512; // Declare the Blocks Size ie read from File // Current File Pointer // Open the file in binary mode using the "rb" format string // This also checks if the file exists and/or can be opened for reading correctly if ((file = fopen(filePath, "rb")) == NULL) cout << "Could not open specified file" << endl; else cout << "File opened successfully" << endl; //off_t fileSize = getfilesize(file); long long fileSize = FileSize(filePath); cout<<"FileSize:"<<fileSize<<endl; cout<<"Searching :\n"; //Process the File as Blocks int Currlevel; int w; #pragma omp parallel for for(int Pos=0;Pos<=fileSize; Pos+=ChunkSize-MaxKeyLen) { //Advance the pointer , here the MaxKeyLen is Subtracted //if(Pos!=0) //to skip keyword Skipping duringreading int x; // a variable local or private to each thread char *fileBuf; // Allocate space in the buffer for the Specified Block Size try { fileBuf = new char[ChunkSize]; } catch(...) { cout<<"Memory allocation exception"<<endl; //return 0; } x = omp_get_thread_num(); cout<<"Thread="<<x<<"Pos:"<<Pos<<"\n"; fread(fileBuf, ChunkSize, 1, file); //Read the File into Buffer fileBuf[ChunkSize]='\0'; //put a String Terminator For Search search(fileBuf,Keywords,Pos,KeyWordCount,Offsets); //Record the Hit delete[]fileBuf; //delete the Buffer w++; fseek(file,Pos,SEEK_SET); //Seek the file pointer //std::cin.get(); } //#pragma omp barrier //cout<<"\r"<<flush<<"Completed"; //Refresh the Screen /* Print the Search Result */ cout<<"\n\t***Keyword Result*** \n \tKeyword :\t Count"; for(int i=0;i<KeyWordCount;i++) cout<<"\n\t"<<Keywords[i]<<" :\t "<<Offsets[i].size()<<""; //Print Each Keyword and Its Seach Hit Count using Size of its Vector cout<<"\n i="<<w; /* --------------------------------------------*/ delete []Keywords; //delete the Buffer Offsets.clear(); fclose(file); //Close the file return 0; }
var
This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)