Click here to Skip to main content
15,893,814 members
Articles / Programming Languages / C#

Anatomy of a relevant search engine (part 1)

Rate me:
Please Sign up or sign in to vote.
4.88/5 (20 votes)
4 Jun 2009CPOL23 min read 177.1K   1.4K   105  
Information retrieval, semantic search relevance and ranking. About anatomy of a search engine. The simplest search engine source code.

// simplest search engine - parse several text file and build an inverted index
// Adrian Pirvu 2009

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;


namespace SimplestSearchEngine
{
	public partial class Form1 : Form
	{






		public Form1()
		{
			InitializeComponent();
		}





		// inverted index - a dictionary of file lists
		// no file index, no ranking - we want it as simple is possible
		private Dictionary<string, List<string>> InvertedIndex = new Dictionary<string, List<string>>();






		// index files
		private void buttonIndex_Click(object sender, EventArgs e)
		{

			if (openFileDialog.ShowDialog() != DialogResult.OK)
				return;

			// parse each file and build the inverted index. 
			foreach (string fileName in openFileDialog.FileNames)
			{
				string text = File.ReadAllText(fileName).Replace("\r", " ").Replace("\n", " ");
				string[] terms = text.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

				// parse each term (word) in text file and put it in dictionary
				foreach (string term in terms)
				{
					if (!InvertedIndex.ContainsKey(term))
						InvertedIndex.Add(term, new List<string>());

					if (!InvertedIndex[term].Contains(fileName))
						InvertedIndex[term].Add(fileName);
				}
			}

			// update label
			labelIndex.Text = openFileDialog.FileNames.Length.ToString() + " files indexed";
		}







		// perform seach over the inverted index built earlier
		private void buttonSearch_Click(object sender, EventArgs e)
		{

			// split query in terms
			string query = textBoxQuery.Text;
			string[] terms = query.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);


			// see documents for each term and merge them
			List<string> commonDocuments = null;
			foreach (string term in terms)
			{
				// if one term not in index, end search
				if (!InvertedIndex.ContainsKey(term))
				{
					if (commonDocuments != null)
						commonDocuments.Clear();
					break;
				}

				// find documents containing all terms
				if (commonDocuments == null)
					commonDocuments = InvertedIndex[term];
				else
					commonDocuments = GetCommonDocuments(commonDocuments, InvertedIndex[term]);
			}


			// display results
			if (commonDocuments != null)
			{
				listBoxResults.Items.Clear();
				foreach (string fileName in commonDocuments)
					listBoxResults.Items.Add(fileName);
			}
		}






		// merge two arrays of file indexes
		private List<string> GetCommonDocuments(List<string> documentList, List<string> newDocumentsList)
		{
			List<string> common = new List<string>();
			foreach (string file in documentList)
				if (newDocumentsList.Contains(file))
					common.Add(file);

			return common;
		}








	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
IBM
Romania Romania
adrian.pirvu gmail.com

Comments and Discussions