Click here to Skip to main content
Click here to Skip to main content
Add your own
alternative version

Tagged as

Go to top

Anatomy of a relevant search engine (part 1)

, 4 Jun 2009
Information retrieval, semantic search relevance and ranking. About anatomy of a search engine. The simplest search engine source code.
SimplestSearchEngine.zip
SimplestSearchEngine
Properties
Settings.settings
simplest_search_engine.bmp
SimplestSearchEngine.exe

// simplest search engine - parse several text file and build an inverted index
// Adrian Pirvu 2009

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;


namespace SimplestSearchEngine
{
	public partial class Form1 : Form
	{






		public Form1()
		{
			InitializeComponent();
		}





		// inverted index - a dictionary of file lists
		// no file index, no ranking - we want it as simple is possible
		private Dictionary<string, List<string>> InvertedIndex = new Dictionary<string, List<string>>();






		// index files
		private void buttonIndex_Click(object sender, EventArgs e)
		{

			if (openFileDialog.ShowDialog() != DialogResult.OK)
				return;

			// parse each file and build the inverted index. 
			foreach (string fileName in openFileDialog.FileNames)
			{
				string text = File.ReadAllText(fileName).Replace("\r", " ").Replace("\n", " ");
				string[] terms = text.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

				// parse each term (word) in text file and put it in dictionary
				foreach (string term in terms)
				{
					if (!InvertedIndex.ContainsKey(term))
						InvertedIndex.Add(term, new List<string>());

					if (!InvertedIndex[term].Contains(fileName))
						InvertedIndex[term].Add(fileName);
				}
			}

			// update label
			labelIndex.Text = openFileDialog.FileNames.Length.ToString() + " files indexed";
		}







		// perform seach over the inverted index built earlier
		private void buttonSearch_Click(object sender, EventArgs e)
		{

			// split query in terms
			string query = textBoxQuery.Text;
			string[] terms = query.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);


			// see documents for each term and merge them
			List<string> commonDocuments = null;
			foreach (string term in terms)
			{
				// if one term not in index, end search
				if (!InvertedIndex.ContainsKey(term))
				{
					if (commonDocuments != null)
						commonDocuments.Clear();
					break;
				}

				// find documents containing all terms
				if (commonDocuments == null)
					commonDocuments = InvertedIndex[term];
				else
					commonDocuments = GetCommonDocuments(commonDocuments, InvertedIndex[term]);
			}


			// display results
			if (commonDocuments != null)
			{
				listBoxResults.Items.Clear();
				foreach (string fileName in commonDocuments)
					listBoxResults.Items.Add(fileName);
			}
		}






		// merge two arrays of file indexes
		private List<string> GetCommonDocuments(List<string> documentList, List<string> newDocumentsList)
		{
			List<string> common = new List<string>();
			foreach (string file in documentList)
				if (newDocumentsList.Contains(file))
					common.Add(file);

			return common;
		}








	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Share

About the Author

Adrian Pirvu

Romania Romania
No Biography provided

| Advertise | Privacy | Mobile
Web03 | 2.8.140926.1 | Last Updated 4 Jun 2009
Article Copyright 2009 by Adrian Pirvu
Everything else Copyright © CodeProject, 1999-2014
Terms of Service
Layout: fixed | fluid