Click here to Skip to main content
15,885,952 members
Articles / Web Development / ASP.NET

DotLucene: Full-Text Search for Your Intranet or Website using 37 Lines of Code

Rate me:
Please Sign up or sign in to vote.
4.81/5 (65 votes)
6 Nov 2012 | Apache | 3 min read | 383K | 6.1K | 302
An introduction to Lucene.Net, the open source full-text search engine.
/*
 * Copyright 2012 dotlucene.net
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System.IO;
using System.Text.RegularExpressions;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;


namespace Indexer
{
	/// <summary>
	/// Builds a Lucene.Net full-text index from HTML files on disk.
	/// Each file becomes one document with three stored fields:
	/// <c>text</c> (tag-stripped body, analyzed), <c>path</c> (path relative to the
	/// indexed root, not analyzed) and <c>title</c> (contents of the title element, analyzed).
	/// Call <see cref="Close"/> when finished to optimize the index and release the writer.
	/// </summary>
	public class IntranetIndexer
	{
		/// <summary>Matches anything that looks like a markup tag.</summary>
		private static readonly Regex TagRegex = new Regex("<[^>]*>");

		/// <summary>
		/// Captures the contents of the first title element. Case-insensitive, tolerates
		/// attributes on the opening tag, and lets the title span several lines.
		/// </summary>
		private static readonly Regex TitleRegex = new Regex(
			@"<title\b[^>]*>(.*?)</title>",
			RegexOptions.IgnoreCase | RegexOptions.Singleline);

		/// <summary>Matches a run of whitespace (used to flatten multi-line titles).</summary>
		private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

		private readonly IndexWriter writer;
		private string docRootDirectory;
		private string pattern;

		/// <summary>
		/// Creates a new index in <c>directory</c>. Overwrites the existing index in that directory.
		/// </summary>
		/// <param name="directory">Path to index (will be created if not existing).</param>
		/// <exception cref="System.ArgumentNullException"><c>directory</c> is null.</exception>
		public IntranetIndexer(string directory)
		{
			if (directory == null)
			{
				throw new System.ArgumentNullException("directory");
			}

			// The third argument (create = true) is what makes this constructor start a fresh index.
			writer = new IndexWriter(FSDirectory.Open(directory), new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.LIMITED);
			writer.UseCompoundFile = true;
		}

		/// <summary>
		/// Add HTML files from <c>directory</c> and its subdirectories that match <c>pattern</c>.
		/// The directory also becomes the root against which stored <c>path</c> values are made relative.
		/// </summary>
		/// <param name="directory">Directory with the HTML files.</param>
		/// <param name="pattern">Search pattern, e.g. <c>"*.html"</c></param>
		/// <exception cref="System.ArgumentNullException"><c>directory</c> or <c>pattern</c> is null.</exception>
		public void AddDirectory(DirectoryInfo directory, string pattern)
		{
			if (directory == null)
			{
				throw new System.ArgumentNullException("directory");
			}
			if (pattern == null)
			{
				throw new System.ArgumentNullException("pattern");
			}

			this.docRootDirectory = directory.FullName;
			this.pattern = pattern;

			addSubDirectory(directory);
		}

		/// <summary>
		/// Indexes the matching files of <c>directory</c>, then recurses into every subdirectory.
		/// </summary>
		/// <param name="directory">Directory to walk.</param>
		private void addSubDirectory(DirectoryInfo directory)
		{
			foreach (FileInfo fi in directory.GetFiles(pattern))
			{
				AddHtmlDocument(fi.FullName);
			}
			foreach (DirectoryInfo di in directory.GetDirectories())
			{
				addSubDirectory(di);
			}
		}

		/// <summary>
		/// Loads, parses and indexes an HTML file.
		/// </summary>
		/// <param name="path">
		/// Full path of the HTML file. When the file lies under the directory last passed to
		/// <see cref="AddDirectory"/>, the stored <c>path</c> field is relative to that directory;
		/// otherwise the path is stored unchanged.
		/// </param>
		/// <exception cref="System.ArgumentNullException"><c>path</c> is null.</exception>
		public void AddHtmlDocument(string path)
		{
			if (path == null)
			{
				throw new System.ArgumentNullException("path");
			}

			string html;
			using (StreamReader sr = new StreamReader(path, System.Text.Encoding.Default))
			{
				html = sr.ReadToEnd();
			}

			Document doc = new Document();
			doc.Add(new Field("text", ParseHtml(html), Field.Store.YES, Field.Index.ANALYZED));
			doc.Add(new Field("path", GetRelativePath(path), Field.Store.YES, Field.Index.NOT_ANALYZED));
			doc.Add(new Field("title", GetTitle(html), Field.Store.YES, Field.Index.ANALYZED));

			writer.AddDocument(doc);
		}

		/// <summary>
		/// Strips the document root from <c>path</c>.
		/// </summary>
		/// <param name="path">Full path of a file.</param>
		/// <returns>
		/// The part of <c>path</c> below the document root, without a leading separator.
		/// Returns <c>path</c> unchanged when no root has been set yet (the method was reached
		/// without a prior <see cref="AddDirectory"/> call) or when the file is not below the root.
		/// </returns>
		private string GetRelativePath(string path)
		{
			string root = this.docRootDirectory;
			if (string.IsNullOrEmpty(root)
				|| path.Length <= root.Length
				|| !path.StartsWith(root, System.StringComparison.Ordinal))
			{
				return path;
			}

			// Require a separator at the boundary so that root "C:\docs" does not
			// claim files under "C:\docs2".
			bool boundaryIsSeparator = IsDirectorySeparator(root[root.Length - 1])
				|| IsDirectorySeparator(path[root.Length]);
			if (!boundaryIsSeparator)
			{
				return path;
			}

			return path.Substring(root.Length).TrimStart(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);
		}

		/// <summary>Tells whether <c>c</c> is one of the platform's directory separator characters.</summary>
		private static bool IsDirectorySeparator(char c)
		{
			return c == Path.DirectorySeparatorChar || c == Path.AltDirectorySeparatorChar;
		}

		/// <summary>
		/// Very simple, inefficient, and memory consuming HTML parser. Take a look at Demo/HtmlParser in DotLucene package for a better HTML parser.
		/// </summary>
		/// <param name="html">HTML document</param>
		/// <returns>Plain text.</returns>
		private static string ParseHtml(string html)
		{
			// Tags are replaced by a space, not removed outright: otherwise the last word of
			// one element and the first word of the next would fuse into a single token
			// ("<td>foo</td><td>bar</td>" must not be indexed as "foobar").
			string temp = TagRegex.Replace(html, " ");
			return temp.Replace("&nbsp;", " ");
		}

		/// <summary>
		/// Finds the title of an HTML file. The title may span several lines; runs of
		/// whitespace inside it are collapsed to a single space.
		/// </summary>
		/// <param name="html">HTML document.</param>
		/// <returns>Title string, or <c>"(unknown)"</c> when the document has no title element.</returns>
		private static string GetTitle(string html)
		{
			Match m = TitleRegex.Match(html);

			// Test Success, not Groups.Count: the group count reflects the pattern and is
			// the same whether or not the match succeeded.
			if (!m.Success)
			{
				return "(unknown)";
			}

			return WhitespaceRegex.Replace(m.Groups[1].Value, " ").Trim();
		}

		/// <summary>
		/// Optimizes and saves the index, then releases the underlying writer.
		/// </summary>
		public void Close()
		{
			try
			{
				writer.Optimize();
			}
			finally
			{
				// Dispose even if optimization fails, so the writer's resources are released.
				writer.Dispose();
			}
		}
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Apache License, Version 2.0


Written By
Czech Republic
My open-source event calendar/scheduling web UI components:

DayPilot for JavaScript, Angular, React and Vue

Comments and Discussions