Click here to Skip to main content
15,897,518 members
Articles / Programming Languages / C#

Lucene.Net - Text Analysis

Rate me:
Please Sign up or sign in to vote.
4.94/5 (41 votes)
6 Jan 2010 · Apache · 12 min read · 192.9K · 9.8K · 169
How to work with Lucene.Net's analysis.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;

namespace AnalyzerViewer
{
    /// <summary>
    /// Base class for views that turn the tokens of a <see cref="TokenStream"/>
    /// into a display string.
    /// </summary>
    public abstract class AnalyzerView
    {
        /// <summary>Gets the display name of the view.</summary>
        public abstract string Name { get; }

        /// <summary>
        /// Reads tokens from <paramref name="tokenStream"/> until <c>Next()</c> returns
        /// <c>null</c> and concatenates the result of <see cref="GetTokenView"/> for each one.
        /// </summary>
        /// <param name="tokenStream">The stream to read tokens from.</param>
        /// <param name="numberOfTokens">Receives the number of tokens that were read.</param>
        /// <returns>The concatenated per-token text, in stream order.</returns>
        public virtual string GetView(TokenStream tokenStream, out int numberOfTokens)
        {
            StringBuilder rendered = new StringBuilder();
            Token current = tokenStream.Next();
            numberOfTokens = 0;

            // Advance the stream in the loop header; a null token marks the end.
            for (; current != null; current = tokenStream.Next())
            {
                numberOfTokens += 1;

                string fragment = GetTokenView(current);
                rendered.Append(fragment);
            }

            return rendered.ToString();
        }

        /// <summary>Produces the display text for a single token.</summary>
        /// <param name="token">The token to render.</param>
        /// <returns>The text appended to the view for this token.</returns>
        protected abstract string GetTokenView(Token token);
    }

    /// <summary>
    /// View that shows each token's term text wrapped in square brackets.
    /// </summary>
    public class TermAnalyzerView : AnalyzerView
    {
        /// <summary>Gets the display name of this view, "Terms".</summary>
        public override string Name
        {
            get
            {
                return "Terms";
            }
        }

        /// <summary>Renders one token as its term text in brackets followed by three spaces.</summary>
        /// <param name="token">The token to render.</param>
        /// <returns>The bracketed term text.</returns>
        protected override string GetTokenView(Token token)
        {
            string term = token.TermText();
            return string.Concat("[", term, "]   ");
        }
    }

    /// <summary>
    /// View that lists each token on its own line together with its start and end offsets.
    /// </summary>
    public class TermWithOffsetsView : AnalyzerView
    {
        /// <summary>Gets the display name of this view, "Terms With Offsets".</summary>
        public override string Name
        {
            get
            {
                return "Terms With Offsets";
            }
        }

        /// <summary>
        /// Renders one token as its term text followed by the start and end offsets,
        /// each left-padded to a width of five, and a CRLF line ending.
        /// </summary>
        /// <param name="token">The token to render.</param>
        /// <returns>A single line of text describing the token.</returns>
        protected override string GetTokenView(Token token)
        {
            StringBuilder line = new StringBuilder();

            line.Append(token.TermText());
            line.Append("   Start: ");
            line.Append(token.StartOffset().ToString().PadLeft(5));
            line.Append("  End: ");
            line.Append(token.EndOffset().ToString().PadLeft(5));
            line.Append("\r\n");

            return line.ToString();
        }
    }

    /// <summary>
    /// View that counts how often each term occurs in a token stream and lists the
    /// terms, ordered by term text, with their counts.
    /// </summary>
    public class TermFrequencies : AnalyzerView
    {
        /// <summary>Gets the display name of this view, "Term Frequencies".</summary>
        public override string Name
        {
            get { return "Term Frequencies"; }
        }

        /// <summary>
        /// Reads tokens from <paramref name="tokenStream"/> until <c>Next()</c> returns
        /// <c>null</c>, tallies the occurrences of each term text, and renders every
        /// distinct term as <c>term [count]</c> followed by three spaces.
        /// </summary>
        /// <param name="tokenStream">The stream to read tokens from.</param>
        /// <param name="numberOfTokens">Receives the total number of tokens read (not the number of distinct terms).</param>
        /// <returns>The distinct terms with their counts, ordered by term text.</returns>
        public override string GetView(TokenStream tokenStream, out int numberOfTokens)
        {
            // The tally is local to the call: a shared field would keep partial counts
            // if the stream threw part-way through, corrupting the next call's output.
            Dictionary<string, int> termCounts = new Dictionary<string, int>();

            Token token = tokenStream.Next();

            numberOfTokens = 0;

            while (token != null)
            {
                numberOfTokens++;

                // Read the term once and use a single lookup; TryGetValue leaves
                // count at 0 when the term has not been seen yet.
                string term = token.TermText();
                int count;
                termCounts.TryGetValue(term, out count);
                termCounts[term] = count + 1;

                token = tokenStream.Next();
            }

            StringBuilder sb = new StringBuilder();

            foreach (var item in termCounts.OrderBy(x => x.Key))
            {
                sb.Append(item.Key + " [" + item.Value + "]   ");
            }

            return sb.ToString();
        }

        /// <summary>
        /// Not used by this view: <see cref="GetView"/> is overridden and never renders
        /// tokens individually.
        /// </summary>
        /// <param name="token">Ignored.</param>
        /// <returns>Never returns.</returns>
        /// <exception cref="NotImplementedException">Always thrown.</exception>
        protected override string GetTokenView(Token token)
        {
            throw new NotImplementedException();
        }
    }



}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Apache License, Version 2.0


Written By
Software Developer
United States
I'm a proud father and a software developer. I'm fascinated by a few particular .Net projects such as Lucene.Net, NHibernate, Quartz.Net, and others. I love learning and studying code to learn how other people solve software problems.

Comments and Discussions