Statistical parsing of English sentences

Richard Northedge

Rate me:

4.95/5 (88 votes)

13 Dec 200617 min read

23.6K

239

Shows how to generate parse trees for English language sentences, using a C# port of OpenNLP, a statistical natural language parsing library.

englishparsing_net2_0_src.zip
- Lithium
  - AssemblyInfo.cs
  - Collections
  - Connection.cs
  - Connector.cs
  - Delegates
    - Delegates.cs
  - doc.xml
  - Entity.cs
  - Enums
    - Enums.cs
  - GraphAbstract.cs
  - Interfaces
    - Interfaces.cs
  - IO
  - Lithium.csproj
  - Lithium.csproj.vspscc
  - LithiumControl.bmp
  - Proxy.cs
  - ShapeBase.cs
  - Shapes
  - UI
    - LithiumControl.cs
    - LithiumControl.resx
  - Visitors
    - DeleteVisitor.cs
    - ExpanderVisitor.cs
- ModelConverter
  - App.ico
  - AssemblyInfo.cs
  - Converter.cs
  - ModelConverter.csproj
  - ModelConverter.csproj.vspscc
- OpenNLP.sln
- OpenNLP
  - AssemblyInfo.cs
  - lgpl.txt
  - OpenNLP.csproj
  - OpenNLP.csproj.vspscc
  - SharpEntropy.dll
  - Tools
- ParseTree
  - App.ico
  - AssemblyInfo.cs
  - MainForm.cs
  - MainForm.resx
  - ParseTree.csproj
  - ParseTree.csproj.vspscc
- ToolsExample
  - App.ico
  - AssemblyInfo.cs
  - MainForm.cs
  - MainForm.resx
  - ToolsExample.csproj
  - ToolsExample.csproj.vspscc
englishparsing_net2_0_bin.zip
- lgpl.txt
- Lithium.dll
- ModelConverter.exe
- OpenNLP Tools.chm
- OpenNLP.dll
- ParseTree.exe
- SharpEntropy.dll
- ToolsExample.exe
englishparsing_bin.zip
- lgpl.txt
- Lithium.dll
- ModelConverter.exe
- OpenNLP Tools.chm
- OpenNLP.dll
- ParseTree.exe
- SharpEntropy.dll
- ToolsExample.exe
englishparsing_src.zip
- AssemblyInfo.cs
- ConnectionCollection.cs
- ConnectorCollection.cs
- ShapeCollection.cs
- Connection.cs
- Connector.cs
- Delegates.cs
- doc.xml
- Entity.cs
- Enums.cs
- GraphAbstract.cs
- Interfaces.cs
- DataType.cs
- EdgeType.cs
- GraphDataAttribute.cs
- GraphDataCollection.cs
- GraphSerializer.cs
- GraphType.cs
- NodeType.cs
- ParentChild.cs
- ParentChildCollection.cs
- Lithium.csproj
- Lithium.csproj.user
- LithiumControl.bmp
- Proxy.cs
- ShapeBase.cs
- Copy of SimpleRectangle.cs
- OvalShape.cs
- SimpleRectangle.cs
- TextLabel.cs
- LithiumControl.cs
- LithiumControl.resx
- DeleteVisitor.cs
- ExpanderVisitor.cs
- App.ico
- AssemblyInfo.cs
- Converter.cs
- ModelConverter.csproj
- ModelConverter.csproj.user
- OpenNLP.sln
- AssemblyInfo.cs
- lgpl.txt
- OpenNLP.csproj
- OpenNLP.csproj.user
- SharpEntropy.dll
- ChunkerEventReader.cs
- DefaultChunkerContextGenerator.cs
- EnglishTreebankChunker.cs
- IChunker.cs
- IChunkerContextGenerator.cs
- MaximumEntropyChunker.cs
- vssver.scc
- DefaultNameContextGenerator.cs
- EnglishNameFinder.cs
- INameContextGenerator.cs
- INameFinder.cs
- MaximumEntropyNameFinder.cs
- NameFinderEventReader.cs
- vssver.scc
- BuildContextGenerator.cs
- CheckContextGenerator.cs
- ChunkContextGenerator.cs
- EnglishHeadRules.cs
- EnglishTreebankParser.cs
- IHeadRules.cs
- IParserChunker.cs
- IParserTagger.cs
- MaximumEntropyParser.cs
- Parse.cs
- ParserEventReader.cs
- vssver.scc
- DefaultPosContextGenerator.cs
- EnglishMaximumEntropyPosTagger.cs
- IPosContextGenerator.cs
- IPosTagger.cs
- MaximumEntropyPosTagger.cs
- PosEventReader.cs
- POSEventStream.cs
- PosLookupList.cs
- PosLookupListWriter.cs
- vssver.scc
- DefaultEndOfSentenceScanner.cs
- EnglishMaximumEntropySentenceDetector.cs
- IEndOfSentenceScanner.cs
- ISentenceDectector.cs
- MaximumEntropySentenceDetector.cs
- SentenceDetectionContextGenerator.cs
- SentenceDetectionEvent.cs
- SentenceDetectionEventReader.cs
- vssver.scc
- EnglishMaximumEntropyTokenizer.cs
- ITokenizer.cs
- MaximumEntropyTokenizer.cs
- TokenContextGenerator.cs
- TokenEventReader.cs
- TokenSpanEventReader.cs
- vssver.scc
- BeamSearch.cs
- Cache.cs
- HashSet.cs
- IBeamSearchContextGenerator.cs
- IHeap.cs
- IntegerPool.cs
- ListHeap.cs
- ObjectIntPair.cs
- Pair.cs
- Sequence.cs
- Set.cs
- SortedSet.cs
- Span.cs
- StringTokenizer.cs
- TreeHeap.cs
- TreeSet.cs
- vssver.scc
- App.ico
- AssemblyInfo.cs
- MainForm.cs
- MainForm.resx
- ParseTree.csproj
- ParseTree.csproj.user
- App.ico
- AssemblyInfo.cs
- MainForm.cs
- MainForm.resx
- ToolsExample.csproj
- ToolsExample.csproj.user

//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//This file is based on the SentenceDetectorME.java source file found in the
//original java implementation of OpenNLP.  That source file contains the following header:

// Copyright (C) 2004 Jason Baldridge, Gann Bierner and Tom Morton
// 
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// 
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

using System;
using System.Collections.Generic;

namespace OpenNLP.Tools.SentenceDetect
{
	/// <summary>
	/// A sentence detector for splitting up raw text into sentences.  A maximum
	/// entropy model is used to evaluate the characters ".", "!", and "?" in a
	/// string to determine if they signify the end of a sentence.
	/// </summary>
	public class MaximumEntropySentenceDetector : ISentenceDetector
	{
		/// <summary>
		/// The maximum entropy model to use to evaluate contexts.
		/// </summary>
		private SharpEntropy.IMaximumEntropyModel mModel;
		
		/// <summary>
		/// The feature context generator.
		/// </summary>
        private SharpEntropy.IContextGenerator<Util.Pair<System.Text.StringBuilder, int>> mContextGenerator;
		
		/// <summary>
		/// The EndOfSentenceScanner to use when scanning for end of
		/// sentence offsets.
		/// </summary>
		private IEndOfSentenceScanner mScanner;
				
		/// <summary>
		/// The list of probabilities associated with each decision
		/// </summary>
		private List<double> mSentenceProbs;
		
		/// <summary>
		/// Constructor which takes a IMaximumEntropyModel and calls the three-arg
		/// constructor with that model, a SentenceDetectionContextGenerator, and the
		/// default end of sentence scanner.
		/// </summary>
		/// <param name="model">
		/// The MaxentModel which this SentenceDetectorME will use to
		/// evaluate end-of-sentence decisions.
		/// </param>
		public MaximumEntropySentenceDetector(SharpEntropy.IMaximumEntropyModel model) : this(model, new SentenceDetectionContextGenerator(DefaultEndOfSentenceScanner.GetEndOfSentenceCharacters()), new DefaultEndOfSentenceScanner())
		{
            mSentenceProbs = new List<double>(50);
		}
		
		/// <summary>
		/// Constructor which takes a IMaximumEntropyModel and a IContextGenerator.
		/// calls the three-arg constructor with a default ed of sentence scanner.
		/// </summary>
		/// <param name="model">
		/// The MaxentModel which this SentenceDetectorME will use to
		/// evaluate end-of-sentence decisions.
		/// </param>
		/// <param name="contextGenerator">
		/// The IContextGenerator object which this MaximumEntropySentenceDetector
		/// will use to turn strings into contexts for the model to
		/// evaluate.
		/// </param>
        public MaximumEntropySentenceDetector(SharpEntropy.IMaximumEntropyModel model, SharpEntropy.IContextGenerator<Util.Pair<System.Text.StringBuilder, int>> contextGenerator)
            : this(model, contextGenerator, new DefaultEndOfSentenceScanner())
		{
		}
		
		/// <summary> 
		/// Creates a new <code>MaximumEntropySentenceDetector</code> instance.
		/// </summary>
		/// <param name="model">
		/// The IMaximumEntropyModel which this MaximumEntropySentenceDetector will use to
		/// evaluate end-of-sentence decisions.
		/// </param>
		/// <param name="contextGenerator">The IContextGenerator object which this MaximumEntropySentenceDetector
		/// will use to turn strings into contexts for the model to
		/// evaluate.
		/// </param>
		/// <param name="scanner">the EndOfSentenceScanner which this MaximumEntropySentenceDetector
		/// will use to locate end of sentence indexes.
		/// </param>
        public MaximumEntropySentenceDetector(SharpEntropy.IMaximumEntropyModel model, SharpEntropy.IContextGenerator<Util.Pair<System.Text.StringBuilder, int>> contextGenerator, IEndOfSentenceScanner scanner)
		{
			mModel = model;
			mContextGenerator = contextGenerator;
			mScanner = scanner;
		}
		
		/// <summary>
		/// Returns the probabilities associated with the most recent
		/// calls to SentenceDetect().
		/// </summary>
		/// <returns>
		/// probability for each sentence returned for the most recent
		/// call to SentenceDetect.  If not applicable an empty array is
		/// returned.
		/// </returns>
		public virtual double[] GetSentenceProbabilities()
		{
            return mSentenceProbs.ToArray();
		}

		/// <summary> 
		/// Detect sentences in a string.
		/// </summary>
		/// <param name="input">
		/// The string to be processed.
		/// </param>
		/// <returns>   
		/// A string array containing individual sentences as elements.
		/// </returns>
		public virtual string[] SentenceDetect(string input)
		{
			int[] startsList = SentencePositionDetect(input);
			if (startsList.Length == 0) 
			{
				return new string[] {input};
			}
			
			bool isLeftover = startsList[startsList.Length - 1] != input.Length;
			string[] sentences = new string[isLeftover ? startsList.Length + 1 : startsList.Length];

			sentences[0] = input.Substring(0, (startsList[0]) - (0));
			for (int currentStart = 1; currentStart < startsList.Length; currentStart++)
			{
				sentences[currentStart] = input.Substring(startsList[currentStart - 1], (startsList[currentStart]) - (startsList[currentStart - 1]));
			}
			
			if (isLeftover) 
			{
				sentences[sentences.Length - 1] = input.Substring(startsList[startsList.Length - 1]);
			}

			return (sentences);
		}
		
		private int GetFirstWhitespace(string input, int position) 
		{
			while (position < input.Length && !char.IsWhiteSpace(input[position]))
			{
				position++;
			}
			return position;
		}

		private int GetFirstNonWhitespace(string input, int position)
		{
			while (position < input.Length && char.IsWhiteSpace(input[position]))
			{
				position++;
			}
			return position;
		}
		
		/// <summary> 
		/// Detect the position of the first words of sentences in a string.
		/// </summary>
		/// <param name="input">
		/// The string to be processed.
		/// </param>
		/// <returns>
		/// A integer array containing the positions of the end index of
		/// every sentence
		/// </returns>
		public virtual int[] SentencePositionDetect(string input)
		{
			double sentenceProbability = 1;
			mSentenceProbs.Clear();
			System.Text.StringBuilder buffer = new System.Text.StringBuilder(input);
			List<int> endersList = mScanner.GetPositions(input);
			List<int> positions = new List<int>(endersList.Count);
			
			for (int currentEnder = 0, enderCount = endersList.Count, index = 0; currentEnder < enderCount; currentEnder++)
			{
				int candidate = endersList[currentEnder];
				int cInt = candidate;
								
				// skip over the leading parts of non-token final delimiters
				int firstWhiteSpace = GetFirstWhitespace(input, cInt + 1);
				if (((currentEnder + 1) < enderCount) && ((endersList[currentEnder + 1]) < firstWhiteSpace)) 
				{
					continue;
				}

                Util.Pair<System.Text.StringBuilder, int> pair = new Util.Pair<System.Text.StringBuilder, int>(buffer, candidate);
				double[] probabilities = mModel.Evaluate(mContextGenerator.GetContext(pair));
				string bestOutcome = mModel.GetBestOutcome(probabilities);
				sentenceProbability *= probabilities[mModel.GetOutcomeIndex(bestOutcome)];
				if (bestOutcome.Equals("T") && IsAcceptableBreak(input, index, cInt))
				{
					if (index != cInt)
					{
						positions.Add(GetFirstNonWhitespace(input, GetFirstWhitespace(input, cInt + 1)));//moIntegerPool.GetInteger(GetFirstNonWhitespace(input, GetFirstWhitespace(input, cInt + 1))));
						mSentenceProbs.Add(probabilities[mModel.GetOutcomeIndex(bestOutcome)]);
					}
					index = cInt + 1;
				}
			}
			
            return positions.ToArray();
		}
		
		/// <summary>
		/// Allows subclasses to check an overzealous (read: poorly
		/// trained) model from flagging obvious non-breaks as breaks based
		/// on some boolean determination of a break's acceptability.
		/// 
		/// <p>The implementation here always returns true, which means
		/// that the IMaximumEntropyModel's outcome is taken as is.</p>
		/// </summary>
		/// <param name="input">
		/// the string in which the break occured. 
		/// </param>
		/// <param name="fromIndex">
		/// the start of the segment currently being evaluated 
		/// </param>
		/// <param name="candidateIndex">
		/// the index of the candidate sentence ending 
		/// </param>
		/// <returns> true if the break is acceptable 
		/// </returns>
		protected internal virtual bool IsAcceptableBreak(string input, int fromIndex, int candidateIndex)
		{
			return true;
		}
		
		public static SharpEntropy.GisModel TrainModel(SharpEntropy.ITrainingEventReader eventReader, int iterations, int cut)
		{
			SharpEntropy.GisTrainer trainer = new SharpEntropy.GisTrainer();
			trainer.TrainModel(eventReader, iterations, cut);
			return new SharpEntropy.GisModel(trainer);
		}
		
		/// <summary> Use this training method if you wish to supply an end of
		/// sentence scanner which provides a different set of ending chars
		/// other than the default ones.  They are "\\.|!|\\?|\\\"|\\)".
		/// </summary>
		public static SharpEntropy.GisModel TrainModel(string inFile, int iterations, int cut, IEndOfSentenceScanner scanner)
		{
			SharpEntropy.ITrainingEventReader eventReader;
			SharpEntropy.ITrainingDataReader<string> dataReader;
			System.IO.StreamReader streamReader;
			
			using (streamReader = new System.IO.StreamReader(inFile, System.Text.Encoding.UTF7)) 
			{
				dataReader = new SharpEntropy.PlainTextByLineDataReader(streamReader);
				eventReader = new SentenceDetectionEventReader(dataReader, scanner);

				SharpEntropy.GisTrainer trainer = new SharpEntropy.GisTrainer();
				trainer.TrainModel(eventReader, iterations, cut);
				return new SharpEntropy.GisModel(trainer);
			}
		}
		
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here

Written By

Richard Northedge

Web Developer

United Kingdom

Richard Northedge is a senior developer with a UK Microsoft Gold Partner company. He has a postgraduate degree in English Literature, has been programming professionally since 1998 and has been an MCSD since 2000.

Statistical parsing of English sentences

License

Comments and Discussions