Statistical parsing of English sentences

Richard Northedge

Rate me:

4.95/5 (88 votes)

13 Dec 2006, 17 min read

23.6K

239

Shows how to generate parse trees for English language sentences, using a C# port of OpenNLP, a statistical natural language parsing library.

englishparsing_net2_0_src.zip
- Lithium
  - AssemblyInfo.cs
  - Collections
  - Connection.cs
  - Connector.cs
  - Delegates
    - Delegates.cs
  - doc.xml
  - Entity.cs
  - Enums
    - Enums.cs
  - GraphAbstract.cs
  - Interfaces
    - Interfaces.cs
  - IO
  - Lithium.csproj
  - Lithium.csproj.vspscc
  - LithiumControl.bmp
  - Proxy.cs
  - ShapeBase.cs
  - Shapes
  - UI
    - LithiumControl.cs
    - LithiumControl.resx
  - Visitors
    - DeleteVisitor.cs
    - ExpanderVisitor.cs
- ModelConverter
  - App.ico
  - AssemblyInfo.cs
  - Converter.cs
  - ModelConverter.csproj
  - ModelConverter.csproj.vspscc
- OpenNLP.sln
- OpenNLP
  - AssemblyInfo.cs
  - lgpl.txt
  - OpenNLP.csproj
  - OpenNLP.csproj.vspscc
  - SharpEntropy.dll
  - Tools
- ParseTree
  - App.ico
  - AssemblyInfo.cs
  - MainForm.cs
  - MainForm.resx
  - ParseTree.csproj
  - ParseTree.csproj.vspscc
- ToolsExample
  - App.ico
  - AssemblyInfo.cs
  - MainForm.cs
  - MainForm.resx
  - ToolsExample.csproj
  - ToolsExample.csproj.vspscc
englishparsing_net2_0_bin.zip
- lgpl.txt
- Lithium.dll
- ModelConverter.exe
- OpenNLP Tools.chm
- OpenNLP.dll
- ParseTree.exe
- SharpEntropy.dll
- ToolsExample.exe
englishparsing_bin.zip
- lgpl.txt
- Lithium.dll
- ModelConverter.exe
- OpenNLP Tools.chm
- OpenNLP.dll
- ParseTree.exe
- SharpEntropy.dll
- ToolsExample.exe
englishparsing_src.zip
- AssemblyInfo.cs
- ConnectionCollection.cs
- ConnectorCollection.cs
- ShapeCollection.cs
- Connection.cs
- Connector.cs
- Delegates.cs
- doc.xml
- Entity.cs
- Enums.cs
- GraphAbstract.cs
- Interfaces.cs
- DataType.cs
- EdgeType.cs
- GraphDataAttribute.cs
- GraphDataCollection.cs
- GraphSerializer.cs
- GraphType.cs
- NodeType.cs
- ParentChild.cs
- ParentChildCollection.cs
- Lithium.csproj
- Lithium.csproj.user
- LithiumControl.bmp
- Proxy.cs
- ShapeBase.cs
- Copy of SimpleRectangle.cs
- OvalShape.cs
- SimpleRectangle.cs
- TextLabel.cs
- LithiumControl.cs
- LithiumControl.resx
- DeleteVisitor.cs
- ExpanderVisitor.cs
- App.ico
- AssemblyInfo.cs
- Converter.cs
- ModelConverter.csproj
- ModelConverter.csproj.user
- OpenNLP.sln
- AssemblyInfo.cs
- lgpl.txt
- OpenNLP.csproj
- OpenNLP.csproj.user
- SharpEntropy.dll
- ChunkerEventReader.cs
- DefaultChunkerContextGenerator.cs
- EnglishTreebankChunker.cs
- IChunker.cs
- IChunkerContextGenerator.cs
- MaximumEntropyChunker.cs
- vssver.scc
- DefaultNameContextGenerator.cs
- EnglishNameFinder.cs
- INameContextGenerator.cs
- INameFinder.cs
- MaximumEntropyNameFinder.cs
- NameFinderEventReader.cs
- vssver.scc
- BuildContextGenerator.cs
- CheckContextGenerator.cs
- ChunkContextGenerator.cs
- EnglishHeadRules.cs
- EnglishTreebankParser.cs
- IHeadRules.cs
- IParserChunker.cs
- IParserTagger.cs
- MaximumEntropyParser.cs
- Parse.cs
- ParserEventReader.cs
- vssver.scc
- DefaultPosContextGenerator.cs
- EnglishMaximumEntropyPosTagger.cs
- IPosContextGenerator.cs
- IPosTagger.cs
- MaximumEntropyPosTagger.cs
- PosEventReader.cs
- POSEventStream.cs
- PosLookupList.cs
- PosLookupListWriter.cs
- vssver.scc
- DefaultEndOfSentenceScanner.cs
- EnglishMaximumEntropySentenceDetector.cs
- IEndOfSentenceScanner.cs
- ISentenceDectector.cs
- MaximumEntropySentenceDetector.cs
- SentenceDetectionContextGenerator.cs
- SentenceDetectionEvent.cs
- SentenceDetectionEventReader.cs
- vssver.scc
- EnglishMaximumEntropyTokenizer.cs
- ITokenizer.cs
- MaximumEntropyTokenizer.cs
- TokenContextGenerator.cs
- TokenEventReader.cs
- TokenSpanEventReader.cs
- vssver.scc
- BeamSearch.cs
- Cache.cs
- HashSet.cs
- IBeamSearchContextGenerator.cs
- IHeap.cs
- IntegerPool.cs
- ListHeap.cs
- ObjectIntPair.cs
- Pair.cs
- Sequence.cs
- Set.cs
- SortedSet.cs
- Span.cs
- StringTokenizer.cs
- TreeHeap.cs
- TreeSet.cs
- vssver.scc
- App.ico
- AssemblyInfo.cs
- MainForm.cs
- MainForm.resx
- ParseTree.csproj
- ParseTree.csproj.user
- App.ico
- AssemblyInfo.cs
- MainForm.cs
- MainForm.resx
- ToolsExample.csproj
- ToolsExample.csproj.user

//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//This file is based on the ChunkerME.java source file found in the
//original Java implementation of OpenNLP.  That source file contains the following header:

//Copyright (C) 2003 Thomas Morton
// 
//This library is free software; you can redistribute it and/or
//modify it under the terms of the GNU Lesser General Public
//License as published by the Free Software Foundation; either
//version 2.1 of the License, or (at your option) any later version.
// 
//This library is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Lesser General Public License for more details.
// 
//You should have received a copy of the GNU Lesser General Public
//License along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
using System;
using System.Collections;

namespace OpenNLP.Tools.Chunker
{
	/// <summary>
	/// This class represents a maximum-entropy-based chunker.  Such a chunker can be used to
	/// find flat structures based on sequence inputs such as noun phrases or named entities.
	/// </summary>
	/// <remarks>
	/// Each call to one of the <c>Chunk</c> overloads stores the decoded sequence on this
	/// instance; the <c>GetProbabilities</c> overloads report on that stored sequence.
	/// </remarks>
	public class MaximumEntropyChunker : IChunker
	{
		/// <summary>
		/// Beam size used by the constructors that do not take an explicit beam size.
		/// </summary>
		private const int DefaultBeamSize = 10;

		/// <summary>
		/// Number of training iterations used by <c>Train</c> when none is specified.
		/// </summary>
		private const int DefaultTrainingIterations = 100;

		/// <summary>
		/// Predicate cutoff used by <c>Train</c> when none is specified.
		/// </summary>
		private const int DefaultTrainingCutoff = 5;

		private readonly Util.BeamSearch mBeam;
		private readonly SharpEntropy.IMaximumEntropyModel mModel;

		// Result of the most recent Chunk call; null until a Chunk overload has completed.
		private Util.Sequence mBestSequence;

		/// <summary>
		/// The beam used to search for sequences of chunk tag assignments.
		/// </summary>
		protected internal Util.BeamSearch Beam
		{
			get
			{
				return mBeam;
			}
		}

		/// <summary>
		/// The model used to assign chunk tags to a sequence of tokens.
		/// </summary>
		protected internal SharpEntropy.IMaximumEntropyModel Model
		{
			get
			{
				return mModel;
			}
		}

		/// <summary>
		/// Creates a chunker using the specified model, the default context generator
		/// and the default beam size.
		/// </summary>
		/// <param name="model">
		/// The maximum entropy model for this chunker.
		/// </param>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="model"/> is null.
		/// </exception>
		public MaximumEntropyChunker(SharpEntropy.IMaximumEntropyModel model):this(model, new DefaultChunkerContextGenerator(), DefaultBeamSize)
		{
		}
		
		/// <summary>
		/// Creates a chunker using the specified model and context generator, with the
		/// default beam size.
		/// </summary>
		/// <param name="model">
		/// The maximum entropy model for this chunker.
		/// </param>
		/// <param name="contextGenerator">
		/// The context generator to be used by the specified model.
		/// </param>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="model"/> is null.
		/// </exception>
		public MaximumEntropyChunker(SharpEntropy.IMaximumEntropyModel model, IChunkerContextGenerator contextGenerator):this(model, contextGenerator, DefaultBeamSize)
		{
		}
		
		/// <summary>
		/// Creates a chunker using the specified model and context generator and decodes the
		/// model using a beam search of the specified size.
		/// </summary>
		/// <param name="model">
		/// The maximum entropy model for this chunker.
		/// </param>
		/// <param name="contextGenerator">
		/// The context generator to be used by the specified model.
		/// </param>
		/// <param name="beamSize">
		/// The size of the beam that should be used when decoding sequences.
		/// </param>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="model"/> is null.
		/// </exception>
		public MaximumEntropyChunker(SharpEntropy.IMaximumEntropyModel model, IChunkerContextGenerator contextGenerator, int beamSize)
		{
			// Fail at construction time rather than with a NullReferenceException later
			// (AllTags, for example, reads directly from the model).
			if (model == null)
			{
				throw new ArgumentNullException("model");
			}

			mModel = model;
			mBeam = new ChunkBeamSearch(this, beamSize, contextGenerator, model);
		}
		
		/// <summary>
		/// Performs a chunking operation.
		/// </summary>
		/// <param name="tokens">
		/// ArrayList of tokens
		/// </param>
		/// <param name="tags">
		/// ArrayList of tags corresponding to the tokens; every element must be a string.
		/// </param>
		/// <returns>
		/// ArrayList of results, containing a value for each token, indicating the chunk that that token belongs to.
		/// </returns>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="tags"/> is null.
		/// </exception>
		public virtual ArrayList Chunk(ArrayList tokens, ArrayList tags)
		{
			if (tags == null)
			{
				throw new ArgumentNullException("tags");
			}

			// The tags travel to the beam search as a single additional-context entry.
			string[] tagArray = (string[]) tags.ToArray(typeof(string));
			mBestSequence = mBeam.BestSequence(tokens, new object[] { tagArray });
			return new ArrayList(mBestSequence.Outcomes);
		}
		
		/// <summary>
		/// Performs a chunking operation.
		/// </summary>
		/// <param name="tokens">
		/// Object array of tokens
		/// </param>
		/// <param name="tags">
		/// String array of POS tags corresponding to the tokens in the object array
		/// </param>
		/// <returns>
		/// String array containing a value for each token, indicating the chunk that that token belongs to.
		/// </returns>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="tokens"/> is null.
		/// </exception>
		public virtual string[] Chunk(object[] tokens, string[] tags)
		{
			// ArrayList's constructor would reject null anyway, but under the
			// parameter name "c"; report the caller's own parameter instead.
			if (tokens == null)
			{
				throw new ArgumentNullException("tokens");
			}

			mBestSequence = mBeam.BestSequence(new ArrayList(tokens), new object[] { tags });
			return mBestSequence.Outcomes.ToArray();
		}
		
		/// <summary>
		/// Gets a list of all the possible chunking tags.
		/// </summary>
		/// <returns>
		/// String array, each entry containing a chunking tag.
		/// </returns>
		public virtual string[] AllTags()
		{
			int outcomeCount = mModel.OutcomeCount;
			string[] tags = new string[outcomeCount];
			for (int currentTag = 0; currentTag < outcomeCount; currentTag++)
			{
				tags[currentTag] = mModel.GetOutcomeName(currentTag);
			}
			return tags;
		}

		/// <summary>
		/// This method determines whether the outcome is valid for the preceding sequence.  
		/// This can be used to implement constraints on what sequences are valid.  
		/// This base implementation accepts every outcome.
		/// </summary>
		/// <param name="outcome">
		/// The outcome.
		/// </param>
		/// <param name="sequence">
		/// The preceding sequence of outcomes assignments. 
		/// </param>
		/// <returns>
		/// true if the outcome is valid for the sequence, false otherwise.
		/// </returns>
		protected internal virtual bool ValidOutcome(string outcome, Util.Sequence sequence)
		{
			return true;
		}
		
		/// <summary>
		/// This method determines whether the outcome is valid for the preceding sequence.  
		/// This can be used to implement constraints on what sequences are valid.  
		/// This base implementation accepts every outcome.
		/// </summary>
		/// <param name="outcome">
		/// The outcome.
		/// </param>
		/// <param name="sequence">
		/// The preceding sequence of outcomes assignments. 
		/// </param>
		/// <returns>
		/// true if the outcome is valid for the sequence, false otherwise.
		/// </returns>
		protected internal virtual bool ValidOutcome(string outcome, string[] sequence) 
		{
			return true;
		}

		/// <summary>
		/// This class implements the abstract BeamSearch class to allow for the chunker to use
		/// the common beam search code.  Sequence validation is delegated back to the owning
		/// chunker's ValidOutcome methods.
		/// </summary>
		private class ChunkBeamSearch : Util.BeamSearch
		{
			private readonly MaximumEntropyChunker mMaxentChunker;
			
			public ChunkBeamSearch(MaximumEntropyChunker maxentChunker, int size, IChunkerContextGenerator contextGenerator, SharpEntropy.IMaximumEntropyModel model):base(size, contextGenerator, model)
			{
				mMaxentChunker = maxentChunker;
			}
			
			protected internal override bool ValidSequence(int index, ArrayList inputSequence, Util.Sequence outcomesSequence, string outcome)
			{
				return mMaxentChunker.ValidOutcome(outcome, outcomesSequence);
			}
			
			protected internal override bool ValidSequence(int index, object[] inputSequence, string[] outcomesSequence, string outcome) 
			{
				return mMaxentChunker.ValidOutcome(outcome, outcomesSequence);
			}
		}

		/// <summary>
		/// Returns the sequence stored by the last Chunk call, or throws if there is none yet.
		/// </summary>
		private Util.Sequence RequireBestSequence()
		{
			if (mBestSequence == null)
			{
				throw new InvalidOperationException("Chunk must be called before probabilities can be retrieved.");
			}
			return mBestSequence;
		}
		
		/// <summary>
		/// Populates the specified array with the probabilities of the last decoded sequence.  The
		/// sequence was determined based on the previous call to <c>Chunk</c>.  The 
		/// specified array should be at least as large as the number of tokens in the previous call to <c>Chunk</c>.
		/// </summary>
		/// <param name="probabilities">
		/// An array used to hold the probabilities of the last decoded sequence.
		/// </param>
		/// <exception cref="InvalidOperationException">
		/// No <c>Chunk</c> call has completed on this instance yet.
		/// </exception>
		public virtual void GetProbabilities(double[] probabilities)
		{
			RequireBestSequence().GetProbabilities(probabilities);
		}
		
		/// <summary>
		/// Returns an array with the probabilities of the last decoded sequence.  The
		/// sequence was determined based on the previous call to <c>Chunk</c>.
		/// </summary>
		/// <returns>
		/// An array with the same number of probabilities as tokens were sent to <c>Chunk</c>
		/// when it was last called.   
		/// </returns>
		/// <exception cref="InvalidOperationException">
		/// No <c>Chunk</c> call has completed on this instance yet.
		/// </exception>
		public virtual double[] GetProbabilities()
		{
			return RequireBestSequence().GetProbabilities();
		}
		
		/// <summary>
		/// Trains the chunker using the default number of iterations (100) and the default
		/// cutoff (5).  Training file should be one word per line where each line consists of a
		/// space-delimited triple of "word pos outcome".  Sentence breaks are indicated by blank lines.
		/// </summary>
		/// <param name="eventReader">
		/// The chunker event reader.
		/// </param>
		/// <returns>
		/// Trained model.
		/// </returns>
		public static SharpEntropy.GisModel Train(SharpEntropy.ITrainingEventReader eventReader)
		{
			return Train(eventReader, DefaultTrainingIterations, DefaultTrainingCutoff);
		}

		/// <summary>
		/// Trains the chunker.  Training file should be one word per line where each line consists of a
		/// space-delimited triple of "word pos outcome".  Sentence breaks are indicated by blank lines.
		/// </summary>
		/// <param name="eventReader">
		/// The chunker event reader.
		/// </param>
		/// <param name="iterations">
		/// The number of iterations to perform.
		/// </param>
		/// <param name="cutoff">
		/// The number of times a predicate must be seen in order
		/// to be relevant for training.
		/// </param>
		/// <returns>
		/// Trained model.
		/// </returns>
		public static SharpEntropy.GisModel Train(SharpEntropy.ITrainingEventReader eventReader, int iterations, int cutoff)
		{
			SharpEntropy.GisTrainer trainer = new SharpEntropy.GisTrainer();
			trainer.TrainModel(iterations, new SharpEntropy.TwoPassDataIndexer(eventReader, cutoff));
			return new SharpEntropy.GisModel(trainer);
		}
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here

Written By

Richard Northedge

Web Developer

United Kingdom

Richard Northedge is a senior developer with a UK Microsoft Gold Partner company. He has a postgraduate degree in English Literature, has been programming professionally since 1998 and has been an MCSD since 2000.

Statistical parsing of English sentences

License

Comments and Discussions