Click here to Skip to main content
Click here to Skip to main content
Add your own
alternative version

Detect a written text's language

Carsten Zeumer, 21 Oct 2009
An article on how to detect the language of a written text.
using System;
using System.Collections;
using System.Runtime.Serialization;
namespace DialogueMaster.Babel
{
	/// <summary>
	/// Holds the statistics (occurrence count, table position and rank) for a single token.
	/// </summary>
	/// <remarks>
	/// The hash code is derived from the token text only, while <see cref="CompareTo"/>
	/// orders instances by occurrence count (descending), then token length, then token text.
	/// </remarks>
	[Serializable]
	public sealed partial class TokenStats : IComparable, ISerializable, ITokenStats
	{

		#region Fields (4)

		private string m_Token;
		private int m_Occurences = 1;
		private int m_Position = 0;
		private int m_Rank = 0;

		#endregion Fields

		#region Constructors (2)

		/// <summary>
		/// Creates a new TokenStats for the given token with an initial number of occurrences.
		/// </summary>
		/// <param name="token">the token</param>
		/// <param name="occurences">initial number of occurrences</param>
		public TokenStats(string token, int occurences)
		{
			this.m_Token = token;
			this.m_Occurences = occurences;
		}

		/// <summary>
		/// Creates a new TokenStats for the given token with initially one occurrence.
		/// </summary>
		/// <param name="token">the token</param>
		public TokenStats(string token)
			: this(token, 1)
		{
		}

		#endregion Constructors

		#region Properties (4)

		/// <summary>
		/// Number of occurrences (within the test data).
		/// </summary>
		public int Occurences
		{
			get { return this.m_Occurences; }
			set { this.m_Occurences = value; }
		}

		/// <summary>
		/// Position in the statistics table (unique among all TokenStats in one table).
		/// </summary>
		public int Position
		{
			get { return this.m_Position; }
			set { this.m_Position = value; }
		}

		/// <summary>
		/// Rank within the table (multiple tokens with identical occurrences share one rank).
		/// </summary>
		public int Rank
		{
			get { return this.m_Rank; }
			set { this.m_Rank = value; }
		}

		/// <summary>
		/// The token these statistics belong to (read-only).
		/// </summary>
		public string Token
		{
			get { return this.m_Token; }
		}

		#endregion Properties

		#region Methods (3)

		// Public Methods (3)

		/// <summary>
		/// Increases the number of occurrences by one.
		/// </summary>
		public void AddOccurence()
		{
			this.m_Occurences++;
		}

		/// <summary>
		/// The hash code of a TokenStats is the hash code of its token.
		/// </summary>
		/// <returns>The token's hash code, or 0 when the token is null.</returns>
		public override int GetHashCode()
		{
			// The constructors accept a null token, so do not dereference it blindly.
			if (this.m_Token == null)
			{
				return 0;
			}
			return this.m_Token.GetHashCode();
		}

		/// <summary>
		/// Formats the statistics as "token:rank:occurrences".
		/// </summary>
		/// <returns>The token, rank and occurrence count separated by colons.</returns>
		public override string ToString()
		{
			return this.m_Token + ":" + this.m_Rank.ToString() + ":" + this.m_Occurences.ToString();
		}

		#endregion Methods

		#region IComparable Members

		/// <summary>
		/// Compares by occurrence count rather than by token name: higher counts sort first.
		/// Ties are broken by token length (shorter first) and then by the token text.
		/// </summary>
		/// <param name="obj">the TokenStats to compare with, or null</param>
		/// <returns>
		/// A negative value if this instance sorts before <paramref name="obj"/>, zero if both
		/// sort equally, a positive value if this instance sorts after it (always the case for null).
		/// </returns>
		/// <exception cref="ArgumentException"><paramref name="obj"/> is not a TokenStats.</exception>
		public int CompareTo(object obj)
		{
			// IComparable contract: every instance is greater than null.
			if (obj == null)
			{
				return 1;
			}

			TokenStats other = obj as TokenStats;
			if (other == null)
			{
				// Returning 0 here would claim equality with unrelated objects and
				// make sort results inconsistent.
				throw new ArgumentException("Object must be of type TokenStats.", "obj");
			}

			// Operands are swapped on purpose: more occurrences sort first.
			int result = other.m_Occurences.CompareTo(this.m_Occurences);
			if (result != 0)
			{
				return result;
			}

			// Same number of occurrences: shorter tokens first (a null token counts as length 0).
			int thisLength = (this.m_Token == null) ? 0 : this.m_Token.Length;
			int otherLength = (other.m_Token == null) ? 0 : other.m_Token.Length;
			result = thisLength.CompareTo(otherLength);
			if (result != 0)
			{
				return result;
			}

			// Same length: alphabetical order. string.Compare tolerates null operands.
			// NOTE(review): this is culture-sensitive, exactly like the string.CompareTo call
			// it replaces, so the order can differ between cultures - confirm whether an
			// ordinal comparison is intended.
			return string.Compare(this.m_Token, other.m_Token, StringComparison.CurrentCulture);
		}

		#endregion

		#region ISerializable Members

		/// <summary>
		/// Deserialization constructor; restores the values written by <see cref="GetObjectData"/>.
		/// </summary>
		/// <param name="info">holds the serialized values</param>
		/// <param name="context">serialization context (not used)</param>
		private TokenStats(SerializationInfo info, StreamingContext context)
		{
			this.m_Token = info.GetString("Token");
			this.m_Occurences = info.GetInt32("Occurences");
			this.m_Rank = info.GetInt32("Rank");
			this.m_Position = info.GetInt32("Position");
		}

		/// <summary>
		/// Writes token, occurrences, rank and position into <paramref name="info"/>.
		/// </summary>
		/// <param name="info">receives the serialized values</param>
		/// <param name="context">serialization context (not used)</param>
		/// <exception cref="ArgumentNullException"><paramref name="info"/> is null.</exception>
		public void GetObjectData(SerializationInfo info, StreamingContext context)
		{
			if (info == null)
			{
				throw new ArgumentNullException("info");
			}

			// The key names (including their spelling) are part of the serialized format
			// and must match the ones read by the deserialization constructor.
			info.AddValue("Token", this.m_Token);
			info.AddValue("Occurences", this.m_Occurences);
			info.AddValue("Rank", this.m_Rank);
			info.AddValue("Position", this.m_Position);
		}

		#endregion
	}

}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Share

About the Author

Carsten Zeumer
Software Developer (Senior)
Germany
Carsten started programming in BASIC and assembler back in the 80’s when he got his first C64. After switching to an x86-based system he started programming in Pascal and C. He started Windows programming with the arrival of Windows 3.0. After working for various internet companies, developing linguistic text analysis and classification software for 25hours communications, he is now working as a contractor.
 
Carsten lives in Hamburg, Germany with his wife and five children.

| Advertise | Privacy | Mobile
Web04 | 2.8.140827.1 | Last Updated 21 Oct 2009
Article Copyright 2009 by Carsten Zeumer
Everything else Copyright © CodeProject, 1999-2014
Terms of Service
Layout: fixed | fluid