|
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text.RegularExpressions;
namespace IRProj
{
/// <summary>
/// Breaks raw text into lower-cased word tokens for indexing.
/// </summary>
public class Tokenizer
{
    // Characters that separate tokens: space, tab, CR, LF and the punctuation
    // { } ( ) : ; .
    // The class is deliberately NOT wrapped in a capturing group: Regex.Split
    // would otherwise return the delimiters themselves as array elements and
    // every token would need a second regex pass to filter them out again.
    // '\r' is included so Windows line endings do not leave tokens such as
    // "word\r" behind.
    // Built once and shared; Regex instances are immutable and thread-safe.
    private static readonly Regex Delimiters =
        new Regex(@"[ \t\r\n{}():;.]", RegexOptions.Compiled);

    /// <summary>
    /// Splits <paramref name="input"/> on the delimiter set, lower-cases it,
    /// and returns the remaining tokens in their original order.
    /// </summary>
    /// <param name="input">Text to tokenize. May be null or empty.</param>
    /// <returns>
    /// A new list containing every non-blank fragment for which
    /// <c>StopWordsHandler.IsStopword</c> returns false. The list is empty
    /// (never null) when <paramref name="input"/> is null or empty.
    /// </returns>
    public List<string> Partition(string input)
    {
        List<string> filter = new List<string>();
        if (string.IsNullOrEmpty(input))
        {
            return filter;
        }

        // Invariant lower-casing keeps the result independent of the current
        // thread culture (e.g. Turkish dotless 'i').
        string[] tokens = Delimiters.Split(input.ToLowerInvariant());
        foreach (string token in tokens)
        {
            // Adjacent delimiters yield empty fragments, and whitespace that is
            // not in the delimiter set can yield blank ones; skip both before
            // consulting the stop-word list.
            if (token.Trim().Length > 0 && !StopWordsHandler.IsStopword(token))
            {
                filter.Add(token);
            }
        }
        return filter;
    }
}
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.