public class WordListBuilder { private static final Pattern english = Pattern.compile("[a-zA-Z0-9]+"); public static void main(final String[] args) throws Exception { List<String> lines = IOUtils.readLines ("E:/Ms Thesis/Implementation Month Task/English Novels/english.txt", 0); LanguageUtils languageUtils = new LanguageUtils(); HashMap<String, Integer> wordFreq = new HashMap<String, Integer>(); Set<String> used = new HashSet<String>(); for (String line : lines) { if(used.contains(line)){ continue; }else { used.add(line); } line = line.toLowerCase(); ArrayList<String> words = languageUtils.lineToWords(line, true); for (String word : words) { int freq = 1; if(english.matcher(word).matches()){ continue; } if(word.length() < 2) continue; if (wordFreq.keySet().contains(word)) { freq = freq + wordFreq.get(word); System.out.println(word+ " " +freq); } // System.out.println(freq); wordFreq.put(word, freq); } } System.out.println("Size: "+wordFreq.size()); Collection<Integer> freqs = wordFreq.values(); HashSet<Integer> freqSet = new HashSet<Integer>(); for(Integer i : freqs){ freqSet.add(i); } Comparator<Object> comparator = Collections.reverseOrder(); Set<Integer> treeSet = new TreeSet<Integer>(comparator); treeSet.addAll(freqSet); //Collections.sort(freqSet.toArray(new Integer [freqSet.size()]), comparator); StringBuilder sb = new StringBuilder(); int limit = 1000000; for (Integer freq : treeSet) { System.out.println(freq); for (String word : wordFreq.keySet()) { if(!word.matches("\\p{L}*")) continue; int freq1 = wordFreq.get(word); if (freq == freq1) { limit = limit - 1; String out = word+ freq + "\n"; sb.append(out); if(limit < 1){ System.out.println(limit); break; } } } } writeOut("E:/Ms Thesis/Implementation Month Task/English Novels/temp.csv", sb.toString(), true); System.out.println(sb.toString()); } public static synchronized void writeOut(String filename, String outString, boolean append) throws IOException { FileWriter fstream = new FileWriter(filename, append); BufferedWriter out = new BufferedWriter(fstream); out.write(outString); if (append) { out.write("\n"); } out.close(); fstream.close(); System.out.println("Done writing"); } } ........................ Method to convert lines to words(From language utils class) public ArrayList<String> lineToWords(String line, boolean hasSpace){ this.hasSpace = hasSpace; return lineToWords(line); // store and return it to lineToWords } // Line to words conversion public ArrayList<String> lineToWords(String string) { ArrayList<String> result = new ArrayList<String>(); if(!hasSpace) { String[] result0 = string.split(""); result = new ArrayList<String>(Arrays.asList(result0)); return result; // if no space create a new array list and assign it to result } TokenizerFactory tokenizerFactory = new IndoEuropeanTokenizerFactory(); Tokenizer tokenizer = tokenizerFactory.tokenizer(string.toCharArray(), 0, string.length()); while (true) { String token = tokenizer.nextToken(); if (token == null) { break; } // Iterate loop till the end of string. If Null then break result.add(token); } return result; }
writeOut
FileWriter
FileWriter fstream = new FileWriter(filename, append);
append
false
var
This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)