package com.ensemble.wordcount;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

/**
 * Utility functions for reading, counting and ordering words.
 *
 * @author Joe Robinson
 */
public class WordUtil {

    /** Matches every character that is not an ASCII letter; compiled once and reused. */
    private static final Pattern NON_LETTERS = Pattern.compile("[^a-zA-Z]");

    /**
     * Reads all the words from a given file.
     *
     * <p>Each line is split on single spaces. Every token has all characters
     * other than ASCII letters removed and is converted to lower case. Tokens
     * that end up empty (pure punctuation, digits, or the gaps produced by
     * consecutive spaces) are skipped so that the result only contains words.
     *
     * <p>The file is decoded by {@link FileReader}, i.e. with the platform
     * default charset.
     *
     * @param filename A String containing the filename or path to read from
     * @return A List containing all words in the file, in reading order
     * @throws IOException if the file cannot be opened or read
     */
    public List<String> readWordsFromFile(String filename) throws IOException {
        List<String> words = new ArrayList<String>();
        BufferedReader br = new BufferedReader(new FileReader(filename));
        try {
            String line = br.readLine();
            while (line != null) {
                for (String token : line.split(" ")) {
                    // Locale.ROOT keeps the lower-casing independent of the JVM's
                    // default locale (e.g. Turkish would map 'I' to a dotless 'i').
                    String word = NON_LETTERS.matcher(token).replaceAll("").toLowerCase(Locale.ROOT);
                    if (word.length() > 0) {
                        words.add(word);
                    }
                }
                line = br.readLine();
            }
        } finally {
            // Close the reader even when readLine() fails part-way through.
            br.close();
        }
        return words;
    }

    /**
     * Counts the number of occurrences of a word in a given list of words.
     * Words will not be counted if they are a substring of another word
     * already held in the result.
     *
     * <p>When a new word contains words that were counted earlier, those
     * shorter entries (and their counts) are removed from the result before
     * the new word is added with a count of 1.
     *
     * @param words A list containing single word strings
     * @return A map with the word as the key, and the number of occurrences as the value
     */
    public Map<String, Integer> countWords(List<String> words) {
        // A ConcurrentHashMap is used because entries are removed while the key
        // set is being iterated; its iterators are weakly consistent and do not
        // throw ConcurrentModificationException.
        Map<String, Integer> wordCounts = new ConcurrentHashMap<String, Integer>();

        for (String newWord : words) {
            // If the exact word is already a key in the map, increment its count
            Integer currentCount = wordCounts.get(newWord);
            if (currentCount != null) {
                wordCounts.put(newWord, currentCount + 1);
                continue;
            }

            boolean addWord = true;
            for (String countedWord : wordCounts.keySet()) {
                if (countedWord.contains(newWord)) {
                    // An existing word in the map contains this word: don't count it
                    addWord = false;
                    break;
                } else if (newWord.contains(countedWord)) {
                    // This word contains an existing word: drop the existing one
                    wordCounts.remove(countedWord);
                }
            }

            // Add the word to the map as long as it wasn't found in an existing word
            if (addWord) {
                wordCounts.put(newWord, 1);
            }
        }
        return wordCounts;
    }

    /**
     * Extracts a sorted list of the words in the map based on their length.
     *
     * <p>The ordering itself is delegated to {@code LengthComparator}, which is
     * defined outside this file; it is expected to sort by descending length.
     *
     * @param wordCounts A map containing strings as the key. Value is not used
     * @return A new list of the keys, sorted with {@code LengthComparator}
     */
    public List<String> sortWords(Map<String, Integer> wordCounts) {
        LengthComparator lengthCompare = new LengthComparator();
        List<String> sortedWords = new ArrayList<String>(wordCounts.keySet());
        Collections.sort(sortedWords, lengthCompare);
        return sortedWords;
    }
}