From dd14976a3e13053b554e164c582084ddf8e0e833 Mon Sep 17 00:00:00 2001 From: Joe Robinson Date: Mon, 26 Jan 2015 22:32:55 +0000 Subject: Initial commit --- src/com/ensemble/wordcount/WordUtil.java | 108 +++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 src/com/ensemble/wordcount/WordUtil.java (limited to 'src/com/ensemble/wordcount/WordUtil.java') diff --git a/src/com/ensemble/wordcount/WordUtil.java b/src/com/ensemble/wordcount/WordUtil.java new file mode 100644 index 0000000..0344f4c --- /dev/null +++ b/src/com/ensemble/wordcount/WordUtil.java @@ -0,0 +1,108 @@ +package com.ensemble.wordcount; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * This class contains various utility functions for counting words + * @author Joe Robinson + * + */ +public class WordUtil { + + /** + * Reads all the words from a given file + * + * @param filename A String containing the filename or path to read from + * @return A List containing all words in the file + */ + public List readWordsFromFile(String filename) throws IOException { + + FileReader fr = new FileReader(filename); + BufferedReader br = new BufferedReader(fr); + + String line = br.readLine(); + String[] tmpWords; + List words = new ArrayList(); + + while (line != null) { + + tmpWords = line.split(" "); + + //Remove all punctuation and convert all words to lower case for processing + for (String tmpWord : tmpWords) { + words.add(tmpWord.replaceAll("[^a-zA-Z]", "").toLowerCase()); + } + + line = br.readLine(); + } + + br.close(); + + return words; + } + + /** + * Counts the number of occurrences of a word in a given list of words + * Words will not be counted if they are a substring of another word in the list + * + * @param words A list containing single word strings + * @return A map with the word as the key, and the number of occurrences as the value + */ + public Map countWords(List words) { + + Map wordCounts = new ConcurrentHashMap(); + + for (String newWord : words) { + + //If the exact word is already a key in the map, increment it's count + if (wordCounts.containsKey(newWord)) { + wordCounts.put(newWord, wordCounts.get(newWord) + 1); + } else { + + boolean addWord = true; + + for (String countedWord : wordCounts.keySet()) { + + //If an existing word in the map contains this word, don't count it + if (countedWord.contains(newWord)) { + addWord = false; + break; + //If this word contains any of the existing words, remove them from the map + } else if (newWord.contains(countedWord)) { + wordCounts.remove(countedWord); + } + } + + //Add the word to the map as long as it wasn't found in an existing word + if (addWord) { + wordCounts.put(newWord, 1); + } + } + } + return wordCounts; + } + + /** + * Extract a sorted list of the words in the map based on their length + * @param wordCounts A map containing strings as the key. Value is not used + * + * @return A list of the keys sorted into descending order of length + */ + public List sortWords(Map wordCounts) { + + LengthComparator lengthCompare = new LengthComparator(); + List sortedWords = new ArrayList(wordCounts.keySet()); + + Collections.sort(sortedWords, lengthCompare); + + return sortedWords; + } + +} -- cgit v1.2.3