From dd14976a3e13053b554e164c582084ddf8e0e833 Mon Sep 17 00:00:00 2001
From: Joe Robinson <joe@lc8n.com>
Date: Mon, 26 Jan 2015 22:32:55 +0000
Subject: Initial commit

---
 src/com/ensemble/wordcount/WordUtil.java | 108 +++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 src/com/ensemble/wordcount/WordUtil.java

(limited to 'src/com/ensemble/wordcount/WordUtil.java')
diff --git a/src/com/ensemble/wordcount/WordUtil.java b/src/com/ensemble/wordcount/WordUtil.java
new file mode 100644
index 0000000..0344f4c
--- /dev/null
+++ b/src/com/ensemble/wordcount/WordUtil.java
@@ -0,0 +1,108 @@
+package com.ensemble.wordcount;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * This class contains various utility functions for counting words
+ * @author Joe Robinson
+ *
+ */
+public class WordUtil {
+
+	/**
+	 * Reads all the words from a given file.
+	 * Each line is split on runs of whitespace; every character outside
+	 * a-z/A-Z is stripped from a token and the rest is lower-cased.
+	 * Tokens that end up empty (e.g. pure punctuation) are skipped.
+	 * The file is decoded with the platform default charset.
+	 * 
+	 * @param filename	A String containing the filename or path to read from
+	 * @return			A List containing all words in the file, in reading order
+	 * @throws IOException	If the file cannot be opened or read
+	 */
+	public List<String> readWordsFromFile(String filename) throws IOException {
+		List<String> words = new ArrayList<String>();
+		BufferedReader br = new BufferedReader(new FileReader(filename));
+		//try/finally so the reader is closed even when readLine() throws
+		try {
+			for (String line = br.readLine(); line != null; line = br.readLine()) {
+				//Split on any whitespace so tab separated words are not merged
+				for (String token : line.split("\\s+")) {
+					//Fixed locale: the default one may map "I" to a non-ASCII letter
+					String word = token.replaceAll("[^a-zA-Z]", "")
+							.toLowerCase(java.util.Locale.ENGLISH);
+					if (word.length() > 0) {
+						words.add(word);
+					}
+				}
+			}
+		} finally {
+			br.close();
+		}
+		return words;
+	}
+
+	/**
+	 * Counts the number of occurrences of a word in a given list of words.
+	 * Words will not be counted if they are a substring of another word in the list:
+	 * a word contained in an already counted word is ignored, and counted words
+	 * contained in a newly seen word are dropped together with their counts.
+	 * Null and empty entries are skipped.
+	 * 
+	 * @param words	A list containing single word strings
+	 * @return		A map with the word as the key, and the number of occurrences as the value
+	 */
+	public Map<String,Integer> countWords(List<String> words) {
+		//ConcurrentHashMap allows keys to be removed while the key set is iterated
+		Map<String,Integer> wordCounts = new ConcurrentHashMap<String,Integer>();
+		for (String newWord : words) {
+			//An empty string is a substring of every word, so it is never a word itself
+			if (newWord == null || newWord.length() == 0) {
+				continue;
+			}
+			//If the exact word is already a key in the map, increment its count
+			Integer count = wordCounts.get(newWord);
+			if (count != null) {
+				wordCounts.put(newWord, count + 1);
+				continue;
+			}
+			boolean addWord = true;
+			for (String countedWord : wordCounts.keySet()) {
+				//If an existing word in the map contains this word, don't count it
+				if (countedWord.contains(newWord)) {
+					addWord = false;
+					break;
+				}
+				//If this word contains any of the existing words, remove them from the map
+				if (newWord.contains(countedWord)) {
+					wordCounts.remove(countedWord);
+				}
+			}
+			//Add the word to the map as long as it wasn't found in an existing word
+			if (addWord) {
+				wordCounts.put(newWord, 1);
+			}
+		}
+		return wordCounts;
+	}
+
+	/**
+	 * Extract a sorted list of the words in the map based on their length
+	 * @param wordCounts	A map containing strings as the key. Value is not used
+	 * 
+	 * @return				A list of the keys sorted into descending order of length
+	 */
+	public List<String> sortWords(Map<String, ?> wordCounts) {
+		List<String> sortedWords = new ArrayList<String>(wordCounts.keySet());
+		Collections.sort(sortedWords, new LengthComparator());
+		return sortedWords;
+	}
+
+}
-- 
cgit v1.2.3