summary refs log tree commit diff
path: root/src/com/ensemble/wordcount/WordUtil.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/com/ensemble/wordcount/WordUtil.java')
-rw-r--r-- src/com/ensemble/wordcount/WordUtil.java 108
1 files changed, 108 insertions, 0 deletions
diff --git a/src/com/ensemble/wordcount/WordUtil.java b/src/com/ensemble/wordcount/WordUtil.java
new file mode 100644
index 0000000..0344f4c
--- /dev/null
+++ b/src/com/ensemble/wordcount/WordUtil.java
@@ -0,0 +1,108 @@
+package com.ensemble.wordcount;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Pattern;
+
+/**
+ * Utility functions for reading words from a text file, counting them and
+ * ordering them.
+ *
+ * <p>The class has no instance fields; every method works only on its arguments.
+ *
+ * @author Joe Robinson
+ */
+public class WordUtil {
+
+    /** Matches any character that is not an ASCII letter. Compiled once and reused. */
+    private static final Pattern NON_LETTERS = Pattern.compile("[^a-zA-Z]");
+
+    /** Matches a run of one or more whitespace characters (spaces, tabs, ...). */
+    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+
+    /**
+     * Reads all the words from a given file.
+     *
+     * <p>Each line is split on whitespace. Every resulting token has all characters
+     * other than ASCII letters removed and is converted to lower case. Tokens that
+     * are empty after this clean-up (blank lines, repeated separators, tokens made
+     * only of digits or punctuation) are not words and are left out of the result.
+     *
+     * <p>The file is decoded with the JVM's default charset, because it is opened
+     * through {@link FileReader}.
+     *
+     * @param filename A String containing the filename or path to read from
+     * @return A List containing all words in the file, in the order they were read
+     * @throws IOException if the file cannot be opened or read
+     */
+    public List<String> readWordsFromFile(String filename) throws IOException {
+
+        List<String> words = new ArrayList<String>();
+        BufferedReader br = new BufferedReader(new FileReader(filename));
+
+        // try/finally guarantees the reader is closed even when readLine() fails
+        try {
+            String line;
+            while ((line = br.readLine()) != null) {
+                for (String token : WHITESPACE.split(line)) {
+                    // Locale.ROOT keeps the lower-casing independent of the default
+                    // locale (e.g. Turkish would map 'I' to a dotless i)
+                    String word = NON_LETTERS.matcher(token).replaceAll("")
+                            .toLowerCase(Locale.ROOT);
+                    if (!word.isEmpty()) {
+                        words.add(word);
+                    }
+                }
+            }
+        } finally {
+            br.close();
+        }
+
+        return words;
+    }
+
+    /**
+     * Counts the number of occurrences of a word in a given list of words.
+     * Words will not be counted if they are a substring of another word in the list.
+     *
+     * <p>The list is processed in order. A word that is already a key has its count
+     * incremented. A new word is ignored when an existing key contains it; otherwise
+     * it is added with a count of 1 and every existing key that it contains is
+     * removed, together with the count accumulated for that key so far.
+     *
+     * <p>{@code null} and empty entries are skipped: they are not words, and an
+     * empty string is a substring of every other word.
+     *
+     * @param words A list containing single word strings
+     * @return A map with the word as the key, and the number of occurrences as the value
+     */
+    public Map<String,Integer> countWords(List<String> words) {
+
+        // A ConcurrentHashMap is used because its key-set iterator tolerates entries
+        // being removed from the map while the iteration below is still in progress
+        Map<String,Integer> wordCounts = new ConcurrentHashMap<String,Integer>();
+
+        for (String newWord : words) {
+
+            if (newWord == null || newWord.isEmpty()) {
+                continue;
+            }
+
+            // If the exact word is already a key in the map, increment its count
+            Integer currentCount = wordCounts.get(newWord);
+            if (currentCount != null) {
+                wordCounts.put(newWord, currentCount + 1);
+                continue;
+            }
+
+            boolean addWord = true;
+
+            for (String countedWord : wordCounts.keySet()) {
+
+                if (countedWord.contains(newWord)) {
+                    // An existing word in the map contains this word, don't count it
+                    addWord = false;
+                    break;
+                } else if (newWord.contains(countedWord)) {
+                    // This word contains an existing word, remove that word from the map
+                    wordCounts.remove(countedWord);
+                }
+            }
+
+            // Add the word to the map as long as it wasn't found in an existing word
+            if (addWord) {
+                wordCounts.put(newWord, 1);
+            }
+        }
+        return wordCounts;
+    }
+
+    /**
+     * Extract a sorted list of the words in the map.
+     *
+     * <p>The keys are copied into a new list, so the map itself is not modified.
+     * The ordering is whatever {@code LengthComparator} defines; that class is
+     * declared elsewhere (presumably descending order of length, as originally
+     * documented here; confirm against the comparator).
+     *
+     * @param wordCounts A map containing strings as the key. Value is not used
+     * @return A new list of the keys, sorted with {@code LengthComparator}
+     */
+    public List<String> sortWords(Map<String, ?> wordCounts) {
+
+        List<String> sortedWords = new ArrayList<String>(wordCounts.keySet());
+
+        Collections.sort(sortedWords, new LengthComparator());
+
+        return sortedWords;
+    }
+
+}