From dd14976a3e13053b554e164c582084ddf8e0e833 Mon Sep 17 00:00:00 2001 From: Joe Robinson Date: Mon, 26 Jan 2015 22:32:55 +0000 Subject: Initial commit --- WordCount.jar | Bin 0 -> 4262 bytes .../com/ensemble/wordcount/LengthComparator.class | Bin 0 -> 590 bytes classes/com/ensemble/wordcount/WordCount.class | Bin 0 -> 2741 bytes classes/com/ensemble/wordcount/WordUtil.class | Bin 0 -> 2690 bytes manifest.txt | 2 + readme.txt | 38 ++++++++ specification.txt | 38 ++++++++ src/com/ensemble/wordcount/LengthComparator.java | 17 ++++ src/com/ensemble/wordcount/WordCount.java | 103 ++++++++++++++++++++ src/com/ensemble/wordcount/WordUtil.java | 108 +++++++++++++++++++++ test.txt | 1 + 11 files changed, 307 insertions(+) create mode 100644 WordCount.jar create mode 100644 classes/com/ensemble/wordcount/LengthComparator.class create mode 100644 classes/com/ensemble/wordcount/WordCount.class create mode 100644 classes/com/ensemble/wordcount/WordUtil.class create mode 100644 manifest.txt create mode 100755 readme.txt create mode 100644 specification.txt create mode 100644 src/com/ensemble/wordcount/LengthComparator.java create mode 100644 src/com/ensemble/wordcount/WordCount.java create mode 100644 src/com/ensemble/wordcount/WordUtil.java create mode 100644 test.txt diff --git a/WordCount.jar b/WordCount.jar new file mode 100644 index 0000000..def59d7 Binary files /dev/null and b/WordCount.jar differ diff --git a/classes/com/ensemble/wordcount/LengthComparator.class b/classes/com/ensemble/wordcount/LengthComparator.class new file mode 100644 index 0000000..a1c4e00 Binary files /dev/null and b/classes/com/ensemble/wordcount/LengthComparator.class differ diff --git a/classes/com/ensemble/wordcount/WordCount.class b/classes/com/ensemble/wordcount/WordCount.class new file mode 100644 index 0000000..63b145a Binary files /dev/null and b/classes/com/ensemble/wordcount/WordCount.class differ diff --git a/classes/com/ensemble/wordcount/WordUtil.class 
b/classes/com/ensemble/wordcount/WordUtil.class new file mode 100644 index 0000000..5fef3bb Binary files /dev/null and b/classes/com/ensemble/wordcount/WordUtil.class differ diff --git a/manifest.txt b/manifest.txt new file mode 100644 index 0000000..9a2b78e --- /dev/null +++ b/manifest.txt @@ -0,0 +1,2 @@ +Main-Class: com.ensemble.wordcount.WordCount +Class-Path: classes/ diff --git a/readme.txt b/readme.txt new file mode 100755 index 0000000..af07ce4 --- /dev/null +++ b/readme.txt @@ -0,0 +1,38 @@ +Word Count Program +Joe Robinson +Java Test for Ensemble + +This program can be used to count the number of occurrences of a word in a given file. + +It will not count words which are used again as part of longer words. + +E.g: "his" and "is" will not be counted if "this" is used. + +To run the program, run: + +java -jar WordCount.jar + +This will prompt you for a text file to count the words from. You can enter just the filename if it is in the same directory as WordCount.jar, or provide a full path to a file. + +You can also enter the filename as an argument, e.g: + +java -jar WordCount.jar test.txt + +Source code is located in src/com/ensemble/wordcount + +The file can be compiled and built with the following commands: + +javac -d classes src/com/ensemble/wordcount/*.java + +jar cvfm WordCount.jar manifest.txt classes/com/ensemble/wordcount/*.class + +Two example files are provided + +test.txt - the example line provided in the specification. The program will produce the same output as given in the specification. + +specification.txt - The specification for the program as provided by email. + +Known issue: Currently all punctuation is removed, as otherwise it was counting words with punctuation following them as a different word (e.g. "maybe." would discard "maybe"). This is not ideal as it would also remove punctuation in the middle of the words, and we probably only want to remove punctuation at the start/end of words.
I considered this to be outside of the specification, as it would take some extra time to ensure this was handled properly, and it was not specified how or if this +should be handled. + +Note: This has been tested on Linux and Windows systems, but should work on any system which can run Java. diff --git a/specification.txt b/specification.txt new file mode 100644 index 0000000..7734850 --- /dev/null +++ b/specification.txt @@ -0,0 +1,38 @@ +Code Test + +A customer wishes to create a word count program. However unlike a +traditional word count they + +wish to ignore words which are partial matches of longer words For +example using the following + +text: + +A mate material may maybe right maybe + +A is discarded as it is contained in mate, material, may and maybe. + +A mate material may maybe right maybe + +Mate is discarded as it is contained in material. + +A mate material may maybe right maybe + +May is discarded as it is contained in maybe + +A mate material may maybe right maybe + +Write a Java program which takes an input file and outputs the word +count, for example based on + +the previous example text the output would be: + +Material: 1 + +Maybe: 2 + +Right: 1 + +The output should be sorted by longest word in descending order. + +Ensure that your code is written to commercial standards. 
diff --git a/src/com/ensemble/wordcount/LengthComparator.java b/src/com/ensemble/wordcount/LengthComparator.java
new file mode 100644
index 0000000..9692097
--- /dev/null
+++ b/src/com/ensemble/wordcount/LengthComparator.java
@@ -0,0 +1,26 @@
+package com.ensemble.wordcount;
+
+import java.util.Comparator;
+
+/**
+ * A Comparator for ordering strings by length, longest first.
+ * This is used for sorting the list of words by length.
+ *
+ * @author Joe Robinson
+ */
+public class LengthComparator implements Comparator<String> {
+
+    /**
+     * Compares two strings by length, in descending order.
+     *
+     * @param string1 The first string to compare
+     * @param string2 The second string to compare
+     * @return A negative number if string1 is longer than string2, zero if both
+     *         have the same length, and a positive number if string1 is shorter
+     */
+    @Override
+    public int compare(String string1, String string2) {
+        //Lengths are never negative, so the subtraction cannot overflow
+        return string2.length() - string1.length();
+    }
+}
diff --git a/src/com/ensemble/wordcount/WordCount.java b/src/com/ensemble/wordcount/WordCount.java
new file mode 100644
index 0000000..c6a63d0
--- /dev/null
+++ b/src/com/ensemble/wordcount/WordCount.java
@@ -0,0 +1,108 @@
+package com.ensemble.wordcount;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * This program may be used to count the number of occurrences of words in a given file.
+ * Words are not counted if they are contained within a larger word (eg "is" and "this")
+ *
+ * @author Joe Robinson
+ *
+ */
+public class WordCount {
+
+    //A single reader is shared by every prompt, as a new BufferedReader per prompt
+    //could lose input that an earlier reader had already buffered
+    private static final BufferedReader CONSOLE =
+            new BufferedReader(new InputStreamReader(System.in));
+
+    /**
+     * Reads a file, counts its words and prints the counts, longest word first.
+     *
+     * @param args An optional filename to read. If it is missing or unusable,
+     *             the user is prompted for a filename instead
+     */
+    public static void main(String args[]) {
+
+        WordUtil wordUtil = new WordUtil();
+        List<String> words = new ArrayList<String>();
+
+        //If an argument has been entered, use it as the filename for the first attempt only.
+        //Retrying the same argument after a failure would loop forever
+        String filename = "";
+        if (args.length > 0 && args[0] != null) {
+            filename = args[0];
+        }
+
+        while (words.isEmpty()) {
+
+            if (filename.equals("")) {
+                filename = readFilename();
+            }
+
+            try {
+                words = wordUtil.readWordsFromFile(filename);
+
+                //Check that the file actually contains some words, if not then prompt for another
+                if (words.isEmpty()) {
+                    System.out.println("File does not contain any words. Please try another file.");
+                }
+            } catch (IOException e) {
+                System.out.println("Could not open file. Please try another.");
+            }
+
+            //Any further attempt must prompt for a new filename
+            filename = "";
+        }
+
+        Map<String, Integer> wordCounts = wordUtil.countWords(words);
+        List<String> sortedWords = wordUtil.sortWords(wordCounts);
+
+        System.out.println();
+
+        for (String word : sortedWords) {
+
+            //Capitalise the first letter of the word for output
+            char firstChar = Character.toUpperCase(word.charAt(0));
+            String outputWord = firstChar + word.substring(1);
+
+            System.out.println(outputWord + ": " + wordCounts.get(word));
+        }
+    }
+
+    /**
+     * Prompt the user for a filename to read until one is entered, and confirm that it exists.
+     * Exits the program if the user types "q" or if there is no more input to read.
+     *
+     * @return The filename, which is confirmed to exist
+     */
+    private static String readFilename() {
+
+        while (true) {
+            String filename = null;
+            try {
+                System.out.println("Please enter a file name to read, or type \"q\" to quit");
+                filename = CONSOLE.readLine();
+            } catch (IOException e) {
+                System.out.println("Could not read input");
+                System.exit(1);
+            }
+
+            //readLine() returns null at the end of the input (e.g. an exhausted pipe)
+            if (filename == null || filename.equals("q")) {
+                System.exit(0);
+            }
+
+            if (new File(filename).exists()) {
+                return filename;
+            }
+            System.out.println("File does not exist. Try again");
+        }
+    }
+}
diff --git a/src/com/ensemble/wordcount/WordUtil.java b/src/com/ensemble/wordcount/WordUtil.java
new file mode 100644
index 0000000..0344f4c
--- /dev/null
+++ b/src/com/ensemble/wordcount/WordUtil.java
@@ -0,0 +1,131 @@
+package com.ensemble.wordcount;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * This class contains various utility functions for counting words
+ * @author Joe Robinson
+ *
+ */
+public class WordUtil {
+
+    /**
+     * Reads all the words from a given file.
+     * Words are separated by whitespace. Everything except the letters a-z is removed
+     * from each word, and the word is converted to lower case.
+     *
+     * @param filename A String containing the filename or path to read from
+     * @return A List containing all words in the file, in the order they were read
+     * @throws IOException If the file cannot be opened or read
+     */
+    public List<String> readWordsFromFile(String filename) throws IOException {
+
+        List<String> words = new ArrayList<String>();
+        BufferedReader br = new BufferedReader(new FileReader(filename));
+
+        try {
+            String line = br.readLine();
+
+            while (line != null) {
+
+                //Split on any run of whitespace, so tabs and repeated spaces also separate words
+                for (String tmpWord : line.split("\\s+")) {
+
+                    //Remove all punctuation and convert all words to lower case for processing.
+                    //Locale.ROOT keeps the result independent of the system language
+                    String word = tmpWord.replaceAll("[^a-zA-Z]", "").toLowerCase(Locale.ROOT);
+
+                    //Nothing is left of tokens such as "-" or "123", and they are not words
+                    if (!word.isEmpty()) {
+                        words.add(word);
+                    }
+                }
+
+                line = br.readLine();
+            }
+        } finally {
+            //Close the file even if reading it fails part way through
+            br.close();
+        }
+
+        return words;
+    }
+
+    /**
+     * Counts the number of occurrences of a word in a given list of words
+     * Words will not be counted if they are a substring of another word in the list
+     *
+     * @param words A list containing single word strings
+     * @return A map with the word as the key, and the number of occurrences as the value
+     */
+    public Map<String, Integer> countWords(List<String> words) {
+
+        //Need to use a concurrent hashmap as we will be modifying it as we loop through
+        Map<String, Integer> wordCounts = new ConcurrentHashMap<String, Integer>();
+
+        for (String newWord : words) {
+
+            //An empty string is contained in every word, and null cannot be a key
+            if (newWord == null || newWord.isEmpty()) {
+                continue;
+            }
+
+            //If the exact word is already a key in the map, increment its count
+            Integer count = wordCounts.get(newWord);
+            if (count != null) {
+                wordCounts.put(newWord, count + 1);
+                continue;
+            }
+
+            boolean addWord = true;
+
+            for (String countedWord : wordCounts.keySet()) {
+
+                //If an existing word in the map contains this word, don't count it
+                if (countedWord.contains(newWord)) {
+                    addWord = false;
+                    break;
+                }
+
+                //If this word contains any of the existing words, remove them from the map
+                if (newWord.contains(countedWord)) {
+                    wordCounts.remove(countedWord);
+                }
+            }
+
+            //Add the word to the map as long as it wasn't found in an existing word
+            if (addWord) {
+                wordCounts.put(newWord, 1);
+            }
+        }
+        return wordCounts;
+    }
+
+    /**
+     * Extract a sorted list of the words in the map based on their length.
+     * Words of the same length are listed in alphabetical order.
+     *
+     * @param wordCounts A map containing strings as the key. Value is not used
+     * @return A list of the keys sorted into descending order of length
+     */
+    public List<String> sortWords(Map<String, Integer> wordCounts) {
+
+        List<String> sortedWords = new ArrayList<String>(wordCounts.keySet());
+
+        //Collections.sort is stable, so sorting alphabetically first makes the order of
+        //equal-length words predictable after the sort by length
+        Collections.sort(sortedWords);
+        Collections.sort(sortedWords, new LengthComparator());
+
+        return sortedWords;
+    }
+
+}
diff --git a/test.txt b/test.txt
new file mode 100644
index 0000000..d64914f
--- /dev/null
+++ b/test.txt
@@ -0,0 +1 @@
+A mate material may maybe right maybe
--
cgit v1.2.3