summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--WordCount.jarbin0 -> 4262 bytes
-rw-r--r--classes/com/ensemble/wordcount/LengthComparator.classbin0 -> 590 bytes
-rw-r--r--classes/com/ensemble/wordcount/WordCount.classbin0 -> 2741 bytes
-rw-r--r--classes/com/ensemble/wordcount/WordUtil.classbin0 -> 2690 bytes
-rw-r--r--manifest.txt2
-rwxr-xr-xreadme.txt38
-rw-r--r--specification.txt38
-rw-r--r--src/com/ensemble/wordcount/LengthComparator.java17
-rw-r--r--src/com/ensemble/wordcount/WordCount.java103
-rw-r--r--src/com/ensemble/wordcount/WordUtil.java108
-rw-r--r--test.txt1
11 files changed, 307 insertions, 0 deletions
diff --git a/WordCount.jar b/WordCount.jar
new file mode 100644
index 0000000..def59d7
--- /dev/null
+++ b/WordCount.jar
Binary files differ
diff --git a/classes/com/ensemble/wordcount/LengthComparator.class b/classes/com/ensemble/wordcount/LengthComparator.class
new file mode 100644
index 0000000..a1c4e00
--- /dev/null
+++ b/classes/com/ensemble/wordcount/LengthComparator.class
Binary files differ
diff --git a/classes/com/ensemble/wordcount/WordCount.class b/classes/com/ensemble/wordcount/WordCount.class
new file mode 100644
index 0000000..63b145a
--- /dev/null
+++ b/classes/com/ensemble/wordcount/WordCount.class
Binary files differ
diff --git a/classes/com/ensemble/wordcount/WordUtil.class b/classes/com/ensemble/wordcount/WordUtil.class
new file mode 100644
index 0000000..5fef3bb
--- /dev/null
+++ b/classes/com/ensemble/wordcount/WordUtil.class
Binary files differ
diff --git a/manifest.txt b/manifest.txt
new file mode 100644
index 0000000..9a2b78e
--- /dev/null
+++ b/manifest.txt
@@ -0,0 +1,2 @@
+Main-Class: com.ensemble.wordcount.WordCount
+Class-Path: classes/
diff --git a/readme.txt b/readme.txt
new file mode 100755
index 0000000..af07ce4
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,38 @@
+Word Count Program
+Joe Robinson
+Java Test for Ensemble
+
+This program can be used to count the number of occurrences of each word in a given file.
+
+It will not count words which are used again as part of longer words.
+
+E.g: "his" and "is" will not be counted if "this" is used.
+
+To run the program, run:
+
+java -jar WordCount.jar
+
+This will prompt you for a text file to count the words from. You can enter just the filename if it is in the same directory as WordCount.jar, or provide a full path to a file.
+
+You can also enter the filename as an argument, e.g:
+
+java -jar WordCount.jar test.txt
+
+Source code is located in src/com/ensemble/wordcount
+
+The file can be compiled and built with the following commands:
+
+javac -d classes src/com/ensemble/wordcount/*.java
+
+jar cvfm WordCount.jar manifest.txt classes/com/ensemble/wordcount/*.class
+
+Two example files are provided:
+
+test.txt - the example line provided in the specification. The program will produce the same output as given in the specification.
+
+specification.txt - The specification for the program as provided by email.
+
+Known issue: Currently all punctuation is removed, as otherwise words followed by punctuation were counted as different words (e.g. "maybe." would discard "maybe"). This is not ideal, as it also removes punctuation in the middle of words, and we probably only want to remove punctuation at the start/end of words. I considered this to be outside of the specification, as it would take some extra time to ensure this was handled properly, and it was not specified how or if this
+should be handled.
+
+Note: This has been tested on Linux and Windows systems, but should work on any system which can run Java.
diff --git a/specification.txt b/specification.txt
new file mode 100644
index 0000000..7734850
--- /dev/null
+++ b/specification.txt
@@ -0,0 +1,38 @@
+Code Test
+
+A customer wishes to create a word count program. However unlike a
+traditional word count they
+
+wish to ignore words which are partial matches of longer words. For
+example using the following
+
+text:
+
+A mate material may maybe right maybe
+
+A is discarded as it is contained in mate, material, may and maybe.
+
+A mate material may maybe right maybe
+
+Mate is discarded as it is contained in material.
+
+A mate material may maybe right maybe
+
+May is discarded as it is contained in maybe
+
+A mate material may maybe right maybe
+
+Write a Java program which takes an input file and outputs the word
+count, for example based on
+
+the previous example text the output would be:
+
+Material: 1
+
+Maybe: 2
+
+Right: 1
+
+The output should be sorted by longest word in descending order.
+
+Ensure that your code is written to commercial standards.
diff --git a/src/com/ensemble/wordcount/LengthComparator.java b/src/com/ensemble/wordcount/LengthComparator.java
new file mode 100644
index 0000000..9692097
--- /dev/null
+++ b/src/com/ensemble/wordcount/LengthComparator.java
@@ -0,0 +1,17 @@
+package com.ensemble.wordcount;
+
+import java.util.Comparator;
+
/**
 * Orders strings by their length, longest first.
 * Used to sort the list of counted words into descending order of length.
 *
 * @author Joe Robinson
 */
public class LengthComparator implements Comparator<String> {

    /**
     * Compares two strings by length only; their contents are ignored.
     *
     * @param string1 the first string to compare
     * @param string2 the second string to compare
     * @return the length of {@code string2} minus the length of {@code string1}:
     *         negative when {@code string1} is longer, positive when it is shorter,
     *         zero when both have the same length
     */
    @Override
    public int compare(String string1, String string2) {
        final int secondLength = string2.length();
        final int firstLength = string1.length();

        // Lengths are never negative, so this subtraction cannot overflow.
        // Subtracting the first from the second yields the descending order.
        return secondLength - firstLength;
    }
}
diff --git a/src/com/ensemble/wordcount/WordCount.java b/src/com/ensemble/wordcount/WordCount.java
new file mode 100644
index 0000000..c6a63d0
--- /dev/null
+++ b/src/com/ensemble/wordcount/WordCount.java
@@ -0,0 +1,103 @@
+package com.ensemble.wordcount;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * This program may be used to count the number of occurences of words in a given file.
+ * Words are not counted if they are contained within a larger word (eg "is" and "this")
+ *
+ * @author Joe Robinson
+ *
+ */
+public class WordCount {
+
+ public static void main(String args[]) {
+
+ //Need to use a concurrent hashmap as we will be modifying it as we loop through
+ Map<String,Integer> wordCounts = new ConcurrentHashMap<String,Integer>();
+ List<String> words = new ArrayList<String>();
+ WordUtil wordUtil = new WordUtil();
+ String filename = "";
+
+ while (filename.equals("")) {
+
+ //If an argument has been entered, use it as the filename, if not prompt for one
+ if (args.length > 0 && args[0] != null) {
+ filename = args[0];
+ } else {
+ filename = readFilename();
+ }
+
+ try {
+ words = wordUtil.readWordsFromFile(filename);
+
+ } catch (IOException e) {
+ System.out.println("Could not open file. Please try another.");
+ filename = "";
+ words = new ArrayList<String>();
+ }
+
+ //Check that the file actually contains some words, if not then prompt for another
+ if (words.size() == 0) {
+ System.out.println("File does not contain any words. Please try another file.");
+ filename = "";
+ words = new ArrayList<String>();
+ }
+
+ }
+ wordCounts = wordUtil.countWords(words);
+
+ List<String> sortedWords = wordUtil.sortWords(wordCounts);
+
+ System.out.println();
+
+ for (String word : sortedWords) {
+
+ //Capitalise the first letter of the word for output
+ char firstChar = Character.toUpperCase(word.charAt(0));
+ String outputWord = firstChar + word.substring(1);
+
+ System.out.println(outputWord + ": " + wordCounts.get(word));
+ }
+ }
+
+ /**
+ * Prompt the user for a filename to read until one is entered, and confirm that it exists
+ * @return The filename, which is confirmed to exist
+ */
+ private static String readFilename() {
+
+ String filename = "";
+ boolean fileExists = false;
+ while (filename == null || filename.equals("") || !fileExists) {
+ BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
+ try {
+ System.out.println("Please enter a file name to read, or type \"q\" to quit");
+ filename = br.readLine();
+ } catch (IOException e) {
+ System.out.println("Could not read input");
+ System.exit(1);
+ }
+
+ if (filename.equals("q")) {
+ System.exit(0);
+ }
+
+ File file = new File(filename);
+
+ if (file.exists()) {
+ fileExists = true;
+ } else {
+ System.out.println("File does not exist. Try again");
+ }
+ }
+ return filename;
+ }
+}
diff --git a/src/com/ensemble/wordcount/WordUtil.java b/src/com/ensemble/wordcount/WordUtil.java
new file mode 100644
index 0000000..0344f4c
--- /dev/null
+++ b/src/com/ensemble/wordcount/WordUtil.java
@@ -0,0 +1,108 @@
+package com.ensemble.wordcount;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * This class contains various utility functions for counting words
+ * @author Joe Robinson
+ *
+ */
+public class WordUtil {
+
+ /**
+ * Reads all the words from a given file
+ *
+ * @param filename A String containing the filename or path to read from
+ * @return A List containing all words in the file
+ */
+ public List<String> readWordsFromFile(String filename) throws IOException {
+
+ FileReader fr = new FileReader(filename);
+ BufferedReader br = new BufferedReader(fr);
+
+ String line = br.readLine();
+ String[] tmpWords;
+ List<String> words = new ArrayList<String>();
+
+ while (line != null) {
+
+ tmpWords = line.split(" ");
+
+ //Remove all punctuation and convert all words to lower case for processing
+ for (String tmpWord : tmpWords) {
+ words.add(tmpWord.replaceAll("[^a-zA-Z]", "").toLowerCase());
+ }
+
+ line = br.readLine();
+ }
+
+ br.close();
+
+ return words;
+ }
+
+ /**
+ * Counts the number of occurrences of a word in a given list of words
+ * Words will not be counted if they are a substring of another word in the list
+ *
+ * @param words A list containing single word strings
+ * @return A map with the word as the key, and the number of occurrences as the value
+ */
+ public Map<String,Integer> countWords(List<String> words) {
+
+ Map<String,Integer> wordCounts = new ConcurrentHashMap<String,Integer>();
+
+ for (String newWord : words) {
+
+ //If the exact word is already a key in the map, increment it's count
+ if (wordCounts.containsKey(newWord)) {
+ wordCounts.put(newWord, wordCounts.get(newWord) + 1);
+ } else {
+
+ boolean addWord = true;
+
+ for (String countedWord : wordCounts.keySet()) {
+
+ //If an existing word in the map contains this word, don't count it
+ if (countedWord.contains(newWord)) {
+ addWord = false;
+ break;
+ //If this word contains any of the existing words, remove them from the map
+ } else if (newWord.contains(countedWord)) {
+ wordCounts.remove(countedWord);
+ }
+ }
+
+ //Add the word to the map as long as it wasn't found in an existing word
+ if (addWord) {
+ wordCounts.put(newWord, 1);
+ }
+ }
+ }
+ return wordCounts;
+ }
+
+ /**
+ * Extract a sorted list of the words in the map based on their length
+ * @param wordCounts A map containing strings as the key. Value is not used
+ *
+ * @return A list of the keys sorted into descending order of length
+ */
+ public List<String> sortWords(Map<String, ?> wordCounts) {
+
+ LengthComparator lengthCompare = new LengthComparator();
+ List<String> sortedWords = new ArrayList<String>(wordCounts.keySet());
+
+ Collections.sort(sortedWords, lengthCompare);
+
+ return sortedWords;
+ }
+
+}
diff --git a/test.txt b/test.txt
new file mode 100644
index 0000000..d64914f
--- /dev/null
+++ b/test.txt
@@ -0,0 +1 @@
+A mate material may maybe right maybe