From 8a5c561a575e9377b69587d03c4406fd0c15b2bd Mon Sep 17 00:00:00 2001
From: Joe Robinson <joe@mumsnet.com>
Date: Fri, 19 Sep 2014 18:08:50 +0100
Subject: Lots of fixes for OED definitions. Also allow multi word definitions

---
 bladictionary.py | 145 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 122 insertions(+), 23 deletions(-)

diff --git a/bladictionary.py b/bladictionary.py
index 36059bd..29b007f 100755
--- a/bladictionary.py
+++ b/bladictionary.py
@@ -7,7 +7,7 @@ import optparse
 from lxml import etree
 import sqlite3
 
-VERSION = "2.1.4b"
+VERSION = "2.1.5b"
 class Definition(object):
 
 	#ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list
@@ -142,6 +142,7 @@ def parse_xml(xml):
 def parse_args():
 	args = sys.argv[1:]
 
+	#If there are no args, try stdin
 	if not args:
 		args = shlex.split(sys.stdin.read())
 
@@ -157,25 +158,62 @@ def parse_args():
 	word_type = ""
 	word_dict = ""
 
-	if len(args) > 0:
-		word = args[0]
+	stop_id = 0
 
+	#See if the dictionary or type have been specified
 	if len(args) > 1:
+		cur_id = 1
 		for arg in args[1:]:
 			if arg in types:
 				word_type = arg
+				if stop_id == 0:
+					stop_id = cur_id
 			elif arg in dicts:
 				word_dict = arg
+				if stop_id == 0:
+					stop_id = cur_id
+			cur_id +=1
+
+	#If no type or dictionary has been set, use all the args
+	if stop_id == 0:
+		stop_id = len(args)
+
+	#Read everything before type/dict as the desired word
+	if len(args) > 0:
+		cur_id = 0
+		for arg in args[:stop_id]:
+			word += args[cur_id] + " "
+			cur_id += 1
+
+		word = word.strip()
+	else:
+		print "You must specify a word to define"
+		sys.exit(1)
 
 	return word, word_type, word_dict, options
 
+#Get definitions from the OED text file
 def parse_oed(word):
-	types = ["n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv."]
+
+	#Some definitions have — in front of the word type; the dict below maps both forms to the plain type name.
+	#Note: that's not a hyphen-minus (-); it appears to be an em dash (U+2014).
+	#types = ["n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv."]
+	types = {'n.' : 'n', '—n.' : 'n', 'v.' : 'v', '—v.' : 'v', 'adj.' : 'adj', '—adj.' : 'adj', 'adv.' : 'adv', '—adv.' : 'adv'}
+
 	file = open("dictionaries/oed.txt")
 	word_line = ""
-
+	word_len = len(word.split(" "))
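+	#Find the definition by comparing the leading word(s) of each line with the requested word
+	#TODO: Fix for multi word definitions (NOTE(review): the loop below already reads word_len words per line - confirm what is still missing)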
 	for line in file:
-		word_part = line.split(" ")[0]
+		word_parts = line.split(" ")
+		word_part = ""
+
+		#Read the appropriate number of words depending on how many were specified
+		for part in word_parts[0:word_len]:
+			word_part += part + " "
+		word_part = word_part.strip()
+		#Ignore case
 
 		if word_part.lower() == word.lower():
 			word_line = line
@@ -188,34 +226,56 @@ def parse_oed(word):
 	items = []
 	id = 0
 	definition = ""
-
-	for part in line.split(" ")[1:]:
+	word_type = ""
+	print line
+	for part in line.split(" ")[word_len:]:
 		
-
-		if part in types:
-			if id > 0:
+		#If we've reached a word type, add the current data to the list
+		if part in types.keys():
+			#Don't add empty definitions (NOTE(review): "is not" tests identity, not equality - "!=" is probably what is meant)
+			if id > 0 and definition is not " " and definition is not "" and definition is not None:
 				item = Definition(word, id, "oed", word_type, definition, [], [], [])
 				items.append(item)
 				definition = ""
 
-			word_type = part
+			#Convert the word type to the simple version
+			word_type = types[part]
 
-		elif part.isdigit() and prev_part is not None and (prev_part[-1] == "." or prev_part[-1] == ")"):
-			item = Definition(word, id, "oed", word_type, definition, [], [], [])
-			items.append(item)
+		elif part.isdigit() and prev_part is not None:# and (prev_part[-1] == "." or prev_part[-1] == ")"):
+			#Ignore anything before the first definition
+			if part is not "1":
+				item = Definition(word, id, "oed", word_type, definition, [], [], [])
+				items.append(item)
 			definition = ""
 			id = part
 		else:
-			definition += part + " "
+			definition += part.strip() + " "
 			
 		prev_part = part
 
+	#Words with one definition don't have an id
 	if id == 0:
 		id = 1
-	item = Definition(word, id, "oed", word_type, definition, [], [])
+
+	#Add the last item
+	item = Definition(word, id, "oed", word_type, definition, [], [], [])
 	items.append(item)
+
+	#Group the items by type, in the order n, v, adj, adv. Items with any other type are dropped
+	tmp_items = []
+	simple_types = ["n", "v", "adj", "adv"]
+	for simple_type in simple_types:
+		print simple_type
+		
+		for tmp_item in items:
+			print tmp_item.word_type
+			if tmp_item.word_type == simple_type:
+				tmp_items.append(tmp_item)
+
+	items = tmp_items
 	return items
 
+#Gets all matching WordNet definitions from the database
 def get_sql(word):
 
 	items = []
@@ -266,38 +326,77 @@ def main():
 			items = get_sql(word)
 
 	if items is None or len(items) == 0:
-		print "No definition found for "+word
-		return
+		print "No definitions found for "+word
+		sys.exit( 1 )
 
 	line_length = 0
 	num_more = 0
 	suppress_print = False
 
+	#Length limit for IRC messages
+	#The IRC RFC caps a message at 512 characters including the trailing CR-LF, leaving 510. This includes protocol stuff.
+	#37 is for the user/hostname of Bratchbot, then take off the length of the channel name
 	if options.channel:
 		max_length = 510 - 37 - len(options.channel)
 	else:
 		max_length = 460
 
+	types = ["n", "v", "adj", "adv"]
+	type_id = 0
 
+	all_types = word_type is ""
 
 	for item in items:
 
-		if word_type is "" or word_type == item.word_type:
+		#If no type is specified, we display one of each
+		if all_types:
+			found_type = False
+			cur_id = type_id
+
+			#Definitions should be ordered by type, so loop through all the types until we find a match
+			for cur_type in types[type_id:]:
+				if item.word_type == cur_type:
+					word_type = cur_type
+					type_id = cur_id
+					found_type = True
+
+				cur_id += 1
+
+			#If there were no matches, stick with the current type for now
+			if not found_type:
+				word_type = types[type_id]
+
+
+		if word_type == item.word_type:
+			#Keep track of how many we haven't printed
 			if suppress_print:
 				num_more+=1
 			else:
 				definition = item.word_type + " " + str(item.id) + ": " + item.definition
+				if definition[-1] is not "." and definition[-1] is not " ":
+					definition += "."
+				
+				#Print usage examples if they exist
 				if len(item.uses) > 0:
 					definition += "; \""+item.uses[0]['quote']+"\""
-				if line_length + len(definition) > max_length:
+
+				#Once we've reached the maximum length, stop printing any more
+				if line_length + len(definition) +1 > max_length:
 					suppress_print = True
 					num_more+= 1
 				else:
 					print definition,
 
-				line_length += len(definition)
+				line_length += len(definition) + 1
+
+			#Once we've handled one definition of a given type, move on to the next type
+			if all_types:
+				type_id += 1
+
+		elif all_types and item.word_type is not word_type:
+			num_more+=1
 	
-	if suppress_print:
+	if suppress_print or all_types:
 		print "(" + str(num_more) + " more)"
 
 if __name__ == "__main__":
-- 
cgit v1.2.3