summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Joe Robinson <joe@mumsnet.com> 2014-09-19 18:08:50 +0100
committer: Joe Robinson <joe@mumsnet.com> 2014-09-19 18:08:50 +0100
commit: 8a5c561a575e9377b69587d03c4406fd0c15b2bd (patch)
tree: f87d7f0ffb1f1ecbf73f613f3f1be1ea15b5abdc
parent: cdd4ec7a447c0212ea1b26fc337f84099d07d7cb (diff)
Lots of fixes for OED definitions. Also allow multi word definitions
-rwxr-xr-x bladictionary.py 145
1 files changed, 122 insertions, 23 deletions
diff --git a/bladictionary.py b/bladictionary.py
index 36059bd..29b007f 100755
--- a/bladictionary.py
+++ b/bladictionary.py
@@ -7,7 +7,7 @@ import optparse
from lxml import etree
import sqlite3
-VERSION = "2.1.4b"
+VERSION = "2.1.5b"
class Definition(object):
#ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list
@@ -142,6 +142,7 @@ def parse_xml(xml):
def parse_args():
args = sys.argv[1:]
+ #If there are no args, try stdin
if not args:
args = shlex.split(sys.stdin.read())
@@ -157,25 +158,62 @@ def parse_args():
word_type = ""
word_dict = ""
- if len(args) > 0:
- word = args[0]
+ stop_id = 0
+ #See if the dictionary or type have been specified
if len(args) > 1:
+ cur_id = 1
for arg in args[1:]:
if arg in types:
word_type = arg
+ if stop_id == 0:
+ stop_id = cur_id
elif arg in dicts:
word_dict = arg
+ if stop_id == 0:
+ stop_id = cur_id
+ cur_id +=1
+
+ #If no type or dictionary has been set, use all the args
+ if stop_id == 0:
+ stop_id = len(args)
+
+ #Read everything before type/dict as the desired word
+ if len(args) > 0:
+ cur_id = 0
+ for arg in args[:stop_id]:
+ word += args[cur_id] + " "
+ cur_id += 1
+
+ word = word.strip()
+ else:
+ print "You must specify a word to define"
+ sys.exit(1)
return word, word_type, word_dict, options
+#Get definitions from the OED text file
def parse_oed(word):
- types = ["n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv."]
+
+ #For some reason some definitions have — in front of the word type. We'll convert it to the standard format later
+ #Note, that's not a -, it's some other unicode dash.
+ #types = ["n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv."]
+ types = {'n.' : 'n', '—n.' : 'n', 'v.' : 'v', '—v.' : 'v', 'adj.' : 'adj', '—adj.' : 'adj', 'adv.' : 'adv', '—adv.' : 'adv'}
+
file = open("dictionaries/oed.txt")
word_line = ""
-
+ word_len = len(word.split(" "))
+ #Find the definition based on the first word on each line
+ #TODO: Fix for multi word definitions
for line in file:
- word_part = line.split(" ")[0]
+ word_parts = line.split(" ")
+ word_part = ""
+
+ #Read the appropriate number of words depending on how many were specified
+ for part in word_parts[0:word_len]:
+ word_part += part + " "
+ word_part = word_part.strip()
+ #Ignore case
if word_part.lower() == word.lower():
word_line = line
@@ -188,34 +226,56 @@ def parse_oed(word):
items = []
id = 0
definition = ""
-
- for part in line.split(" ")[1:]:
+ word_type = ""
+ print line
+ for part in line.split(" ")[word_len:]:
-
- if part in types:
- if id > 0:
+ #If we've reached a word type, add the current data to the list
+ if part in types.keys():
+ #Don't add empty definitions
+ if id > 0 and definition is not " " and definition is not "" and definition is not None:
item = Definition(word, id, "oed", word_type, definition, [], [], [])
items.append(item)
definition = ""
- word_type = part
+ #Convert the word type to the simple version
+ word_type = types[part]
- elif part.isdigit() and prev_part is not None and (prev_part[-1] == "." or prev_part[-1] == ")"):
- item = Definition(word, id, "oed", word_type, definition, [], [], [])
- items.append(item)
+ elif part.isdigit() and prev_part is not None:# and (prev_part[-1] == "." or prev_part[-1] == ")"):
+ #Ignore anything before the first definition
+ if part is not "1":
+ item = Definition(word, id, "oed", word_type, definition, [], [], [])
+ items.append(item)
definition = ""
id = part
else:
- definition += part + " "
+ definition += part.strip() + " "
prev_part = part
+ #Words with one definition don't have an id
if id == 0:
id = 1
- item = Definition(word, id, "oed", word_type, definition, [], [])
+
+ #Add the last item
+ item = Definition(word, id, "oed", word_type, definition, [], [], [])
items.append(item)
+
+ #Sort the items by type
+ tmp_items = []
+ simple_types = ["n", "v", "adj", "adv"]
+ for simple_type in simple_types:
+ print simple_type
+
+ for tmp_item in items:
+ print tmp_item.word_type
+ if tmp_item.word_type == simple_type:
+ tmp_items.append(tmp_item)
+
+ items = tmp_items
return items
+#Gets all matching WordNet definitions from the database
def get_sql(word):
items = []
@@ -266,38 +326,77 @@ def main():
items = get_sql(word)
if items is None or len(items) == 0:
- print "No definition found for "+word
- return
+ print "No definitions found for "+word
+ sys.exit( 1 )
line_length = 0
num_more = 0
suppress_print = False
+ #Length limit for IRC messages
+ #RFC specifies message length is 510. This includes protocol stuff.
+ #37 is for the user/hostname of Bratchbot, then take off the length of the channel name
if options.channel:
max_length = 510 - 37 - len(options.channel)
else:
max_length = 460
+ types = ["n", "v", "adj", "adv"]
+ type_id = 0
+ all_types = word_type is ""
for item in items:
- if word_type is "" or word_type == item.word_type:
+ #If no type is specified, we display one of each
+ if all_types:
+ found_type = False
+ cur_id = type_id
+
+ #Definitions should be ordered by type, so loop through all the types until we find a match
+ for cur_type in types[type_id:]:
+ if item.word_type == cur_type:
+ word_type = cur_type
+ type_id = cur_id
+ found_type = True
+
+ cur_id += 1
+
+ #If there were no matches, stick with the current type for now
+ if not found_type:
+ word_type = types[type_id]
+
+
+ if word_type == item.word_type:
+ #Keep track of how many we haven't printed
if suppress_print:
num_more+=1
else:
definition = item.word_type + " " + str(item.id) + ": " + item.definition
+ if definition[-1] is not "." and definition[-1] is not " ":
+ definition += "."
+
+ #Print usage examples if they exist
if len(item.uses) > 0:
definition += "; \""+item.uses[0]['quote']+"\""
- if line_length + len(definition) > max_length:
+
+ #Once we've reached the maximum length, stop printing any more
+ if line_length + len(definition) +1 > max_length:
suppress_print = True
num_more+= 1
else:
print definition,
- line_length += len(definition)
+ line_length += len(definition) + 1
+
+ #Once we've printed one word of any given type, move on to the next type
+ if all_types:
+ type_id += 1
+
+ elif all_types and item.word_type is not word_type:
+ num_more+=1
- if suppress_print:
+ if suppress_print or all_types:
print "(" + str(num_more) + " more)"
if __name__ == "__main__":