From 8a5c561a575e9377b69587d03c4406fd0c15b2bd Mon Sep 17 00:00:00 2001 From: Joe Robinson Date: Fri, 19 Sep 2014 18:08:50 +0100 Subject: Lots of fixes for OED definitions. Also allow multi word definitions --- bladictionary.py | 145 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 122 insertions(+), 23 deletions(-) diff --git a/bladictionary.py b/bladictionary.py index 36059bd..29b007f 100755 --- a/bladictionary.py +++ b/bladictionary.py @@ -7,7 +7,7 @@ import optparse from lxml import etree import sqlite3 -VERSION = "2.1.4b" +VERSION = "2.1.5b" class Definition(object): #ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list @@ -142,6 +142,7 @@ def parse_xml(xml): def parse_args(): args = sys.argv[1:] + #If there are no args, try stdin if not args: args = shlex.split(sys.stdin.read()) @@ -157,25 +158,62 @@ def parse_args(): word_type = "" word_dict = "" - if len(args) > 0: - word = args[0] + stop_id = 0 + #See if the dictionary or type have been specified if len(args) > 1: + cur_id = 1 for arg in args[1:]: if arg in types: word_type = arg + if stop_id == 0: + stop_id = cur_id elif arg in dicts: word_dict = arg + if stop_id == 0: + stop_id = cur_id + cur_id +=1 + + #If no type or dictionary has been set, use all the args + if stop_id == 0: + stop_id = len(args) + + #Read everything before type/dict as the desired word + if len(args) > 0: + cur_id = 0 + for arg in args[:stop_id]: + word += args[cur_id] + " " + cur_id += 1 + + word = word.strip() + else: + print "You must specify a word to define" + sys.exit(1) return word, word_type, word_dict, options +#Get definitions from the OED text file def parse_oed(word): - types = ["n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv."] + + #For some reason some definitions have — in front of the word type. We'll convert it to the standard format later + #Note, that's not a -, it's some other unicode dash. + #types = ["n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv."] + types = {'n.' : 'n', '—n.' : 'n', 'v.' : 'v', '—v.' : 'v', 'adj.' : 'adj', '—adj.' : 'adj', 'adv.' : 'adv', '—adv.' : 'adv'} + file = open("dictionaries/oed.txt") word_line = "" - + word_len = len(word.split(" ")) + #Find the definition based on the first word on each line + #TODO: Fix for multi word definitions for line in file: - word_part = line.split(" ")[0] + word_parts = line.split(" ") + word_part = "" + + #Read the appropriate number of words depending on how many were specified + for part in word_parts[0:word_len]: + word_part += part + " " + word_part = word_part.strip() + #Ignore case if word_part.lower() == word.lower(): word_line = line @@ -188,34 +226,56 @@ def parse_oed(word): items = [] id = 0 definition = "" - - for part in line.split(" ")[1:]: + word_type = "" + print line + for part in line.split(" ")[word_len:]: - - if part in types: - if id > 0: + #If we've reached a word type, add the current data to the list + if part in types.keys(): + #Don't add empty definitions + if id > 0 and definition is not " " and definition is not "" and definition is not None: item = Definition(word, id, "oed", word_type, definition, [], [], []) items.append(item) definition = "" - word_type = part + #Convert the word type to the simple version + word_type = types[part] - elif part.isdigit() and prev_part is not None and (prev_part[-1] == "." or prev_part[-1] == ")"): - item = Definition(word, id, "oed", word_type, definition, [], [], []) - items.append(item) + elif part.isdigit() and prev_part is not None:# and (prev_part[-1] == "." or prev_part[-1] == ")"): + #Ignore anything before the first definition + if part is not "1": + item = Definition(word, id, "oed", word_type, definition, [], [], []) + items.append(item) definition = "" id = part else: - definition += part + " " + definition += part.strip() + " " prev_part = part + #Words with one definition don't have an id if id == 0: id = 1 - item = Definition(word, id, "oed", word_type, definition, [], []) + + #Add the last item + item = Definition(word, id, "oed", word_type, definition, [], [], []) items.append(item) + + #Sort the items by type + tmp_items = [] + simple_types = ["n", "v", "adj", "adv"] + for simple_type in simple_types: + print simple_type + + for tmp_item in items: + print tmp_item.word_type + if tmp_item.word_type == simple_type: + tmp_items.append(tmp_item) + + items = tmp_items return items +#Gets all matching WordNet definitions from the database def get_sql(word): items = [] @@ -266,38 +326,77 @@ def main(): items = get_sql(word) if items is None or len(items) == 0: - print "No definition found for "+word - return + print "No definitions found for "+word + sys.exit( 1 ) line_length = 0 num_more = 0 suppress_print = False + #Length limit for IRC messages + #RFC specifies message length is 510. This includes protocol stuff. + #37 is for the user/hostname of Bratchbot, then take off the length of the channel name if options.channel: max_length = 510 - 37 - len(options.channel) else: max_length = 460 + types = ["n", "v", "adj", "adv"] + type_id = 0 + all_types = word_type is "" for item in items: - if word_type is "" or word_type == item.word_type: + #If no type is specified, we display one of each + if all_types: + found_type = False + cur_id = type_id + + #Definitions should be ordered by type, so loop through all the types until we find a match + for cur_type in types[type_id:]: + if item.word_type == cur_type: + word_type = cur_type + type_id = cur_id + found_type = True + + cur_id += 1 + + #If there were no matches, stick with the current type for now + if not found_type: + word_type = types[type_id] + + + if word_type == item.word_type: + #Keep track of how many we haven't printed if suppress_print: num_more+=1 else: definition = item.word_type + " " + str(item.id) + ": " + item.definition + if definition[-1] is not "." and definition[-1] is not " ": + definition += "." + + #Print usage examples if they exist if len(item.uses) > 0: definition += "; \""+item.uses[0]['quote']+"\"" - if line_length + len(definition) > max_length: + + #Once we've reached the maximum length, stop printing any more + if line_length + len(definition) +1 > max_length: suppress_print = True num_more+= 1 else: print definition, - line_length += len(definition) + line_length += len(definition) + 1 + + #Once we've printed one word of any given type, move on to the next type + if all_types: + type_id += 1 + + elif all_types and item.word_type is not word_type: + num_more+=1 - if suppress_print: + if suppress_print or all_types: print "(" + str(num_more) + " more)" if __name__ == "__main__": -- cgit v1.2.3