#!/usr/bin/python2
# -*- coding: utf-8 -*-
"""Look up a word in several dictionaries and print a one-line summary.

The word, and optionally a word type, a dictionary name and a page number,
are read from the command line (or from stdin when there are no arguments).
Definitions are gathered from a local WordNet SQLite database, a local OED
text file, a local FOLDOC text file, the aonaware DictService web service and
an Urban Dictionary scraper, then printed on a single line that is kept within
the IRC message length limit.
"""
from __future__ import print_function

import optparse
import shlex
import sqlite3
import sys
from contextlib import closing

try:
    from urllib import quote  # Python 2
except ImportError:
    from urllib.parse import quote  # Python 3

# lxml is only needed for the web-service lookup and requests only for the
# Urban Dictionary lookup, so a missing package disables that one source
# instead of preventing the whole script from starting.
try:
    from lxml import etree
except ImportError:
    etree = None
try:
    import requests
except ImportError:
    requests = None

VERSION = "2.3.3"

# XML namespace prefix of the elements read from the DictService response.
_AONAWARE_NS = "{http://services.aonaware.com/webservices/}"

# Long type names accepted on the command line -> short names stored on
# Definition objects (which is what main() compares against).
_TYPE_ALIASES = {"noun": "n", "verb": "v", "adjective": "adj", "adverb": "adv"}


class Definition(object):
    """One sense of a word as returned by any of the dictionary back ends.

    Attributes:
        word: the word being defined.
        id: sense number, relative to the word type (noun 1, noun 2, verb 1,
            verb 2, ...), not to the entire list.
        dictionary: short name of the source ("wn", "oed", "foldoc", "urban").
        word_type: short type name ("n", "v", "adj", "adv", "tech", "urban").
        definition: the definition text.
        uses: list of usage examples.
        synonyms: synonyms collected by the parser.
        antonyms: antonyms collected by the parser.
    """

    # Class-level defaults. __init__ rebinds every one of them on the
    # instance, so the list objects below are never shared between instances.
    id = 0
    word = ""
    dictionary = ""
    word_type = ""
    definition = ""
    uses = []
    synonyms = []
    antonyms = []

    def __init__(self, word, id, dictionary, word_type, definition, uses, synonyms, antonyms):
        self.word = word
        self.id = id
        self.dictionary = dictionary
        self.word_type = word_type
        self.definition = definition
        self.uses = uses
        self.synonyms = synonyms
        self.antonyms = antonyms

    def __repr__(self):
        return "Definition(%r, %r, %r, %r, %r)" % (
            self.word, self.id, self.dictionary, self.word_type, self.definition)


def _to_sense_id(text, default=1):
    """Return text as an int sense number, or default when it is not numeric."""
    if text.isdigit():
        return int(text)
    return default


def _clean_terms(words):
    """Strip the "{},] " decoration from each word and drop empty results."""
    cleaned = []
    for raw in words:
        term = raw.strip("{},] ")
        if term:
            cleaned.append(term)
    return cleaned


def get_xml(word, word_dict):
    """Fetch the DictService response for word and return its Definitions element.

    Args:
        word: the word to look up; it is URL-quoted before being sent.
        word_dict: the dictionary id passed to the service as dictId.

    Returns:
        The first "Definitions" element of the response, or None when lxml is
        not installed, the service cannot be reached (an error line is printed
        in both cases) or the response holds no such element.
    """
    if etree is None:
        print("Error: Could not access the Dictionary service (lxml is not installed).")
        return None
    api_url = ("http://services.aonaware.com/DictService/DictService.asmx/DefineInDict?dictId="
               + word_dict + "&word=" + quote(word))
    try:
        xml = etree.parse(api_url)
    except IOError:
        print("Error: Could not access the Dictionary service.")
        return None
    # The root element tag is WordDefinition, the same as the element that
    # contains the definition text. That is awkward, so hand back the
    # Definitions element instead.
    for element in xml.iter(_AONAWARE_NS + "Definitions"):
        return element
    return None


def parse_xml(xml):
    """Turn a DictService "Definitions" element into a list of Definition objects.

    Only the first WordDefinition element is read. Its text is split into
    lines: the first line is the headword, a line with a non-space character
    in its first 7 columns starts a new sense ("n 1: ...", "2: ...") and more
    deeply indented lines continue the previous sense. "[syn: ...]" and
    "[ant: ...]" groups are collected into the synonym and antonym lists.

    Args:
        xml: an element providing iter(tag) whose matches have a text attribute.

    Returns:
        A list of Definition objects; empty when there is no definition text
        or no sense line.
    """
    definition = None
    # Only worrying about one definition for now, add multiples later
    for element in xml.iter(_AONAWARE_NS + "WordDefinition"):
        definition = element.text
        break
    if not definition:
        return []

    definition_lines = definition.split("\n")
    items = []
    synonyms = []
    antonyms = []
    sense_id = 0
    word_type = ""
    syn_line = False
    ant_line = False
    word = definition_lines[0]

    for line in definition_lines[1:]:
        extra_parts = line.split("[")
        line_parts = extra_parts[0].split(":")

        # Work out if this line carries identifying information (word type or
        # sense number) by checking the indentation: such a line has a
        # non-space character within its first 7 columns.
        id_line = line_parts[0][0:7].strip(" ") != ""

        if id_line:
            if sense_id > 0:
                # Store the sense that has just ended before starting the next
                items.append(Definition(word, sense_id, "wn", word_type, definition,
                                        [], synonyms, antonyms))
                synonyms = []
                antonyms = []
                syn_line = False
                ant_line = False

            id_parts = line_parts[0].strip().split(" ")
            if id_parts[0] == "[also":
                # This line comes at the end, don't worry about it for now
                break

            if not id_parts[0].isdigit():
                word_type = id_parts[0]
                if len(id_parts) < 2:
                    sense_id = 1
                else:
                    sense_id = _to_sense_id(id_parts[1])
            else:
                sense_id = int(id_parts[0])
            definition = line_parts[1].strip() + " "
            # The loop variable must not be called "word": that would replace
            # the headword stored on every later Definition.
            for extra_text in line_parts[2:]:
                definition += extra_text + " "
        elif syn_line:
            for part in line_parts:
                synonyms.extend(_clean_terms(part.strip().split(" ")))
        elif ant_line:
            for part in line_parts:
                antonyms.extend(_clean_terms(part.strip().split(" ")))
        else:
            definition += line_parts[0].strip() + " "
            if len(line_parts) > 1:
                for part in line_parts[1:-1]:
                    definition += part + ":"
                definition += line_parts[-1]

        # Anything after a "[" may open a synonym or antonym group
        for part in extra_parts:
            extra_words = part.split(" ")
            if extra_words[0] == "syn:":
                synonyms.extend(_clean_terms(extra_words[1:]))
                syn_line = True
            elif extra_words[0] == "ant:":
                antonyms.extend(_clean_terms(extra_words[1:]))
                ant_line = True

    if sense_id > 0:
        items.append(Definition(word, sense_id, "wn", word_type, definition,
                                [], synonyms, antonyms))
    return items


def parse_args():
    """Parse the command line (or stdin) into the pieces main() needs.

    The first argument always belongs to the word. From the second argument
    on, the first recognised type name, dictionary name or one/two digit page
    number ends the word; everything before it is joined into the word.

    Returns:
        A tuple (word, word_type, word_dict, page_num, options). word_type is
        normalised to its short form ("noun" -> "n"), page_num is at least 1
        and options is the optparse result holding version and channel.

    Exits with status 1 when no word is given and --version was not requested.
    """
    args = sys.argv[1:]
    # If there are no args, try stdin
    if not args:
        args = shlex.split(sys.stdin.read())

    parser = optparse.OptionParser(
        usage="!define [type] [dictionary] | types: noun, verb, adjective, adverb | dictionaries: wordnet, oed, foldoc, urban"
    )
    parser.add_option("-v", "--version", action="store_true",
                      help="Print the version number")
    parser.add_option("-c", "--channel", action="store",
                      help="The IRC channel of the request")
    options, args = parser.parse_args(args)

    types = ["n", "noun", "v", "verb", "adj", "adjective", "adv", "adverb", "tech"]
    dicts = ["wn", "wordnet", "oed", "db", "foldoc", "urban"]
    word = ""
    word_type = ""
    word_dict = ""
    page_num = 1
    stop_id = 0

    # See if the dictionary, type or page have been specified
    for position, arg in enumerate(args[1:], 1):
        recognised = True
        if arg in types:
            word_type = arg
        elif arg in dicts:
            word_dict = arg
        elif len(arg) < 3 and arg.isdigit():
            # Pages are numbered from 1; page 0 would suppress all output
            page_num = max(1, int(arg))
        else:
            recognised = False
        if recognised and stop_id == 0:
            stop_id = position

    # If no type or dictionary has been set, use all the args
    if stop_id == 0:
        stop_id = len(args)

    # Read everything before type/dict as the desired word
    if args:
        word = " ".join(args[:stop_id]).strip()
    elif not options.version:
        print("You must specify a word to define")
        sys.exit(1)

    # Definitions carry the short type names, so a long name given on the
    # command line has to be mapped or it would never match anything.
    word_type = _TYPE_ALIASES.get(word_type, word_type)
    return word, word_type, word_dict, page_num, options


def parse_oed(word, path="dictionaries/oed.txt"):
    """Get definitions from the OED text file.

    The entry is the first line whose leading words equal word (ignoring
    case). The rest of that line is scanned word by word: a type marker
    ("n.", "v.", "adj.", "adv.", optionally prefixed with a dash) switches
    the current type and a bare number starts a new numbered sense.

    Args:
        word: the word to look up; may consist of several words.
        path: location of the OED text file.

    Returns:
        A list of Definition objects grouped by type in the order n, v, adj,
        adv, or None when the word has no entry.
    """
    # For some reason some definitions have — in front of the word type; both
    # spellings map to the simple type name.
    # Note, that's not a -, it's some other unicode dash.
    types = {'n.': 'n', '—n.': 'n', 'v.': 'v', '—v.': 'v',
             'adj.': 'adj', '—adj.': 'adj', 'adv.': 'adv', '—adv.': 'adv'}
    word_len = len(word.split(" "))

    # Find the definition based on the first word(s) on each line
    word_line = ""
    with open(path) as oed_file:
        for line in oed_file:
            # Read as many words as the requested word contains
            heading = " ".join(line.split(" ")[0:word_len]).strip()
            # Ignore case
            if heading.lower() == word.lower():
                word_line = line
                break
    if word_line == "":
        return None

    items = []
    sense_id = 0
    definition = ""
    word_type = ""
    for part in word_line.split(" ")[word_len:]:
        if part in types:
            # A new word type ends the current sense; don't add empty ones
            if sense_id > 0 and definition.strip():
                items.append(Definition(word, sense_id, "oed", word_type, definition, [], [], []))
                definition = ""
            # Convert the word type to the simple version
            word_type = types[part]
        elif part.isdigit():
            # A number starts a new sense. Nothing is stored when it is "1":
            # whatever came before the first sense is ignored.
            if part != "1":
                items.append(Definition(word, sense_id, "oed", word_type, definition, [], [], []))
            definition = ""
            sense_id = int(part)
        else:
            definition += part.strip() + " "

    # Words with one definition don't have an id
    if sense_id == 0:
        sense_id = 1
    # Add the last item
    items.append(Definition(word, sense_id, "oed", word_type, definition, [], [], []))

    # Sort the items by type (items of any other type are left out)
    sorted_items = []
    for simple_type in ["n", "v", "adj", "adv"]:
        for item in items:
            if item.word_type == simple_type:
                sorted_items.append(item)
    return sorted_items


def get_sql(word, db_path='dictionaries/wordnet.db'):
    """Get all matching WordNet definitions from the SQLite database.

    Args:
        word: the word to look up in the definitions table.
        db_path: location of the SQLite database file.

    Returns:
        A list of Definition objects ordered by type_id and sub_id, each with
        the quotes found for it in the uses table.
    """
    items = []
    types = ["n", "v", "adj", "adv"]
    # closing() is needed because a sqlite3 connection used as a context
    # manager only ends the transaction; it does not close the connection.
    with closing(sqlite3.connect(db_path)) as con:
        con.row_factory = sqlite3.Row
        with closing(con.cursor()) as cur:
            cur.execute("SELECT * from definitions where word = ? ORDER BY type_id,sub_id;", [word])
            rows = cur.fetchall()
            for row in rows:
                cur.execute("SELECT * from uses where definition_id = ?", [row['id']])
                uses = [use['quote'] for use in cur.fetchall()]
                # NOTE(review): synset_id is passed in the synonyms position,
                # exactly as before - confirm whether that is intended.
                items.append(Definition(row['word'], row['sub_id'], "wn",
                                        types[row['type_id'] - 1], row['definition'],
                                        uses, row['synset_id'], []))
    return items


def parse_foldoc(word, refer=None, path="dictionaries/foldoc.txt"):
    """Get definitions from the FOLDOC text file.

    The entry starts at the first line whose leading words equal word
    (ignoring case) and is read line by line after that. A line holding only
    a term in braces means "see also" and is followed recursively. Lines
    starting with "1.", "2.", ... split the entry into numbered senses.

    NOTE(review): the nesting of the branches below was reconstructed from a
    whitespace-collapsed copy of this function - confirm it against history.

    Args:
        word: the word to look up; may consist of several words.
        refer: the word whose entry referred to this one, or None.
        path: location of the FOLDOC text file.

    Returns:
        A list of Definition objects (type "tech"), or None when the word has
        no entry.
    """
    word_len = len(word.split(" "))
    found = False
    count_blank = 0
    items = []
    multiple = False
    sense_id = 0
    skip = False
    end = False
    referring = False
    definition = ""

    with open(path) as foldoc_file:
        for line in foldoc_file:
            word_parts = line.split(" ")
            if not found:
                # Read as many words as the requested word contains
                heading = " ".join(word_parts[0:word_len]).rstrip()
                # Ignore case
                if heading.lower() == word.lower():
                    found = True
                continue

            # Foldoc definitions are split over multiple lines, so keep
            # reading once we've found the entry
            line = line.strip()
            # The line with the word is followed by one blank line.
            # Skip the first blank line, and then stop when any further
            # blank lines are found
            if len(line) == 0:
                count_blank += 1
                if count_blank == 1:
                    continue
                elif multiple:
                    if end:
                        break
                    skip = True
                else:
                    break

            subject_parts = line.split(">")
            if len(subject_parts) > 1:
                see_also = subject_parts[1].strip(" .")
            else:
                see_also = line
            # If the line is just one string enclosed in {}s, then it means
            # "see also", so look up that word
            if see_also.startswith("{") and see_also.endswith("}"):
                refer_items = parse_foldoc(see_also.strip("{} "), word, path)
                referring = True
                # The referenced entry may not exist (None)
                if refer_items:
                    items += refer_items
                word_parts = ""

            # For handling words with multiple definitions
            if line[0:2] == "1.":
                multiple = True
                if referring:
                    definition = ""

            if multiple:
                id_parts = line.split(".")
                # The original author flagged this section as fragile, so the
                # state changes are kept exactly as they were. Three further
                # tests that used to be part of this condition are left out
                # because they could not fail: comparing a string with the
                # int 0, comparing the result of strip() with None, and an
                # identity ("is not") comparison between two distinct strings.
                if id_parts[0].isdigit() and len(id_parts) > 1 and definition.strip(". ") != refer:
                    definition = definition[3:]
                    if refer is not None:
                        definition = word + ". " + definition
                    if sense_id == 0:
                        sense_id = 1
                    if not referring:
                        stripped = definition.strip(". ")
                        # Nothing has been collected yet when the first
                        # numbered line is reached; don't store an empty sense
                        if stripped and word != stripped:
                            items.append(Definition(word, sense_id, "foldoc", "tech",
                                                    definition, [], [], []))
                    sense_id = int(id_parts[0])
                    definition = ""
                    skip = False
                    end = False
                    referring = False
                elif referring:
                    end = False
                elif skip:
                    end = True

            if not skip and not end:
                for part in word_parts:
                    definition += part.strip().replace("{", "").replace("}", "") + " "

    if not found:
        return None

    if sense_id == 0:
        sense_id = 1
    if (len(definition) > 1 and definition.strip(". ") != word
            and definition.strip(". ") != refer and not referring):
        # Drop the "N. " prefix of a numbered sense
        if definition[0].isdigit() and definition[1] == ".":
            definition = definition[3:]
        if refer is not None:
            definition = word + ". " + definition
        items.append(Definition(word, sense_id, "foldoc", "tech", definition, [], [], []))
    return items


def parse_urban(word):
    """Get definitions from the Urban Dictionary scraper service.

    This lookup is best effort: any failure yields None without printing
    anything, because an error message would end up in between the
    definitions printed for the other dictionaries.

    Args:
        word: the word to look up; it is URL-quoted before being sent.

    Returns:
        A list of Definition objects numbered from 1 (entries without a
        definition text are skipped), or None when requests is not installed,
        the request fails or the response is not a JSON list.
    """
    if requests is None:
        return None
    try:
        # The timeout keeps an unresponsive service from hanging the lookup
        response = requests.get("http://urbanscraper.herokuapp.com/search/" + quote(word),
                                timeout=10)
        entries = response.json()
    except Exception:
        return None
    if not isinstance(entries, list):
        return None

    items = []
    sense_id = 1
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        text = entry.get('definition')
        if text:
            example = entry.get('example')
            # main() concatenates uses[0] with a string, so only keep real text
            uses = [example] if example else []
            items.append(Definition(word, sense_id, "urban", "urban", text, uses, [], []))
            sense_id += 1
    return items


def _format_definition(item):
    """Build the text printed for one Definition: "type id: text; "example" "."""
    if item.id > 0:
        text = item.word_type + " " + str(item.id) + ": " + item.definition
    else:
        text = item.word_type + ": " + item.definition

    if text[-1] != "." and text[-1] != " " and len(item.uses) == 0:
        text += ". "
    elif text[-1] == ".":
        text += " "

    # Print usage examples if they exist
    if len(item.uses) > 0:
        text = text.rstrip(". ")
        text += "; \"" + item.uses[0] + "\" "
    return text


def main():
    """Look up the requested word and print its definitions on one line.

    With no type given, one definition of each type is shown (the page number
    selects which sense) unless the word only has a single type, in which case
    all of its senses are shown. With a type given, the senses of that type
    are shown, as many as fit on the requested page. Definitions that are not
    shown are summarised as "(N more)".

    Exits with status 1 when nothing is found.
    """
    word, word_type, word_dict, page_num, options = parse_args()

    if options.version:
        print(VERSION)
        sys.exit(0)

    if word_dict == "oed":
        items = parse_oed(word)
    elif word_dict == "wn":
        xml = get_xml(word, word_dict)
        if xml is None:
            print("Error finding definitions for " + word)
            sys.exit(1)
        items = parse_xml(xml)
    elif word_dict == "foldoc" or word_type == "tech":
        items = parse_foldoc(word)
    elif word_dict == "urban":
        items = parse_urban(word)
    else:
        # No (or any other) dictionary: combine the local database with
        # FOLDOC and Urban Dictionary
        items = get_sql(word)
        foldoc_items = parse_foldoc(word)
        if foldoc_items:
            items += foldoc_items
        urban_items = parse_urban(word)
        if urban_items:
            items += urban_items

    if not items:
        print("No definitions found for " + word)
        sys.exit(1)

    line_length = 0
    num_more = 0
    suppress_print = False
    shown = []

    # Length limit for IRC messages
    # RFC specifies message length is 510. This includes protocol stuff.
    # 37 is for the user/hostname of Bratchbot, then take off the length of
    # the channel name
    if options.channel:
        max_length = 510 - 37 - len(options.channel)
    else:
        max_length = 460

    types = ["n", "v", "adj", "adv", "tech", "urban"]
    type_id = 0
    all_types = word_type == ""

    # Normally we try and display 1 of each type if no type is specified
    # But if the word only has one type of definition, display all of them
    if all_types:
        word_types = []
        for item in items:
            if item.word_type not in word_types:
                word_types.append(item.word_type)
        if len(word_types) == 1:
            word_type = word_types[0]
            all_types = False

    for item in items:
        # If no type is specified, we display one of each
        if all_types:
            # Page N shows sense N of each type; unnumbered senses (id 0)
            # only appear on the first page
            if item.id < page_num:
                if item.id != 0:
                    continue
                elif page_num > 1:
                    continue

            # Definitions should be ordered by type, so loop through the
            # remaining types until we find a match
            found_type = False
            cur_id = type_id
            for cur_type in types[type_id:]:
                if item.word_type == cur_type:
                    word_type = cur_type
                    type_id = cur_id
                    found_type = True
                cur_id += 1
            # If there were no matches, stick with the current type for now
            if not found_type:
                if type_id < len(types):
                    word_type = types[type_id]

        if word_type == item.word_type:
            # Keep track of how many we haven't printed
            if suppress_print:
                num_more += 1
            else:
                definition = _format_definition(item)
                if not all_types and line_length + len(definition) < (max_length * (page_num - 1)):
                    # Belongs to an earlier page: count its length only
                    line_length += len(definition) + 1
                    continue
                # Once we've reached the maximum length, stop printing any more
                elif line_length + len(definition) + 1 > (max_length * page_num):
                    # If we haven't printed anything so far, it's just one
                    # really long definition, so truncate it
                    if line_length == 0:
                        shown.append(definition[:max_length - 3] + "...")
                    else:
                        num_more += 1
                    suppress_print = True
                else:
                    shown.append(definition)
                    line_length += len(definition) + 1
                # Once we've printed one word of any given type, move on to
                # the next type
                if all_types:
                    type_id += 1
        elif all_types:
            num_more += 1

    if (suppress_print or all_types) and num_more > 0:
        shown.append("(" + str(num_more) + " more)")

    # Everything goes on one line, separated by single spaces. The pieces are
    # printed one by one rather than joined, so byte and unicode strings from
    # different sources are never mixed into one string on Python 2.
    last = len(shown) - 1
    for index, piece in enumerate(shown):
        print(piece, end="\n" if index == last else " ")


if __name__ == "__main__":
    main()