diff options
author | Joe Robinson <joe@lc8n.com> | 2014-10-04 17:58:43 +0100 |
---|---|---|
committer | Joe Robinson <joe@lc8n.com> | 2014-10-04 17:58:43 +0100 |
commit | 7fa9083eb83e96eb53f7b8ca39634bc34d8f38c0 (patch) | |
tree | 4c048446fe1f26b5565a445539998ab92dab7fbb | |
parent | 05b3990f3d81ea66cbca6e4e6e2bfb8294318931 (diff) |
Fixes, cleanup, move definition class to own file
-rwxr-xr-x | bladictionary.py | 151 |
-rwxr-xr-x | convert.py | 26 |
-rw-r--r-- | definition.py | 24 |
3 files changed, 37 insertions, 164 deletions
diff --git a/bladictionary.py b/bladictionary.py index 58ddfeb..932d54e 100755 --- a/bladictionary.py +++ b/bladictionary.py @@ -7,29 +7,9 @@ import optparse from lxml import etree import sqlite3 import requests +from definition import Definition -VERSION = "2.4.0" -class Definition(object): - - #ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list - id = 0 - word = "" - dictionary = "" - word_type = "" - definition = "" - uses = [] - synonyms = [] - antonyms = [] - - def __init__(self, word, id, dictionary, word_type, definition, uses, synonyms, antonyms): - self.word = word - self.id = id - self.dictionary = dictionary - self.word_type = word_type - self.definition = definition - self.uses = uses - self.synonyms = synonyms - self.antonyms = antonyms +VERSION = "2.4.1" def get_xml(word, word_dict): api_url = "http://services.aonaware.com/DictService/DictService.asmx/DefineInDict?dictId="+word_dict+"&word="+word @@ -238,7 +218,7 @@ def parse_oed(word): #If we've reached a word type, add the current data to the list if part in types.keys(): #Don't add empty definitions - if id > 0 and definition is not " " and definition is not "" and definition is not None: + if id > 0 and definition is not " " and definition: item = Definition(word, id, "oed", word_type, definition, [], [], []) items.append(item) definition = "" @@ -246,7 +226,7 @@ def parse_oed(word): #Convert the word type to the simple version word_type = types[part] - elif part.isdigit() and prev_part is not None:# and (prev_part[-1] == "." 
or prev_part[-1] == ")"): + elif part.isdigit() and prev_part is not None: #Ignore anything before the first definition if part is not "1": item = Definition(word, id, "oed", word_type, definition, [], [], []) @@ -335,117 +315,8 @@ def get_sql(word, dictionary = "wn"): return items -def parse_foldoc(word, refer = None): - - file = open("dictionaries/foldoc.txt") - word_line = "" - word_len = len(word.split(" ")) - found = False - count_blank = 0 - items = [] - multiple = False - id = 0 - skip = False - end = False - referring = False - - definition = "" - - for line in file: - - #line = line.strip() - word_parts = line.split(" ") - word_part = "" - if not found: - #Read the appropriate number of words depending on how many were specified - for part in word_parts[0:word_len]: - word_part += part + " " - word_part = word_part.rstrip() - #Ignore case - if word_part.lower() == word.lower(): - found = True - - #Foldoc definitions are split over multiple lines, so keep reading once we've found it - else: - line = line.strip() - #Line with the specified word is followed by one blank line - #Skip the first blank line, and then stop when any further blank lines are found - if len(line) == 0: - count_blank += 1 - if count_blank == 1: - continue - elif multiple: - if end: - break - skip = True - else: - break - - subject_parts = line.split(">") - if len(subject_parts) > 1: - see_also = subject_parts[1].strip(" .") - else: - see_also = line - - #If the line is just one string enclosed in {}s, then it means "see also", so look up that word - if len(line) > 0 and see_also[0] == "{" and see_also[-1] == "}": - refer_items = parse_foldoc(see_also.strip("{} "), word) - referring = True - items += refer_items - word_parts = "" - - #For handling words with multiple definitions - if line[0:2] == "1.": - multiple = True - if referring: - definition = "" - if multiple: - id_parts = line.split(".") - - #This section is very hacky to deal with various edge cases. 
Also it was 5am when I was wrote this, and by the time I was done I'd forgotten how it even worked - #Maybe one day I'll fix it, but for now I don't want to look at it any more. Removing any one line may break some definitions in various ways. - if id_parts[0].isdigit() and len(id_parts) > 1 and id_parts[0] != 0 and definition.strip() is not None and definition.strip(". ") is not word and definition.strip(". ") != refer: - definition = definition[3:] - if refer is not None: - definition = word + ". " + definition - if id == 0: - id = 1 - if not referring: - if word != definition.strip(". "): - item = Definition(word, id, "foldoc", "tech", definition, [], [], []) - items.append(item) - id = id_parts[0] - definition = "" - skip = False - end = False - - referring = False - - elif referring: - end = False - elif skip: - end = True - - if not skip and not end: - for part in word_parts: - definition += part.strip().replace("{", "").replace("}", "") + " " - - if not found : - return - else: - if id == 0: - id = 1 - if definition is not None and len(definition) > 1 and definition.strip(". ") != word and definition.strip(". ") != refer and not referring : - if definition[0].isdigit() and definition[1] == ".": - definition = definition[3:] - if refer is not None: - definition = word + ". 
" + definition - item = Definition(word, id, "foldoc", "tech", definition, [], [], []) - items.append(item) - return items - def parse_urban(word): word = word.replace(" ", "+") @@ -460,7 +331,7 @@ def parse_urban(word): items = [] id = 1 for json_item in json: - if json_item['definition'] != "" and json_item['definition'] is not None: + if json_item['definition']: item = Definition(word, id, "urban", "urban", json_item['definition'], [json_item['example']], [], []) items.append(item) id += 1 @@ -486,7 +357,7 @@ def main(): items = parse_xml(xml) elif word_dict == "foldoc" or word_type == "tech": - items = get_sql(word, word_dict) + items = get_sql(word, "foldoc") elif word_dict == "urban": items = parse_urban(word) else: @@ -564,15 +435,17 @@ def main(): if suppress_print: num_more+=1 else: + # Ignore anything after a line break as this breaks output + item_definition = item.definition.split("\n")[0] if item.id > 0: - definition = item.word_type + " " + str(item.id) + ": " + item.definition + definition = item.word_type + " " + str(item.id) + ": " + item_definition else: - definition = item.word_type + ": " + item.definition - if definition[-1] is not "." and definition[-1] is not " " and len(item.uses) == 0 : + definition = item.word_type + ": " + item_definition + definition = definition.strip() + if definition[-1] != "." and definition[-1] != " " and len(item.uses) == 0 : definition += ". " elif definition[-1] == "." : definition += " " - #Print usage examples if they exist if len(item.uses) > 0: definition = definition.rstrip(". 
") @@ -4,31 +4,7 @@ import MySQLdb as mysql import sqlite3 import sys - -class Definition(object): - - #ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list - id = 0 - word = "" - dictionary = "" - word_type = "" - definition = "" - uses = [] - synonyms = [] - antonyms = [] - categories = [] - - def __init__(self, word, id, dictionary, word_type, definition, uses = [], synonyms = [], antonyms = [], categories= [], see_also = False): - self.word = word - self.id = id - self.dictionary = dictionary - self.word_type = word_type - self.definition = definition - self.uses = uses - self.synonyms = synonyms - self.antonyms = antonyms - self.categories = categories - self.see_also = see_also +from definition import Definition def create(): try: diff --git a/definition.py b/definition.py new file mode 100644 index 0000000..877888d --- /dev/null +++ b/definition.py @@ -0,0 +1,24 @@ +class Definition(object): + + #ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list + id = 0 + word = "" + dictionary = "" + word_type = "" + definition = "" + uses = [] + synonyms = [] + antonyms = [] + categories = [] + + def __init__(self, word, id, dictionary, word_type, definition, uses = [], synonyms = [], antonyms = [], categories= [], see_also = False): + self.word = word + self.id = id + self.dictionary = dictionary + self.word_type = word_type + self.definition = definition + self.uses = uses + self.synonyms = synonyms + self.antonyms = antonyms + self.categories = categories + self.see_also = see_also
\ No newline at end of file