From 5c9c234cc572220e73b89d3169e27c639c5e0270 Mon Sep 17 00:00:00 2001 From: Joe Robinson Date: Sun, 21 Sep 2014 07:01:48 +0100 Subject: Various hacky fixes for FOLDOC --- bladictionary.py | 73 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/bladictionary.py b/bladictionary.py index 0adb568..6cdaa03 100755 --- a/bladictionary.py +++ b/bladictionary.py @@ -7,7 +7,7 @@ import optparse from lxml import etree import sqlite3 -VERSION = "2.2.1" +VERSION = "2.2.2" class Definition(object): #ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list @@ -302,7 +302,7 @@ def get_sql(word): return items -def parse_foldoc(word, refer = False): +def parse_foldoc(word, refer = None): file = open("dictionaries/foldoc.txt") word_line = "" @@ -313,15 +313,13 @@ def parse_foldoc(word, refer = False): multiple = False id = 0 skip = False + end = False + referring = False - #If it's been referred from another definition, include the new word - if refer: - definition = word + ". " - else: - definition = "" + definition = "" for line in file: - + #line = line.strip() word_parts = line.split(" ") word_part = "" @@ -344,38 +342,75 @@ def parse_foldoc(word, refer = False): if count_blank == 1: continue elif multiple: + if end: + break skip = True else: break + + subject_parts = line.split(">") + if len(subject_parts) > 1: + see_also = subject_parts[1].strip(" .") + else: + see_also = line #If the line is just one string enclosed in {}s, then it means "see also", so look up that word - - if len(line) > 0 and line[0] == "{" and line[-1] == "}": - item = parse_foldoc(line.strip("{} "), True)[0] - items.append(item) - return items + if len(line) > 0 and see_also[0] == "{" and see_also[-1] == "}": + refer_items = parse_foldoc(see_also.strip("{} "), word) + referring = True + items += refer_items + word_parts = "" + + #For handling words with multiple definitions if line[0:2] == "1.": multiple = True + + if referring: + definition = "" if multiple: id_parts = line.split(".") - if id_parts[0].isdigit() and definition is not None and definition != "": + + #This section is very hacky to deal with various edge cases. Also it was 5am when I was wrote this, and by the time I was done I'd forgotten how it even worked + #Maybe one day I'll fix it, but for now I don't want to look at it any more. Removing any one line may break some definitions in various ways. + if id_parts[0].isdigit() and len(id_parts) > 1 and id_parts[0] != 0 and definition.strip() is not None and definition.strip(". ") is not word and definition.strip(". ") != refer: definition = definition[3:] - item = Definition(word, id, "foldoc", "tech", definition, [], [], []) - items.append(item) + if refer is not None: + definition = word + ". " + definition + if id == 0: + id = 1 + if not referring: + if word != definition.strip(". "): + item = Definition(word, id, "foldoc", "tech", definition, [], [], []) + items.append(item) id = id_parts[0] definition = "" skip = False + end = False - if not skip: + referring = False + + elif referring: + end = False + elif skip: + end = True + + if not skip and not end: for part in word_parts: definition += part.strip().replace("{", "").replace("}", "") + " " if not found : return else: - item = Definition(word, id, "foldoc", "tech", definition, [], [], []) - items.append(item) + if id == 0: + id = 1 + if definition is not None and len(definition) > 1 and definition.strip(". ") != word and definition.strip(". ") != refer and not referring : + if definition[0].isdigit() and definition[1] == ".": + definition = definition[3:] + if refer is not None: + definition = word + ". " + definition + item = Definition(word, id, "foldoc", "tech", definition, [], [], []) + items.append(item) return items -- cgit v1.2.3