From 7a1c8cc2a638518d2936e7e919d6d2edb0710980 Mon Sep 17 00:00:00 2001 From: Joe Robinson Date: Wed, 24 Sep 2014 01:21:09 +0100 Subject: Adding foldoc to the SQL conversion script --- convert.py | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 130 insertions(+), 20 deletions(-) diff --git a/convert.py b/convert.py index 2be3d28..e5e35eb 100755 --- a/convert.py +++ b/convert.py @@ -16,8 +16,9 @@ class Definition(object): uses = [] synonyms = [] antonyms = [] + categories = [] - def __init__(self, word, id, dictionary, word_type, definition, uses, synonyms, antonyms): + def __init__(self, word, id, dictionary, word_type, definition, uses = [], synonyms = [], antonyms = [], categories= [], see_also = False): self.word = word self.id = id self.dictionary = dictionary @@ -26,6 +27,8 @@ class Definition(object): self.uses = uses self.synonyms = synonyms self.antonyms = antonyms + self.categories = categories + self.see_also = see_also def create(): try: @@ -126,12 +129,12 @@ def update_db(version): if version < 2: cur = con.cursor() - cur.execute("INSERT INTO types (type, abbreviation) VALUES('urban', 'urb.')") + cur.execute("INSERT INTO types (type, abbreviation) VALUES('technical', 'tech.')") cur.execute("ALTER TABLE definitions ADD COLUMN dictionary_id int") cur.execute("CREATE TABLE categories (id integer primary key not null, category text)") cur.execute("CREATE TABLE dictionaries (id integer primary key not null, name text, abbreviation text)") cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('WordNet', 'wn')") - cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('Urban Dictionary', 'urban')") + cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('FOLDOC', 'foldoc')") cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int category_id int)") cur.execute("UPDATE info set value = 2 where key = 'version'") @@ -139,25 +142,132 @@ def 
def _split_categories(text):
    """Split a leading ``<cat1, cat2>`` tag off *text*.

    Returns ``(categories, remainder)``. Only the first ``>`` closes the
    tag, so later ``>`` characters in the definition text are preserved.
    """
    tag, _, remainder = text.partition(">")
    categories = [name.strip("<> ") for name in tag.split(",")]
    return categories, remainder


def _is_single_reference(text):
    """Return True if *text* is exactly one ``{cross reference}``.

    Text that merely starts with ``{`` and ends with ``}`` but contains
    several references (``{a} and {b}``) is not a pure cross reference.
    """
    return (text.startswith("{") and text.endswith("}")
            and text.count("}") == 1)


def _is_date_line(text):
    """Return True for FOLDOC date stamps such as ``(1995-03-14)``."""
    return (text.startswith("(") and text.endswith(")")
            and text[1:2].isdigit())


def _strip_braces(text):
    """Remove the ``{}`` cross-reference markers, keeping the inner text."""
    return text.replace("{", "").replace("}", "")


def _foldoc_entries(lines):
    """Parse FOLDOC-formatted *lines* into plain tuples.

    Returns a list of ``(word, sense, definition, categories, see_also)``:
    ``sense`` is the integer sense number (1 for unnumbered entries),
    ``categories`` a list of strings and ``see_also`` True when the whole
    body of the sense is a single ``{cross reference}``.
    """
    entries = []
    word, sense, text, categories, see_also = "", 1, "", [], False

    for raw in lines:
        indented = raw[:1] in (" ", "\t")
        line = raw.strip()

        if not line:
            # A truly empty line is a paragraph break; an indented line
            # holding only whitespace carries no information.
            if not indented:
                text += "\n"
            continue

        if not indented:
            # Unindented text is a headword: emit the previous sense (if it
            # has any real text) and start a fresh entry.
            if text.strip("\n "):
                entries.append(
                    (word, sense, text.strip("\n "), categories, see_also))
            word, sense, text, categories, see_also = line, 1, "", [], False
            continue

        number, dot, rest = line.partition(".")
        if dot and number.isdigit() and len(number) <= 2:
            # "N. ..." opens a new numbered sense of the current headword.
            if text.strip("\n "):
                entries.append(
                    (word, sense, text.strip("\n "), categories, see_also))
            sense, text, categories, see_also = int(number), "", [], False

            rest = rest.strip()
            if rest.startswith("<"):
                categories, rest = _split_categories(rest)
            # Dropping surrounding dots lets "{foo}." be recognised as a
            # pure cross reference.
            rest = rest.strip(". ")
            if _is_single_reference(rest):
                see_also = True
            text += _strip_braces(rest.strip("<> "))
        elif line.startswith("{") and line.endswith("}"):
            if _is_single_reference(line) and not text.strip("\n "):
                # The reference is the entire body so far: a 'see also'.
                see_also = True
                text = line.strip("{} ")
            else:
                text += _strip_braces(line)
        elif line.startswith("<"):
            categories, rest = _split_categories(line)
            text += _strip_braces(rest)
        elif not _is_date_line(line):
            text += _strip_braces(line)

        # Wrapped source lines are joined with single spaces.
        if text and text[-1] != " ":
            text += " "

    # Emit the final sense, under the same rule as every other one.
    if text.strip("\n "):
        entries.append((word, sense, text.strip("\n "), categories, see_also))
    return entries


def parse_foldoc(path="dictionaries/foldoc.txt", factory=None):
    """Parse the FOLDOC text dump into definition objects.

    Headwords are unindented lines; the indented lines below them form the
    definition, optionally split into numbered senses (``1. ...``), tagged
    with ``<category, ...>`` and cross-referenced with ``{term}``.

    Args:
        path: Location of the FOLDOC text file.
        factory: Callable taking the same positional arguments as
            ``Definition(word, id, dictionary, word_type, definition, uses,
            synonyms, antonyms, categories, see_also)``. Defaults to
            ``Definition``.

    Returns:
        The list of created definition objects, in file order. Each one is
        also printed, as the conversion script has always done.

    Raises:
        IOError: If *path* cannot be opened.
    """
    if factory is None:
        factory = Definition

    # The context manager guarantees the dump is closed even on errors.
    with open(path) as handle:
        entries = _foldoc_entries(handle)

    items = [
        factory(word, sense, "foldoc", "tech", text, [], [], [],
                categories, see_also)
        for word, sense, text, categories, see_also in entries
    ]

    for item in items:
        print(item.word + "| " + str(item.id) + " : " + item.definition
              + str(item.see_also) + "<" + str(item.categories))

    return items