From 05b3990f3d81ea66cbca6e4e6e2bfb8294318931 Mon Sep 17 00:00:00 2001 From: Joe Robinson Date: Wed, 1 Oct 2014 17:57:40 +0100 Subject: Moved foldoc definitions to SQL database --- bladictionary.py | 37 +++++++++++++++++++++++++++++-------- convert.py | 54 +++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 70 insertions(+), 21 deletions(-) diff --git a/bladictionary.py b/bladictionary.py index 6d16240..58ddfeb 100755 --- a/bladictionary.py +++ b/bladictionary.py @@ -8,7 +8,7 @@ from lxml import etree import sqlite3 import requests -VERSION = "2.3.4" +VERSION = "2.4.0" class Definition(object): #ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list @@ -278,10 +278,18 @@ def parse_oed(word): return items #Gets all matching WordNet definitions from the database -def get_sql(word): +def get_sql(word, dictionary = "wn"): items = [] - types = ["n", "v", "adj", "adv"] + types = ["n", "v", "adj", "adv", "tech"] + + if dictionary == "wordnet" or dictionary == "wn": + dict_id = 1 + elif dictionary == "foldoc" or dictionary == "tech": + dict_id = 2 + else: + print "Invalid dictionary" + sys.exit(1) con = sqlite3.connect('dictionaries/wordnet.db'); con.row_factory = sqlite3.Row @@ -289,10 +297,15 @@ def get_sql(word): with con: cur = con.cursor() - cur.execute("SELECT * from definitions where word = ? ORDER BY type_id,sub_id;", [word]) + cur.execute("SELECT * from definitions where word = ? and dictionary_id = ? ORDER BY type_id,sub_id;", [word, dict_id]) rows = cur.fetchall() + # If no results were found, try a case insensitive search + if rows is None or len(rows) == 0: + cur.execute("SELECT * from definitions where word = ? COLLATE NOCASE and dictionary_id = ? ORDER BY type_id,sub_id;", [word, dict_id]) + rows = cur.fetchall() + for row in rows: id = row['id'] @@ -301,6 +314,7 @@ def get_sql(word): type = types[row['type_id']-1] definition = row['definition'] synset_id = row['synset_id'] + dictionary_id = row['dictionary_id'] cur.execute("SELECT * from uses where definition_id = ?", [id]) use_rows = cur.fetchall() @@ -309,7 +323,14 @@ def get_sql(word): for use in use_rows: uses.append(use['quote']) - item = Definition(word, sub_id, "wn", type,definition, uses, synset_id, []) + cur.execute("SELECT name from dictionaries where id = ?", [dictionary_id]) + dict_row = cur.fetchone() + if dict_row is not None: + dictionary = dict_row[0] + else: + dictionary = "wn" + + item = Definition(word, sub_id, dictionary, type,definition, uses, synset_id, []) items.append(item) return items @@ -465,15 +486,15 @@ def main(): items = parse_xml(xml) elif word_dict == "foldoc" or word_type == "tech": - items = parse_foldoc(word) + items = get_sql(word, word_dict) elif word_dict == "urban": items = parse_urban(word) else: if word_dict is None or word_dict == "": word_dict = "wn" - items = get_sql(word) + items = get_sql(word, "wn") - foldoc_items = parse_foldoc(word) + foldoc_items = get_sql(word, "foldoc") if foldoc_items is not None and len(foldoc_items) > 0: items += foldoc_items diff --git a/convert.py b/convert.py index 005507f..90f5a96 100755 --- a/convert.py +++ b/convert.py @@ -58,7 +58,6 @@ def select_wn(): cur = con.cursor(mysql.cursors.DictCursor) cur.execute("SELECT lemma, pos, sensenum, synsetid, definition, sampleset from dict") rows = cur.fetchall() - print len(rows) except mysql.Error, e: print "Database Error %d: %s" % (e.args[0],e.args[1]) sys.exit(1) @@ -118,11 +117,11 @@ def get_db_version(): sys.exit(1) def update_db(version): + version = int(version) try: con = sqlite3.connect('dictionaries/wordnet.db'); with con: cur = con.cursor() - if version < 1: cur.execute("CREATE TABLE info (id integer primary key not null , key text, value text)") cur.execute("INSERT INTO info (key, value) VALUES('version', '1')") @@ -135,7 +134,7 @@ def update_db(version): cur.execute("CREATE TABLE dictionaries (id integer primary key not null, name text, abbreviation text)") cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('WordNet', 'wn')") cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('FOLDOC', 'foldoc')") - cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int category_id int)") + cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int, category_id int)") cur.execute("UPDATE info set value = 2 where key = 'version'") except sqlite3.Error, e: @@ -156,6 +155,7 @@ def parse_foldoc(): for line in file: #Find lines of word headings, these are ones without an indent + remaining = "" if line[0] != "\t" and len(line) > 0 and line[0] != "\n" and line[0] != " ": if definition is not None and len(definition) > 1: @@ -181,7 +181,6 @@ def parse_foldoc(): id_parts = line.split(".") if len(id_parts[0]) <=2: - if definition is not None and len(definition) > 1: item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories, see_also) items.append(item) @@ -225,7 +224,10 @@ def parse_foldoc(): if count_braces == 1: see_also = True - definition += remaining.strip("<> ").replace("{", "").replace("}", "") + elif line[0] != "[" and line[-1] != "]" and not (line[-1] == "." and line[-2] == "]"): + remaining = line + + definition += remaining.strip("<> ").replace("{", "").replace("}", "") # Check if it's a 'see also' definition elif line[0] == "{" and line[-1] == "}": @@ -240,21 +242,24 @@ def parse_foldoc(): definition = definition = line.strip("{} ") else: definition += line.replace("{", "").replace("}", "") - - else: + #Ignore the note lines enclosed in []s + + elif len(line) > 1 and line[0] != "[" and line[-1] != "]" and not (line[-1] == "." and line[-2] == "]"): # Get the categories if len(line) > 0 and line[0] == "<": categories = [] category_parts = line.strip().split(">") def_categories = category_parts[0].strip().split(",") for category in def_categories: - categories.append(category.strip("<> ")) + if "@" not in category: + categories.append(category.strip("<> ")) for part in category_parts[1:]: definition += part.replace("{", "").replace("}", "") elif len(line) > 0: #Ignore date lines - if len(line) > 1 and line[0] != "(" and line[-1] != ")" and not line[1].isdigit(): + #TODO: Make this a date regex + if len(line) > 1 and line[0] != "(" and line[-1] != ")": definition += line.replace("{", "").replace("}", "") if len(definition) > 0 and definition[-1] != " ": definition += " " @@ -262,10 +267,33 @@ def parse_foldoc(): #Add the last item item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories) items.append(item) - - for item in items: - # if item.see_also: - print item.word + "| " + str(item.id) + " : " + item.definition + str(item.see_also) + "<" + str(item.categories) + + try: + con = sqlite3.connect('dictionaries/wordnet.db'); + con.text_factory = str + with con: + cur = con.cursor() + for item in items: + cur.execute("INSERT INTO definitions(word, dictionary_id, type_id, sub_id, definition) values(?, ?, ?, ?, ?)", [item.word, 2, 5, item.id, item.definition]) + def_id = cur.lastrowid + + for category in item.categories: + cur.execute("SELECT * FROM categories WHERE category = ?", [category]) + db_category = cur.fetchone() + + # Only make a new category if it doesn't exist, otherwise use the existing ID + if db_category is not None: + cat_id = db_category[0] + else: + cur.execute("INSERT INTO categories(category) values(?)", [category]) + cat_id = cur.lastrowid + + cur.execute("INSERT INTO definition_categories(category_id, definition_id) values(?, ?)", [cat_id, def_id]) + + + except sqlite3.Error, e: + print "Database Error %s" % (e.args[0]) + sys.exit(1) def main(): -- cgit v1.2.3