summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Joe Robinson <joe@lc8n.com> 2014-10-01 17:57:40 +0100
committer Joe Robinson <joe@lc8n.com> 2014-10-01 17:57:40 +0100
commit 05b3990f3d81ea66cbca6e4e6e2bfb8294318931 (patch)
tree e1fa10e0fb7c178f76bae5b821012b451eb83f51
parent 42a874e4feac64ed20f11f0cbef93a47bed528b5 (diff)
Moved foldoc definitions to SQL database
-rwxr-xr-x bladictionary.py 37
-rwxr-xr-x convert.py 54
2 files changed, 70 insertions, 21 deletions
diff --git a/bladictionary.py b/bladictionary.py
index 6d16240..58ddfeb 100755
--- a/bladictionary.py
+++ b/bladictionary.py
@@ -8,7 +8,7 @@ from lxml import etree
import sqlite3
import requests
-VERSION = "2.3.4"
+VERSION = "2.4.0"
class Definition(object):
#ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list
@@ -278,10 +278,18 @@ def parse_oed(word):
return items
#Gets all matching WordNet definitions from the database
-def get_sql(word):
+def get_sql(word, dictionary = "wn"):
items = []
- types = ["n", "v", "adj", "adv"]
+ types = ["n", "v", "adj", "adv", "tech"]
+
+ if dictionary == "wordnet" or dictionary == "wn":
+ dict_id = 1
+ elif dictionary == "foldoc" or dictionary == "tech":
+ dict_id = 2
+ else:
+ print "Invalid dictionary"
+ sys.exit(1)
con = sqlite3.connect('dictionaries/wordnet.db');
con.row_factory = sqlite3.Row
@@ -289,10 +297,15 @@ def get_sql(word):
with con:
cur = con.cursor()
- cur.execute("SELECT * from definitions where word = ? ORDER BY type_id,sub_id;", [word])
+ cur.execute("SELECT * from definitions where word = ? and dictionary_id = ? ORDER BY type_id,sub_id;", [word, dict_id])
rows = cur.fetchall()
+ # If no results were found, try a case insensitive search
+ if rows is None or len(rows) == 0:
+ cur.execute("SELECT * from definitions where word = ? COLLATE NOCASE and dictionary_id = ? ORDER BY type_id,sub_id;", [word, dict_id])
+ rows = cur.fetchall()
+
for row in rows:
id = row['id']
@@ -301,6 +314,7 @@ def get_sql(word):
type = types[row['type_id']-1]
definition = row['definition']
synset_id = row['synset_id']
+ dictionary_id = row['dictionary_id']
cur.execute("SELECT * from uses where definition_id = ?", [id])
use_rows = cur.fetchall()
@@ -309,7 +323,14 @@ def get_sql(word):
for use in use_rows:
uses.append(use['quote'])
- item = Definition(word, sub_id, "wn", type,definition, uses, synset_id, [])
+ cur.execute("SELECT name from dictionaries where id = ?", [dictionary_id])
+ dict_row = cur.fetchone()
+ if dict_row is not None:
+ dictionary = dict_row[0]
+ else:
+ dictionary = "wn"
+
+ item = Definition(word, sub_id, dictionary, type,definition, uses, synset_id, [])
items.append(item)
return items
@@ -465,15 +486,15 @@ def main():
items = parse_xml(xml)
elif word_dict == "foldoc" or word_type == "tech":
- items = parse_foldoc(word)
+ items = get_sql(word, word_dict)
elif word_dict == "urban":
items = parse_urban(word)
else:
if word_dict is None or word_dict == "":
word_dict = "wn"
- items = get_sql(word)
+ items = get_sql(word, "wn")
- foldoc_items = parse_foldoc(word)
+ foldoc_items = get_sql(word, "foldoc")
if foldoc_items is not None and len(foldoc_items) > 0:
items += foldoc_items
diff --git a/convert.py b/convert.py
index 005507f..90f5a96 100755
--- a/convert.py
+++ b/convert.py
@@ -58,7 +58,6 @@ def select_wn():
cur = con.cursor(mysql.cursors.DictCursor)
cur.execute("SELECT lemma, pos, sensenum, synsetid, definition, sampleset from dict")
rows = cur.fetchall()
- print len(rows)
except mysql.Error, e:
print "Database Error %d: %s" % (e.args[0],e.args[1])
sys.exit(1)
@@ -118,11 +117,11 @@ def get_db_version():
sys.exit(1)
def update_db(version):
+ version = int(version)
try:
con = sqlite3.connect('dictionaries/wordnet.db');
with con:
cur = con.cursor()
-
if version < 1:
cur.execute("CREATE TABLE info (id integer primary key not null , key text, value text)")
cur.execute("INSERT INTO info (key, value) VALUES('version', '1')")
@@ -135,7 +134,7 @@ def update_db(version):
cur.execute("CREATE TABLE dictionaries (id integer primary key not null, name text, abbreviation text)")
cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('WordNet', 'wn')")
cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('FOLDOC', 'foldoc')")
- cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int category_id int)")
+ cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int, category_id int)")
cur.execute("UPDATE info set value = 2 where key = 'version'")
except sqlite3.Error, e:
@@ -156,6 +155,7 @@ def parse_foldoc():
for line in file:
#Find lines of word headings, these are ones without an indent
+ remaining = ""
if line[0] != "\t" and len(line) > 0 and line[0] != "\n" and line[0] != " ":
if definition is not None and len(definition) > 1:
@@ -181,7 +181,6 @@ def parse_foldoc():
id_parts = line.split(".")
if len(id_parts[0]) <=2:
-
if definition is not None and len(definition) > 1:
item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories, see_also)
items.append(item)
@@ -225,7 +224,10 @@ def parse_foldoc():
if count_braces == 1:
see_also = True
- definition += remaining.strip("<> ").replace("{", "").replace("}", "")
+ elif line[0] != "[" and line[-1] != "]" and not (line[-1] == "." and line[-2] == "]"):
+ remaining = line
+
+ definition += remaining.strip("<> ").replace("{", "").replace("}", "")
# Check if it's a 'see also' definition
elif line[0] == "{" and line[-1] == "}":
@@ -240,21 +242,24 @@ def parse_foldoc():
definition = definition = line.strip("{} ")
else:
definition += line.replace("{", "").replace("}", "")
-
- else:
+ #Ignore the note lines enclosed in []s
+
+ elif len(line) > 1 and line[0] != "[" and line[-1] != "]" and not (line[-1] == "." and line[-2] == "]"):
# Get the categories
if len(line) > 0 and line[0] == "<":
categories = []
category_parts = line.strip().split(">")
def_categories = category_parts[0].strip().split(",")
for category in def_categories:
- categories.append(category.strip("<> "))
+ if "@" not in category:
+ categories.append(category.strip("<> "))
for part in category_parts[1:]:
definition += part.replace("{", "").replace("}", "")
elif len(line) > 0:
#Ignore date lines
- if len(line) > 1 and line[0] != "(" and line[-1] != ")" and not line[1].isdigit():
+ #TODO: Make this a date regex
+ if len(line) > 1 and line[0] != "(" and line[-1] != ")":
definition += line.replace("{", "").replace("}", "")
if len(definition) > 0 and definition[-1] != " ":
definition += " "
@@ -262,10 +267,33 @@ def parse_foldoc():
#Add the last item
item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories)
items.append(item)
-
- for item in items:
- # if item.see_also:
- print item.word + "| " + str(item.id) + " : " + item.definition + str(item.see_also) + "<" + str(item.categories)
+
+ try:
+ con = sqlite3.connect('dictionaries/wordnet.db');
+ con.text_factory = str
+ with con:
+ cur = con.cursor()
+ for item in items:
+ cur.execute("INSERT INTO definitions(word, dictionary_id, type_id, sub_id, definition) values(?, ?, ?, ?, ?)", [item.word, 2, 5, item.id, item.definition])
+ def_id = cur.lastrowid
+
+ for category in item.categories:
+ cur.execute("SELECT * FROM categories WHERE category = ?", [category])
+ db_category = cur.fetchone()
+
+ # Only make a new category if it doesn't exist, otherwise use the existing ID
+ if db_category is not None:
+ cat_id = db_category[0]
+ else:
+ cur.execute("INSERT INTO categories(category) values(?)", [category])
+ cat_id = cur.lastrowid
+
+ cur.execute("INSERT INTO definition_categories(category_id, definition_id) values(?, ?)", [cat_id, def_id])
+
+
+ except sqlite3.Error, e:
+ print "Database Error %s" % (e.args[0])
+ sys.exit(1)
def main():