summary | refs | log | tree | commit | diff
path: root/convert.py
diff options
context:
space:
mode:
author Joe Robinson <joe@lc8n.com> 2014-10-01 17:57:40 +0100
committer Joe Robinson <joe@lc8n.com> 2014-10-01 17:57:40 +0100
commit 05b3990f3d81ea66cbca6e4e6e2bfb8294318931 (patch)
tree e1fa10e0fb7c178f76bae5b821012b451eb83f51 /convert.py
parent 42a874e4feac64ed20f11f0cbef93a47bed528b5 (diff)
Moved foldoc definitions to SQL database
Diffstat (limited to 'convert.py')
-rwxr-xr-x convert.py 54
1 file changed, 41 insertions, 13 deletions
diff --git a/convert.py b/convert.py
index 005507f..90f5a96 100755
--- a/convert.py
+++ b/convert.py
@@ -58,7 +58,6 @@ def select_wn():
cur = con.cursor(mysql.cursors.DictCursor)
cur.execute("SELECT lemma, pos, sensenum, synsetid, definition, sampleset from dict")
rows = cur.fetchall()
- print len(rows)
except mysql.Error, e:
print "Database Error %d: %s" % (e.args[0],e.args[1])
sys.exit(1)
@@ -118,11 +117,11 @@ def get_db_version():
sys.exit(1)
def update_db(version):
+ version = int(version)
try:
con = sqlite3.connect('dictionaries/wordnet.db');
with con:
cur = con.cursor()
-
if version < 1:
cur.execute("CREATE TABLE info (id integer primary key not null , key text, value text)")
cur.execute("INSERT INTO info (key, value) VALUES('version', '1')")
@@ -135,7 +134,7 @@ def update_db(version):
cur.execute("CREATE TABLE dictionaries (id integer primary key not null, name text, abbreviation text)")
cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('WordNet', 'wn')")
cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('FOLDOC', 'foldoc')")
- cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int category_id int)")
+ cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int, category_id int)")
cur.execute("UPDATE info set value = 2 where key = 'version'")
except sqlite3.Error, e:
@@ -156,6 +155,7 @@ def parse_foldoc():
for line in file:
#Find lines of word headings, these are ones without an indent
+ remaining = ""
if line[0] != "\t" and len(line) > 0 and line[0] != "\n" and line[0] != " ":
if definition is not None and len(definition) > 1:
@@ -181,7 +181,6 @@ def parse_foldoc():
id_parts = line.split(".")
if len(id_parts[0]) <=2:
-
if definition is not None and len(definition) > 1:
item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories, see_also)
items.append(item)
@@ -225,7 +224,10 @@ def parse_foldoc():
if count_braces == 1:
see_also = True
- definition += remaining.strip("<> ").replace("{", "").replace("}", "")
+ elif line[0] != "[" and line[-1] != "]" and not (line[-1] == "." and line[-2] == "]"):
+ remaining = line
+
+ definition += remaining.strip("<> ").replace("{", "").replace("}", "")
# Check if it's a 'see also' definition
elif line[0] == "{" and line[-1] == "}":
@@ -240,21 +242,24 @@ def parse_foldoc():
definition = definition = line.strip("{} ")
else:
definition += line.replace("{", "").replace("}", "")
-
- else:
+ #Ignore the note lines enclosed in []s
+
+ elif len(line) > 1 and line[0] != "[" and line[-1] != "]" and not (line[-1] == "." and line[-2] == "]"):
# Get the categories
if len(line) > 0 and line[0] == "<":
categories = []
category_parts = line.strip().split(">")
def_categories = category_parts[0].strip().split(",")
for category in def_categories:
- categories.append(category.strip("<> "))
+ if "@" not in category:
+ categories.append(category.strip("<> "))
for part in category_parts[1:]:
definition += part.replace("{", "").replace("}", "")
elif len(line) > 0:
#Ignore date lines
- if len(line) > 1 and line[0] != "(" and line[-1] != ")" and not line[1].isdigit():
+ #TODO: Make this a date regex
+ if len(line) > 1 and line[0] != "(" and line[-1] != ")":
definition += line.replace("{", "").replace("}", "")
if len(definition) > 0 and definition[-1] != " ":
definition += " "
@@ -262,10 +267,33 @@ def parse_foldoc():
#Add the last item
item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories)
items.append(item)
-
- for item in items:
- # if item.see_also:
- print item.word + "| " + str(item.id) + " : " + item.definition + str(item.see_also) + "<" + str(item.categories)
+
+ try:
+ con = sqlite3.connect('dictionaries/wordnet.db');
+ con.text_factory = str
+ with con:
+ cur = con.cursor()
+ for item in items:
+ cur.execute("INSERT INTO definitions(word, dictionary_id, type_id, sub_id, definition) values(?, ?, ?, ?, ?)", [item.word, 2, 5, item.id, item.definition])
+ def_id = cur.lastrowid
+
+ for category in item.categories:
+ cur.execute("SELECT * FROM categories WHERE category = ?", [category])
+ db_category = cur.fetchone()
+
+ # Only make a new category if it doesn't exist, otherwise use the existing ID
+ if db_category is not None:
+ cat_id = db_category[0]
+ else:
+ cur.execute("INSERT INTO categories(category) values(?)", [category])
+ cat_id = cur.lastrowid
+
+ cur.execute("INSERT INTO definition_categories(category_id, definition_id) values(?, ?)", [cat_id, def_id])
+
+
+ except sqlite3.Error, e:
+ print "Database Error %s" % (e.args[0])
+ sys.exit(1)
def main():