diff options
| | | |
|---|---|---|
| author | Joe Robinson <joe@lc8n.com> | 2014-10-01 17:57:40 +0100 |
| committer | Joe Robinson <joe@lc8n.com> | 2014-10-01 17:57:40 +0100 |
| commit | 05b3990f3d81ea66cbca6e4e6e2bfb8294318931 (patch) | |
| tree | e1fa10e0fb7c178f76bae5b821012b451eb83f51 /convert.py | |
| parent | 42a874e4feac64ed20f11f0cbef93a47bed528b5 (diff) | |
Moved foldoc definitions to SQL database
Diffstat (limited to 'convert.py')
-rwxr-xr-x | convert.py | 54 |
1 files changed, 41 insertions, 13 deletions
@@ -58,7 +58,6 @@ def select_wn(): cur = con.cursor(mysql.cursors.DictCursor) cur.execute("SELECT lemma, pos, sensenum, synsetid, definition, sampleset from dict") rows = cur.fetchall() - print len(rows) except mysql.Error, e: print "Database Error %d: %s" % (e.args[0],e.args[1]) sys.exit(1) @@ -118,11 +117,11 @@ def get_db_version(): sys.exit(1) def update_db(version): + version = int(version) try: con = sqlite3.connect('dictionaries/wordnet.db'); with con: cur = con.cursor() - if version < 1: cur.execute("CREATE TABLE info (id integer primary key not null , key text, value text)") cur.execute("INSERT INTO info (key, value) VALUES('version', '1')") @@ -135,7 +134,7 @@ def update_db(version): cur.execute("CREATE TABLE dictionaries (id integer primary key not null, name text, abbreviation text)") cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('WordNet', 'wn')") cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('FOLDOC', 'foldoc')") - cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int category_id int)") + cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int, category_id int)") cur.execute("UPDATE info set value = 2 where key = 'version'") except sqlite3.Error, e: @@ -156,6 +155,7 @@ def parse_foldoc(): for line in file: #Find lines of word headings, these are ones without an indent + remaining = "" if line[0] != "\t" and len(line) > 0 and line[0] != "\n" and line[0] != " ": if definition is not None and len(definition) > 1: @@ -181,7 +181,6 @@ def parse_foldoc(): id_parts = line.split(".") if len(id_parts[0]) <=2: - if definition is not None and len(definition) > 1: item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories, see_also) items.append(item) @@ -225,7 +224,10 @@ def parse_foldoc(): if count_braces == 1: see_also = True - definition += remaining.strip("<> ").replace("{", "").replace("}", "") + 
elif line[0] != "[" and line[-1] != "]" and not (line[-1] == "." and line[-2] == "]"): + remaining = line + + definition += remaining.strip("<> ").replace("{", "").replace("}", "") # Check if it's a 'see also' definition elif line[0] == "{" and line[-1] == "}": @@ -240,21 +242,24 @@ def parse_foldoc(): definition = definition = line.strip("{} ") else: definition += line.replace("{", "").replace("}", "") - - else: + #Ignore the note lines enclosed in []s + + elif len(line) > 1 and line[0] != "[" and line[-1] != "]" and not (line[-1] == "." and line[-2] == "]"): # Get the categories if len(line) > 0 and line[0] == "<": categories = [] category_parts = line.strip().split(">") def_categories = category_parts[0].strip().split(",") for category in def_categories: - categories.append(category.strip("<> ")) + if "@" not in category: + categories.append(category.strip("<> ")) for part in category_parts[1:]: definition += part.replace("{", "").replace("}", "") elif len(line) > 0: #Ignore date lines - if len(line) > 1 and line[0] != "(" and line[-1] != ")" and not line[1].isdigit(): + #TODO: Make this a date regex + if len(line) > 1 and line[0] != "(" and line[-1] != ")": definition += line.replace("{", "").replace("}", "") if len(definition) > 0 and definition[-1] != " ": definition += " " @@ -262,10 +267,33 @@ def parse_foldoc(): #Add the last item item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories) items.append(item) - - for item in items: - # if item.see_also: - print item.word + "| " + str(item.id) + " : " + item.definition + str(item.see_also) + "<" + str(item.categories) + + try: + con = sqlite3.connect('dictionaries/wordnet.db'); + con.text_factory = str + with con: + cur = con.cursor() + for item in items: + cur.execute("INSERT INTO definitions(word, dictionary_id, type_id, sub_id, definition) values(?, ?, ?, ?, ?)", [item.word, 2, 5, item.id, item.definition]) + def_id = cur.lastrowid + + for category in item.categories: + 
cur.execute("SELECT * FROM categories WHERE category = ?", [category]) + db_category = cur.fetchone() + + # Only make a new category if it doesn't exist, otherwise use the existing ID + if db_category is not None: + cat_id = db_category[0] + else: + cur.execute("INSERT INTO categories(category) values(?)", [category]) + cat_id = cur.lastrowid + + cur.execute("INSERT INTO definition_categories(category_id, definition_id) values(?, ?)", [cat_id, def_id]) + + + except sqlite3.Error, e: + print "Database Error %s" % (e.args[0]) + sys.exit(1) def main(): |