#!/usr/bin/python2
# -*- coding: utf-8 -*-
"""Build and upgrade the SQLite dictionary database.

The script creates the schema in ``dictionaries/wordnet.db``, can copy
WordNet rows into it from a MySQL database, upgrades older schema versions
and imports the FOLDOC text dictionary from ``dictionaries/foldoc.txt``.
"""

import contextlib
import sqlite3
import sys

try:
    import MySQLdb as mysql
except ImportError:
    # Only select_wn() talks to MySQL; the schema upgrade and the FOLDOC
    # import must keep working on machines without the MySQL bindings.
    mysql = None

WORDNET_DB = 'dictionaries/wordnet.db'
FOLDOC_FILE = "dictionaries/foldoc.txt"

# Row ids the FOLDOC import writes. They match the insertion order used by
# create() and update_db() on a freshly built database: 'FOLDOC' is the second
# row of `dictionaries`, 'technical' is the fifth row of `types`.
FOLDOC_DICTIONARY_ID = 2
TECHNICAL_TYPE_ID = 5

# WordNet part-of-speech codes -> ids of the rows create() puts in `types`.
# 'a' and 's' both map to the adjective row.
_POS_TYPE_IDS = {'n': 1, 'v': 2, 'a': 3, 's': 3, 'r': 4}


class Definition(object):
    """One definition of a word, as read from a dictionary source."""

    # Class-level defaults only; __init__ always replaces every one of them
    # with a per-instance value.
    # ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2,
    # not to the entire list
    id = 0
    word = ""
    dictionary = ""
    word_type = ""
    definition = ""
    uses = []
    synonyms = []
    antonyms = []
    categories = []

    def __init__(self, word, id, dictionary, word_type, definition,
                 uses=None, synonyms=None, antonyms=None, categories=None,
                 see_also=False):
        """Store the given fields on the instance.

        ``uses``, ``synonyms``, ``antonyms`` and ``categories`` default to a
        fresh empty list per instance (``None`` means "not given"), so that
        appending to one definition's list never affects another.
        """
        self.word = word
        self.id = id
        self.dictionary = dictionary
        self.word_type = word_type
        self.definition = definition
        self.uses = [] if uses is None else uses
        self.synonyms = [] if synonyms is None else synonyms
        self.antonyms = [] if antonyms is None else antonyms
        self.categories = [] if categories is None else categories
        self.see_also = see_also


@contextlib.contextmanager
def _wordnet_db():
    """Yield a connection to the SQLite database, then commit and close it.

    ``with connection:`` only commits (or rolls back on an exception); it
    does not close the connection, so the close is done here explicitly.
    """
    con = sqlite3.connect(WORDNET_DB)
    try:
        with con:
            yield con
    finally:
        con.close()


def _exit_with_sqlite_error(error):
    """Print the SQLite error message and terminate with exit status 1."""
    print("Database Error %s" % (error.args[0]))
    sys.exit(1)


def create():
    """Create the version 1 schema and seed the `types` and `info` tables.

    Prints the error and exits with status 1 if SQLite reports a failure.
    """
    statements = (
        "CREATE TABLE types (id integer primary key not null , type text, abbreviation text)",
        "INSERT INTO types (type, abbreviation) VALUES('noun', 'n')",
        "INSERT INTO types (type, abbreviation) VALUES('verb', 'v')",
        "INSERT INTO types (type, abbreviation) VALUES('adjective', 'adj')",
        "INSERT INTO types (type, abbreviation) VALUES('adverb', 'adv')",
        "CREATE TABLE definitions (id integer primary key not null , word text, type_id int, sub_id int, synset_id bigint, definition text)",
        "CREATE TABLE uses (id integer primary key not null, definition_id bigint, quote text)",
        "CREATE TABLE info (id integer primary key not null , key text, value text)",
        "INSERT INTO info (key, value) VALUES('version', '1')",
    )
    try:
        with _wordnet_db() as con:
            cur = con.cursor()
            for statement in statements:
                cur.execute(statement)
    except sqlite3.Error as e:
        _exit_with_sqlite_error(e)


def select_wn():
    """Return every row of the MySQL `dict` table as a sequence of dicts.

    Each row has the keys ``lemma``, ``pos``, ``sensenum``, ``synsetid``,
    ``definition`` and ``sampleset``. Prints the error and exits with
    status 1 if MySQLdb is not installed or MySQL reports a failure.
    """
    if mysql is None:
        print("Database Error: the MySQLdb module is not installed")
        sys.exit(1)
    try:
        # NOTE(review): connection credentials are hard-coded here.
        con = mysql.connect('localhost', 'wordnet', 'words', 'wordnet')
        try:
            cur = con.cursor(mysql.cursors.DictCursor)
            cur.execute("SELECT lemma, pos, sensenum, synsetid, definition, sampleset from dict")
            rows = cur.fetchall()
        finally:
            # Explicit close: what `with connection:` does differs between
            # MySQLdb releases, and a read-only query needs no commit.
            con.close()
    except mysql.Error as e:
        print("Database Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)
    return rows


def insert_wn(rows):
    """Insert WordNet rows (as returned by select_wn) into SQLite.

    Every row becomes one `definitions` row; a non-None ``sampleset`` is
    split on ``|`` and each piece becomes a `uses` row pointing at it. A row
    whose ``pos`` is not one of n/v/a/s/r is stored with a NULL type_id
    rather than silently reusing the previous row's type. Prints the error
    and exits with status 1 if SQLite reports a failure.
    """
    try:
        with _wordnet_db() as con:
            cur = con.cursor()
            for row in rows:
                type_id = _POS_TYPE_IDS.get(row['pos'])
                cur.execute("INSERT INTO definitions(word, type_id, sub_id, synset_id, definition) values(?, ?, ?, ?, ?)",
                            [row['lemma'], type_id, row['sensenum'], row['synsetid'], row['definition']])
                row_id = cur.lastrowid
                if row['sampleset'] is not None:
                    for use in row['sampleset'].split("|"):
                        cur.execute("INSERT INTO uses(definition_id, quote) values(?, ?)", [row_id, use])
    except sqlite3.Error as e:
        _exit_with_sqlite_error(e)


def get_db_version():
    """Return the schema version recorded in the database.

    Returns the integer 0 when the `info` table does not exist, otherwise
    the ``value`` stored for the ``version`` key exactly as SQLite returns
    it. Prints the error and exits with status 1 if SQLite reports a failure.
    """
    try:
        with _wordnet_db() as con:
            cur = con.cursor()
            # Check if info table exists at all, if not then version 0
            cur.execute("SELECT count(name) FROM sqlite_master WHERE type='table' AND name='info'")
            if cur.fetchone()[0] == 0:
                return 0
            cur.execute("SELECT value FROM info WHERE key = 'version'")
            row = cur.fetchone()
            return row[0]
    except sqlite3.Error as e:
        _exit_with_sqlite_error(e)


def update_db(version):
    """Upgrade the schema from ``version`` to version 2.

    ``version`` is converted with ``int()``, so the text value returned by
    get_db_version() is accepted. Prints the error and exits with status 1
    if SQLite reports a failure.
    """
    version = int(version)
    to_version_1 = (
        "CREATE TABLE info (id integer primary key not null , key text, value text)",
        "INSERT INTO info (key, value) VALUES('version', '1')",
    )
    to_version_2 = (
        "INSERT INTO types (type, abbreviation) VALUES('technical', 'tech.')",
        "ALTER TABLE definitions ADD COLUMN dictionary_id int",
        "CREATE TABLE categories (id integer primary key not null, category text)",
        "CREATE TABLE dictionaries (id integer primary key not null, name text, abbreviation text)",
        "INSERT INTO dictionaries (name, abbreviation) VALUES('WordNet', 'wn')",
        "INSERT INTO dictionaries (name, abbreviation) VALUES('FOLDOC', 'foldoc')",
        "CREATE TABLE definition_categories (id integer primary key not null, definition_id int, category_id int)",
        "UPDATE info set value = 2 where key = 'version'",
    )
    try:
        with _wordnet_db() as con:
            cur = con.cursor()
            if version < 1:
                for statement in to_version_1:
                    cur.execute(statement)
            if version < 2:
                for statement in to_version_2:
                    cur.execute(statement)
    except sqlite3.Error as e:
        _exit_with_sqlite_error(e)


def _strip_braces(text):
    """Return ``text`` with every '{' and '}' removed."""
    return text.replace("{", "").replace("}", "")


def _is_single_reference(text):
    """Return True if ``text`` is exactly one ``{...}`` group.

    Counting the closing braces avoids false positives that happen to start
    with a { and end with a }, but have others in between.
    """
    return text.startswith("{") and text.endswith("}") and text.count("}") == 1


def _is_body_text(line):
    """Return True unless ``line`` looks like (part of) a ``[...]`` note.

    A line is rejected when it starts with '[', ends with ']' or ends
    with '].'.
    """
    return line[0] != "[" and line[-1] != "]" and not line.endswith("].")


def _append_definition(items, word, sub_id, definition, categories, see_also):
    """Append a FOLDOC Definition to ``items`` if any text was collected."""
    if len(definition) > 1:
        items.append(Definition(word, sub_id, "foldoc", "tech",
                                definition.strip("\n "), [], [], [],
                                categories, see_also))


def _parse_foldoc_lines(lines):
    """Turn the lines of a FOLDOC text file into a list of Definitions.

    Lines that do not start with a tab, space or newline are word headings.
    Indented lines belong to the current word: a line starting with a short
    number opens a new numbered sub-definition, ``<...>`` at the start of a
    line lists categories, a line that is a single ``{...}`` group marks a
    'see also' entry, and ``[...]`` notes and ``(...)`` date lines are
    skipped.
    """
    items = []
    word = ""
    sub_id = 1
    categories = []
    definition = ""
    see_also = False

    for raw in lines:
        first = raw[:1]
        # Find lines of word headings, these are ones without an indent
        if first not in ("", "\n", "\t", " "):
            _append_definition(items, word, sub_id, definition, categories, see_also)
            # Start the new definition
            word = raw.strip()
            sub_id = 1
            definition = ""
            categories = []
            see_also = False
            continue
        if first in ("", "\n"):
            definition += "\n"
            continue

        line = raw.strip()
        if not line:
            # Whitespace-only indented line: nothing to add. Checked before
            # any line[0] access so it cannot raise IndexError.
            pass
        elif line[0].isdigit():
            remaining = ""
            id_parts = line.split(".")
            if len(id_parts[0]) <= 2:
                _append_definition(items, word, sub_id, definition, categories, see_also)
                # Start the new definition; the id is kept as the text found
                # before the first '.'.
                sub_id = id_parts[0]
                definition = ""
                see_also = False
                categories = []
                # Re-join what followed the number, restoring the '.'
                # separators that split() removed.
                remaining = "".join(part + ". " for part in id_parts[1:-1])
                remaining += id_parts[-1]
                remaining = remaining.strip()
                # Get the categories, at the start of the line enclosed in <>s
                if remaining[:1] == "<":
                    category_parts = remaining.split(">")
                    categories = [category.strip("<> ")
                                  for category in category_parts[0].strip().split(",")]
                    remaining = "".join(category_parts[1:]).strip(". ")
                # Check if it's a 'see also' definition
                if _is_single_reference(remaining):
                    see_also = True
            elif _is_body_text(line):
                # Starts with digits but is not a numbered sub-definition.
                remaining = line
            definition += _strip_braces(remaining.strip("<> "))
        # Check if it's a 'see also' definition
        elif line[0] == "{" and line[-1] == "}":
            if line.count("}") == 1:
                see_also = True
                definition = line.strip("{} ")
            else:
                definition += _strip_braces(line)
        # Ignore the note lines enclosed in []s
        elif len(line) > 1 and _is_body_text(line):
            if line[0] == "<":
                # Get the categories
                category_parts = line.split(">")
                categories = [category.strip("<> ")
                              for category in category_parts[0].strip().split(",")
                              if "@" not in category]
                for part in category_parts[1:]:
                    definition += _strip_braces(part)
            # Ignore date lines
            # TODO: Make this a date regex
            elif line[0] != "(" and line[-1] != ")":
                definition += _strip_braces(line)

        # Keep consecutive source lines separated by a single space.
        if definition and not definition.endswith(" "):
            definition += " "

    # Add the last item, with its see_also flag and under the same "has
    # text" rule as every other entry.
    _append_definition(items, word, sub_id, definition, categories, see_also)
    return items


def _store_foldoc_items(items):
    """Insert parsed FOLDOC definitions and their category links."""
    with _wordnet_db() as con:
        con.text_factory = str
        cur = con.cursor()
        for item in items:
            cur.execute("INSERT INTO definitions(word, dictionary_id, type_id, sub_id, definition) values(?, ?, ?, ?, ?)",
                        [item.word, FOLDOC_DICTIONARY_ID, TECHNICAL_TYPE_ID, item.id, item.definition])
            def_id = cur.lastrowid
            for category in item.categories:
                cur.execute("SELECT * FROM categories WHERE category = ?", [category])
                db_category = cur.fetchone()
                # Only make a new category if it doesn't exist, otherwise use the existing ID
                if db_category is not None:
                    cat_id = db_category[0]
                else:
                    cur.execute("INSERT INTO categories(category) values(?)", [category])
                    cat_id = cur.lastrowid
                cur.execute("INSERT INTO definition_categories(category_id, definition_id) values(?, ?)", [cat_id, def_id])


def parse_foldoc():
    """Import ``dictionaries/foldoc.txt`` into the SQLite database.

    The file is parsed completely before anything is written. Errors while
    opening or reading the file propagate to the caller; an SQLite failure
    is printed and the process exits with status 1.
    """
    with open(FOLDOC_FILE) as handle:
        items = _parse_foldoc_lines(handle)
    try:
        _store_foldoc_items(items)
    except sqlite3.Error as e:
        _exit_with_sqlite_error(e)


def main():
    """Upgrade the existing database and import FOLDOC into it."""
    version = get_db_version()
    # Building the database from scratch (needs the MySQL WordNet database)
    # is intentionally left disabled:
    # if version == 0:
    #     create()
    #     insert_wn(select_wn())
    update_db(version)
    parse_foldoc()


if __name__ == "__main__":
    main()