diff options
author | Joe Robinson <joe@lc8n.com> | 2014-09-19 01:02:42 +0100 |
---|---|---|
committer | Joe Robinson <joe@lc8n.com> | 2014-09-19 01:02:42 +0100 |
commit | ac2251f199398b813f394bdb875bc2723557781c (patch) | |
tree | 2ce6695c21bbcc62b6b97c2468ecb6ece2c78cc1 | |
parent | 2dfb9a16bd458f16f216f20b56485d982fd9af0f (diff) |
Created script for converting WordNet database into a simpler/faster form
-rw-r--r-- | convert.py | 74 |
1 files changed, 74 insertions, 0 deletions
# diff --git a/convert.py b/convert.py (new file mode 100644, index 0000000..9e8bba4)
"""Copy the WordNet ``dict`` table into the simpler ``definitions``/``uses`` tables.

Steps, run in order by :func:`main`:

1. :func:`create` builds the ``types``, ``definitions`` and ``uses`` tables
   and seeds ``types`` with four rows.
2. :func:`select` reads every row of ``dict`` into memory.
3. :func:`insert` writes one ``definitions`` row per source row and one
   ``uses`` row per ``|``-separated fragment of ``sampleset``.

Any database error is printed and terminates the process with exit status 1.
"""

import sys

import MySQLdb as mysql

# Positional arguments for mysql.connect(): host, user, password, database.
DB_ARGS = ('localhost', 'wordnet', 'words', 'wordnet')

# Maps the ``pos`` code read from ``dict`` to an id in ``types``.  Both 'a' and
# 's' map to the 'adjective' row.  The ids assume create() seeded a fresh
# auto-increment table in the order noun, verb, adjective, adverb and that the
# server's auto-increment step is 1 -- TODO confirm for non-default servers.
TYPE_IDS = {'n': 1, 'v': 2, 'a': 3, 's': 3, 'r': 4}


def _report_and_exit(err):
    """Print a database error in the script's usual format and exit(1)."""
    args = err.args
    if len(args) >= 2:
        print("Database Error %d: %s" % (args[0], args[1]))
    else:
        # Some driver errors carry fewer than two args; avoid an IndexError
        # that would hide the real problem.
        print("Database Error: %s" % (err,))
    sys.exit(1)


def _run(work):
    """Open a connection, call ``work(cursor)``, commit, and return its result.

    The commit is issued explicitly, and only after ``work`` returned
    normally, rather than relying on the connection's context-manager
    behaviour.  The connection is always closed; if ``work`` raises, it is
    closed without committing.  A ``mysql.Error`` is reported through
    :func:`_report_and_exit`; any other exception propagates to the caller.
    """
    try:
        con = mysql.connect(*DB_ARGS)
        try:
            cur = con.cursor(mysql.cursors.DictCursor)
            result = work(cur)
            con.commit()
            return result
        finally:
            con.close()
    except mysql.Error as e:
        _report_and_exit(e)


def create():
    """Create the ``types``, ``definitions`` and ``uses`` tables and seed ``types``."""
    def work(cur):
        cur.execute("CREATE TABLE types (id int not null auto_increment, type text, abbreviation text, primary key(id))")
        # Insertion order matters: TYPE_IDS depends on the resulting ids.
        cur.execute("INSERT INTO types (type, abbreviation) VALUES('noun', 'n')")
        cur.execute("INSERT INTO types (type, abbreviation) VALUES('verb', 'v')")
        cur.execute("INSERT INTO types (type, abbreviation) VALUES('adjective', 'adj')")
        cur.execute("INSERT INTO types (type, abbreviation) VALUES('adverb', 'adv')")
        cur.execute("CREATE TABLE definitions (id bigint not null auto_increment, word text, type_id int, sub_id int, synset_id bigint, definition text, primary key(id))")
        cur.execute("CREATE TABLE uses (id bigint not null auto_increment, definition_id bigint, quote text, primary key(id))")

    _run(work)


def select():
    """Return every row of ``dict`` as a sequence of dicts and print the row count.

    Each row has the keys ``lemma``, ``pos``, ``sensenum``, ``synsetid``,
    ``definition`` and ``sampleset``.
    """
    def work(cur):
        cur.execute("SELECT lemma, pos, sensenum, synsetid, definition, sampleset from dict")
        rows = cur.fetchall()
        print(len(rows))
        return rows

    return _run(work)


def insert(rows):
    """Write ``rows`` (as returned by :func:`select`) into ``definitions`` and ``uses``.

    Raises:
        ValueError: if a row's ``pos`` is not one of the codes in ``TYPE_IDS``.
            Previously such a row either crashed with an unbound ``type_id``
            or silently inherited the previous row's type.
    """
    def work(cur):
        for row in rows:
            pos = row['pos']
            if pos not in TYPE_IDS:
                raise ValueError("Unknown part of speech %r for lemma %r"
                                 % (pos, row['lemma']))

            cur.execute(
                "INSERT INTO definitions(word, type_id, sub_id, synset_id, definition) values(%s, %s, %s, %s, %s)",
                [row['lemma'], TYPE_IDS[pos], row['sensenum'], row['synsetid'], row['definition']])

            # Read the new id before issuing any further statement on the cursor.
            definition_id = cur.lastrowid

            sampleset = row['sampleset']
            if sampleset is not None:
                # One ``uses`` row per '|'-separated fragment.
                quotes = [(definition_id, quote) for quote in sampleset.split("|")]
                cur.executemany(
                    "INSERT INTO uses(definition_id, quote) values(%s, %s)",
                    quotes)

    _run(work)


def main():
    """Create the target tables, read the source rows, and convert them."""
    create()
    items = select()
    insert(items)


if __name__ == "__main__":
    main()