summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Joe Robinson <joe@lc8n.com> 2014-09-24 01:21:09 +0100
committer Joe Robinson <joe@lc8n.com> 2014-09-24 01:21:09 +0100
commit 7a1c8cc2a638518d2936e7e919d6d2edb0710980 (patch)
tree 067f6d7a9ccca2c7506da73b4affb03ca62a6edb
parent e04bd0c36d530729b268f1710113515af44d3ccb (diff)
Adding foldoc to the SQL conversion script
-rwxr-xr-x convert.py 150
1 file changed, 130 insertions, 20 deletions
diff --git a/convert.py b/convert.py
index 2be3d28..e5e35eb 100755
--- a/convert.py
+++ b/convert.py
@@ -16,8 +16,9 @@ class Definition(object):
uses = []
synonyms = []
antonyms = []
+ categories = []
- def __init__(self, word, id, dictionary, word_type, definition, uses, synonyms, antonyms):
+ def __init__(self, word, id, dictionary, word_type, definition, uses = [], synonyms = [], antonyms = [], categories= [], see_also = False):
self.word = word
self.id = id
self.dictionary = dictionary
@@ -26,6 +27,8 @@ class Definition(object):
self.uses = uses
self.synonyms = synonyms
self.antonyms = antonyms
+ self.categories = categories
+ self.see_also = see_also
def create():
try:
@@ -126,12 +129,12 @@ def update_db(version):
if version < 2:
cur = con.cursor()
- cur.execute("INSERT INTO types (type, abbreviation) VALUES('urban', 'urb.')")
+ cur.execute("INSERT INTO types (type, abbreviation) VALUES('technical', 'tech.')")
cur.execute("ALTER TABLE definitions ADD COLUMN dictionary_id int")
cur.execute("CREATE TABLE categories (id integer primary key not null, category text)")
cur.execute("CREATE TABLE dictionaries (id integer primary key not null, name text, abbreviation text)")
cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('WordNet', 'wn')")
- cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('Urban Dictionary', 'urban')")
+ cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('FOLDOC', 'foldoc')")
cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int category_id int)")
cur.execute("UPDATE info set value = 2 where key = 'version'")
@@ -139,25 +142,132 @@ def update_db(version):
print "Database Error %s" % (e.args[0])
sys.exit(1)
+def parse_foldoc():
-def update_db_1():
- try:
- con = sqlite3.connect('dictionaries/wordnet.db');
+ file = open("dictionaries/foldoc.txt")
+ word_line = ""
+ see_also = False
+ items = []
+ id = 1
+ word = ""
+ categories = []
- with con:
- cur = con.cursor()
- cur.execute("INSERT INTO types (type, abbreviation) VALUES('urban', 'urb.')")
- cur.execute("ALTER TABLE definitions ADD COLUMN dictionary_id int")
- cur.execute("CREATE TABLE categories (id integer primary key not null, category text)")
- cur.execute("CREATE TABLE dictionaries (id integer primary key not null, name text, abbreviation text")
- cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('WordNet', 'wn')")
- cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('Urban Dictionary', 'urban')")
- cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int category_id int)")
- cur.execute("UPDATE info set value = 2 where key = 'version'")
+ definition = ""
+
+ for line in file:
+ #Find lines of word headings, these are ones without an indent
+ if line[0] != "\t" and len(line) > 0 and line[0] != "\n" and line[0] != " ":
+
+ if definition is not None and len(definition) > 1:
+ item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories, see_also)
+ items.append(item)
+
+ # Start the new definition
+ word = line.strip()
+ id = 1
+ definition = ""
+ categories = []
+ see_also = False
+ continue
+ elif len(line) == 0 or line[0] == "\n":
+ definition += "\n"
+ continue
+ else:
+ line = line.strip()
+
+ if definition != "" and (len(line) == 0 or line[0] == "\n"):
+ definition += ""
+ elif line[0].isdigit():
+ id_parts = line.split(".")
+ if len(id_parts[0]) <=2:
+
+
+ if definition is not None and len(definition) > 1:
+ item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories, see_also)
+ items.append(item)
+
+ # Start the new definition
+ id = id_parts[0]
+ definition = ""
+ see_also = False
+ categories = []
+
+ remaining = ""
+
+ for part in id_parts[1:-1]:
+ remaining += part + ". "
+ remaining += id_parts[-1]
+
+ remaining = remaining.strip()
+
+ #Get the categories, at the start of the line enclosed in <>s
+ if len(remaining) > 0 and remaining[0] == "<":
+ categories = []
+ category_parts = remaining.split(">")
+ def_categories = category_parts[0].strip().split(",")
+
+ for category in def_categories:
+ categories.append(category.strip("<> "))
+
+ remaining = ""
+ for part in category_parts[1:]:
+ remaining += part
+
+ remaining = remaining.strip(". ")
+
+ # Check if it's a 'see also' definition
+ if len(remaining) > 0 and remaining[0] == "{" and remaining[-1] == "}":
+ count_braces = 0
+ # Avoid false positives that happen to start with a { and end with a }, but have others in between
+ for char in remaining:
+ if char == "}":
+ count_braces += 1
+ if count_braces == 1:
+ print "print "
+ see_also = True
+
+ definition += remaining.strip("<> ").replace("{", "").replace("}", "")
+
+ # Check if it's a 'see also' definition
+ elif line[0] == "{" and line[-1] == "}":
+ count_braces = 0
+
+ # Avoid false positives that happen to start with a { and end with a }, but have others in between
+ for char in line:
+ if char == "}":
+ count_braces += 1
+ if count_braces == 1:
+ see_also = True
+ definition = definition = line.strip("{} ")
+ else:
+ definition += line.replace("{", "").replace("}", "")
+
+ else:
+ # Get the categories
+ if len(line) > 0 and line[0] == "<":
+ categories = []
+ category_parts = line.strip().split(">")
+ def_categories = category_parts[0].strip().split(",")
+ for category in def_categories:
+ categories.append(category.strip("<> "))
+
+ for part in category_parts[1:]:
+ definition += part.replace("{", "").replace("}", "")
+ elif len(line) > 0:
+ #Ignore date lines
+ if line[0] != "(" and line[-1] != ")" and not line[1].isdigit():
+ definition += line.replace("{", "").replace("}", "")
+ if len(definition) > 0 and definition[-1] != " ":
+ definition += " "
+
+ #Add the last item
+ item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories)
+ items.append(item)
+
+ for item in items:
+ # if item.see_also:
+ print item.word + "| " + str(item.id) + " : " + item.definition + str(item.see_also) + "<" + str(item.categories)
- except sqlite3.Error, e:
- print "Database Error %d: %s" % (e.args[0],e.args[1])
- sys.exit(1)
def main():
@@ -169,7 +279,7 @@ def main():
insert(items)
update_db(version)
-
+ parse_foldoc()
if __name__ == "__main__":
main()