diff options
| author | Joe Robinson <joe@lc8n.com> | 2014-10-01 17:57:40 +0100 | 
|---|---|---|
| committer | Joe Robinson <joe@lc8n.com> | 2014-10-01 17:57:40 +0100 | 
| commit | 05b3990f3d81ea66cbca6e4e6e2bfb8294318931 (patch) | |
| tree | e1fa10e0fb7c178f76bae5b821012b451eb83f51 /convert.py | |
| parent | 42a874e4feac64ed20f11f0cbef93a47bed528b5 (diff) | |
Moved foldoc definitions to SQL database
Diffstat (limited to 'convert.py')
| -rwxr-xr-x | convert.py | 54 | 
1 files changed, 41 insertions, 13 deletions
@@ -58,7 +58,6 @@ def select_wn():  			cur = con.cursor(mysql.cursors.DictCursor)  			cur.execute("SELECT lemma, pos, sensenum, synsetid, definition, sampleset from dict")  			rows = cur.fetchall() -			print len(rows)  	except mysql.Error, e:  		print "Database Error %d: %s" % (e.args[0],e.args[1])  		sys.exit(1) @@ -118,11 +117,11 @@ def get_db_version():  		sys.exit(1)  def update_db(version): +		version = int(version)  		try:  			con = sqlite3.connect('dictionaries/wordnet.db');  			with con:  				cur = con.cursor() -  				if version < 1:  					cur.execute("CREATE TABLE info (id integer primary key not null , key text, value text)")  					cur.execute("INSERT INTO info (key, value) VALUES('version', '1')") @@ -135,7 +134,7 @@ def update_db(version):  					cur.execute("CREATE TABLE dictionaries (id integer primary key not null, name text, abbreviation text)")  					cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('WordNet', 'wn')")  					cur.execute("INSERT INTO dictionaries (name, abbreviation) VALUES('FOLDOC', 'foldoc')") -					cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int category_id int)") +					cur.execute("CREATE TABLE definition_categories (id integer primary key not null, definition_id int, category_id int)")  					cur.execute("UPDATE info set value = 2 where key = 'version'")  		except sqlite3.Error, e: @@ -156,6 +155,7 @@ def parse_foldoc():  	for line in file:  		#Find lines of word headings, these are ones without an indent +		remaining = ""  		if line[0] != "\t" and len(line) > 0 and line[0] != "\n" and line[0] != " ":  			if definition is not None and len(definition) > 1: @@ -181,7 +181,6 @@ def parse_foldoc():  				id_parts = line.split(".")  				if len(id_parts[0]) <=2: -  					if definition is not None and len(definition) > 1:  						item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories, see_also)  						items.append(item) @@ -225,7 +224,10 @@ def parse_foldoc():  						if count_braces == 1:  							see_also = True -					definition += remaining.strip("<> ").replace("{", "").replace("}", "") +				elif line[0] != "[" and line[-1] != "]" and not (line[-1] == "." and line[-2] == "]"): +					remaining = line +				 +				definition += remaining.strip("<> ").replace("{", "").replace("}", "")  			# Check if it's a 'see also' definition  			elif line[0] == "{" and line[-1] == "}": @@ -240,21 +242,24 @@ def parse_foldoc():  					definition = definition = line.strip("{} ")  				else:  					definition += line.replace("{", "").replace("}", "")  -				 -			else: +			#Ignore the note lines enclosed in []s + +			elif len(line) > 1 and line[0] != "[" and line[-1] != "]" and not (line[-1] == "." and line[-2] == "]"):  				# Get the categories  				if len(line) > 0 and line[0] == "<":  					categories = []  					category_parts = line.strip().split(">")  					def_categories = category_parts[0].strip().split(",")  					for category in def_categories: -						categories.append(category.strip("<> ")) +						if "@" not in category: +							categories.append(category.strip("<> "))  					for part in category_parts[1:]:  						definition += part.replace("{", "").replace("}", "")  				elif len(line) > 0:  					#Ignore date lines -					if  len(line) > 1 and line[0] != "(" and line[-1] != ")" and not line[1].isdigit(): +					#TODO: Make this a date regex +					if  len(line) > 1 and line[0] != "(" and line[-1] != ")":  						definition += line.replace("{", "").replace("}", "")  		if len(definition) > 0 and definition[-1] != " ":  			definition += " " @@ -262,10 +267,33 @@ def parse_foldoc():  	#Add the last item  	item = Definition(word, id, "foldoc", "tech", definition.strip("\n "), [], [], [], categories)  	items.append(item) -				 -	for item in items: -		# if item.see_also: -		print item.word + "| " + str(item.id) + " : " + item.definition + str(item.see_also) + "<" + str(item.categories) +	 +	try: +		con = sqlite3.connect('dictionaries/wordnet.db'); +		con.text_factory = str +		with con: +			cur = con.cursor() +			for item in items: +				cur.execute("INSERT INTO definitions(word, dictionary_id, type_id, sub_id, definition) values(?, ?, ?, ?, ?)", [item.word, 2, 5, item.id, item.definition]) +				def_id = cur.lastrowid + +				for category in item.categories: +					cur.execute("SELECT * FROM categories WHERE category = ?", [category]) +					db_category = cur.fetchone() + +					# Only make a new category if it doesn't exist, otherwise use the existing ID +					if db_category is not None: +						cat_id = db_category[0] +					else: +						cur.execute("INSERT INTO categories(category) values(?)", [category]) +						cat_id = cur.lastrowid + +					cur.execute("INSERT INTO definition_categories(category_id, definition_id) values(?, ?)", [cat_id, def_id]) + + +	except sqlite3.Error, e: +		print "Database Error %s" % (e.args[0]) +		sys.exit(1)  def main():  | 
