Added FOLDOC dictionary parsing for technical words

author: Joe Robinson <joe@lc8n.com> 2014-09-21 01:08:15 +0100
committer: Joe Robinson <joe@lc8n.com> 2014-09-21 01:08:15 +0100
commit: 7a1af5825afb1ea7f282608bac259507a1a398e3 (patch)
tree: bd92f10ce1ffe1233862bc17f0860160f4e702e9 /bladictionary.py
parent: 36102ee26a1d6c1b90fa14ea30bb346c66164213 (diff)
1 files changed, 57 insertions, 5 deletions
diff --git a/bladictionary.py b/bladictionary.py
index e7038b7..c3e135a 100755
--- a/bladictionary.py
+++ b/bladictionary.py
@@ -7,7 +7,7 @@ import optparse
 from lxml import etree
 import sqlite3
 
-VERSION = "2.1.8b"
+VERSION = "2.1.10b"
 class Definition(object):
 
 	#ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list
@@ -151,8 +151,8 @@ def parse_args():
 	parser.add_option( "-c", "--channel", action = "store", help = "The IRC channel of the request")
 	options, args = parser.parse_args( args )
 
-	types = ["n", "noun", "v", "verb", "adj", "adjective", "adv", "adverb"]
-	dicts = ["wn", "wordnet", "oed", "db"]
+	types = ["n", "noun", "v", "verb", "adj", "adjective", "adv", "adverb", "tech"]
+	dicts = ["wn", "wordnet", "oed", "db", "foldoc"]
 
 	word = ""
 	word_type = ""
@@ -296,6 +296,55 @@ def get_sql(word):
 
 	return items
 
+def _foldoc_text(word, seen):
+	"""Return the definition text found for word in foldoc.txt, or None.
+
+	seen holds the lower-cased words already being resolved, so entries that
+	refer to each other cannot recurse forever.
+	"""
+	seen.add(word.lower())
+	word_len = len(word.split(" "))
+	found = False
+	count_blank = 0
+	parts = []
+	#The with block closes the file on every exit path
+	with open("dictionaries/foldoc.txt") as foldoc:
+		for line in foldoc:
+			line = line.strip()
+			if not found:
+				#Ignore case; compare only as many words as were specified
+				head = " ".join(line.split(" ")[0:word_len]).strip()
+				found = head.lower() == word.lower()
+				continue
+			#Keep reading: skip the blank after the headword, stop at the next
+			if len(line) == 0:
+				count_blank += 1
+				if count_blank == 1:
+					continue
+				break
+			#A line that is only {term} refers to another entry: inline it
+			target = line.strip("{} ")
+			if line[0] == "{" and line[-1] == "}" and target.lower() not in seen:
+				referred = _foldoc_text(target, seen)
+				if referred is not None:
+					parts.append(target + ". " + referred.strip())
+					continue
+			parts.extend(p.strip("{").replace("}", "") for p in line.split(" "))
+	if not found:
+		return None
+	return "".join(p + " " for p in parts)
+
+def parse_foldoc(word, refer = False):
+	"""Look word up in FOLDOC; return a one-item Definition list, or None.
+
+	refer: when True, prefix the text with the word (reached by reference).
+	"""
+	text = _foldoc_text(word, set())
+	if text is None:
+		return
+	prefix = word + ". " if refer else ""
+	return [Definition(word, 0, "foldoc", "tech", prefix + text, [], [], [])]
+
 
 
 def main():
@@ -312,9 +361,12 @@ def main():
 		xml = get_xml(word, word_dict)
 
 		if xml is None:
-			return
+			print "Error finding definitions for " + word
+			sys.exit( 1 )
 
 		items = parse_xml(xml)
+	elif word_dict == "foldoc" or word_type == "tech":
+		items = parse_foldoc(word)
 	else:
 		if word_dict is None or word_dict == "":
 			word_dict = "db"
@@ -336,7 +388,7 @@ def main():
 	else:
 		max_length = 460
 
-	types = ["n", "v", "adj", "adv"]
+	types = ["n", "v", "adj", "adv", "tech"]
 	type_id = 0
 
 	all_types = word_type is ""
author	Joe Robinson <joe@lc8n.com>	2014-09-21 01:08:15 +0100
committer	Joe Robinson <joe@lc8n.com>	2014-09-21 01:08:15 +0100
commit	7a1af5825afb1ea7f282608bac259507a1a398e3 (patch)
tree	bd92f10ce1ffe1233862bc17f0860160f4e702e9 /bladictionary.py
parent	36102ee26a1d6c1b90fa14ea30bb346c66164213 (diff)