Work in progress on parsing Oxford English Dictionary file

author: Joe Robinson <joe@mumsnet.com> 2014-09-18 10:40:29 +0100
committer: Joe Robinson <joe@mumsnet.com> 2014-09-18 10:40:29 +0100
commit: 80ea16da36729475643c9dc591a09f516bb5473c (patch)
tree: 3dde7d858c3583a0fa89dc0b0c8e7f54f017bbf9 /bladictionary.py
parent: adc06d3e9905fcde60d90134f3e4a38241d54546 (diff)
1 files changed, 79 insertions, 3 deletions
diff --git a/bladictionary.py b/bladictionary.py
index cbe6b22..8e1c7b9 100644
--- a/bladictionary.py
+++ b/bladictionary.py
@@ -1,4 +1,9 @@
+#!/usr/bin/python2
+# -*- coding: utf-8 -*-
+
 import requests
+import sys
+import shlex
 from lxml import etree
 
 version = "2.0.1b"
@@ -6,12 +11,14 @@ class Definition(object):
 
 	#ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list
 	id = 0
+	word = ""
 	dictionary = ""
 	word_type = ""
 	definition = ""
 	synonyms = []
 
-	def __init__(self, id, dictionary, word_type, definition, synonyms, antonyms):
+	def __init__(self, word, id, dictionary, word_type, definition, synonyms, antonyms):
+		self.word = word
 		self.id = id
 		self.dictionary = dictionary
 		self.word_type = word_type
@@ -69,7 +76,7 @@ def parse_xml(xml):
 		if id_line:
 			if id > 0:
 				#Add a previous line to the array of definitions, if there is one
-				item = Definition(id, "wn", word_type, definition, [], [])
+				item = Definition(id, "bla", "wn", word_type, definition, [], [])
 				items.append(item)
 				synonyms = []
 				antonyms = []
@@ -106,7 +113,7 @@ def parse_xml(xml):
 				if extra_words[0] == "syn:":
 					for syn in extra_words[1:]:
 						print syn.strip("{},]")
-						synonyms.append(syn.strip("{}"))
+						synonyms.append(syn.strip("{},]"))
 						syn_line = True
 				elif extra_words[0] == "ant:":
 					for ant in extra_words[1:]:
@@ -118,11 +125,80 @@ def parse_xml(xml):
 
 	return items
 
+def parse_args():
+	"""Read the lookup request from the command line, or from stdin.
+
+	The first argument is the word to look up. Any later argument that
+	names a word type or a dictionary sets the matching filter; other
+	arguments are ignored. If an argument of the same kind appears more
+	than once, the last one wins.
+
+	Returns a (word, word_type, word_dict) tuple. word_type and word_dict
+	are empty strings when not given.
+
+	Exits with a usage message when no word is supplied at all.
+	"""
+	types = ("n", "noun", "v", "verb", "adj", "adjective", "adv", "adverb")
+	dicts = ("wn", "wordnet", "oed")
+
+	args = sys.argv[1:]
+	if not args:
+		#No command line arguments: read a shell-style argument string from stdin
+		args = shlex.split(sys.stdin.read())
+
+	if not args:
+		#Neither source supplied a word; args[0] would raise a bare IndexError
+		sys.exit("usage: bladictionary.py WORD [TYPE] [DICTIONARY]")
+
+	word = args[0]
+	word_type = ""
+	word_dict = ""
+	for arg in args[1:]:
+		if arg in types:
+			word_type = arg
+		elif arg in dicts:
+			word_dict = arg
+
+	return word, word_type, word_dict
+
+def parse_oed(word):
+	"""Look up word in oed.txt and split its entry into Definition objects.
+
+	The first line of oed.txt whose first space-separated token equals word
+	is taken as the entry. The rest of that line is read token by token:
+	a word type marker (e.g. "n." or "—v.") starts a new word type, and a
+	number that follows a token ending in "." starts a new numbered sense.
+	All other tokens are collected as the text of the current definition.
+
+	The word, word type and text of every definition found are printed.
+
+	Returns the list of Definition objects, or None when word has no entry.
+	"""
+	types = ["n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv."]
+	#Debug output kept from the work-in-progress version
+	print("test")
+
+	#Must start as None so a missing word is detected below
+	word_line = None
+	with open("oed.txt") as oed_file:
+		for line in oed_file:
+			if line.split(" ")[0] == word:
+				word_line = line
+				break
+
+	if word_line is None:
+		return
+
+	items = []
+
+	def flush(sense_id, word_type, words):
+		#Store the collected text as a definition; skip empty ones
+		if words:
+			text = " ".join(words)
+			items.append(Definition(word, sense_id, "oed", word_type, text, [], []))
+
+	prev_part = ""
+	sense_id = 0
+	word_type = ""
+	words = []
+	print(word_line)
+	#split() also drops the trailing newline, so the last token can still match
+	for part in word_line.split()[1:]:
 		
+		if part in types:
+			flush(sense_id, word_type, words)
+			words = []
+			word_type = part
+			#IDs are relative to the word type, so start over
+			sense_id = 0
+
+		elif part.isdigit() and prev_part.endswith("."):
+			flush(sense_id, word_type, words)
+			words = []
+			sense_id = int(part)
+		else:
+			words.append(part)
+
+		prev_part = part
+
+	#The last definition has no marker after it, so store it here
+	flush(sense_id, word_type, words)
+
+	for item in items:
+		print(item.word)
+		print(item.word_type)
+		print(item.definition)
+
+	return items
 
 
 
 def main():
+	word, word_type, word_dict = parse_args()
+	print word_dict
+	if word_dict == "oed":
+		parse_oed(word)
+
 	xml = get_xml()
 
 	if xml is None:
author	Joe Robinson <joe@mumsnet.com>	2014-09-18 10:40:29 +0100
committer	Joe Robinson <joe@mumsnet.com>	2014-09-18 10:40:29 +0100
commit	80ea16da36729475643c9dc591a09f516bb5473c (patch)
tree	3dde7d858c3583a0fa89dc0b0c8e7f54f017bbf9 /bladictionary.py
parent	adc06d3e9905fcde60d90134f3e4a38241d54546 (diff)