Initial commit, working on parsing the XML

author: Joe Robinson <joe@sol.home> 2014-09-15 23:26:08 +0100
committer: Joe Robinson <joe@sol.home> 2014-09-15 23:26:08 +0100
commit: 699addc3f843616ba50275f6b0c54b05a15fa2f2 (patch)
tree: 57b2477875c970b90cb721f8a4ea15af84ac08da /bladictionary.py
1 files changed, 132 insertions, 0 deletions
diff --git a/bladictionary.py b/bladictionary.py
new file mode 100644
index 0000000..df5d0fc
--- /dev/null
+++ b/bladictionary.py
@@ -0,0 +1,132 @@
+import requests
+from lxml import etree as ET
+
+version = "2.0.1b"
+class Definition(object):
+
+	#ID is relative to the word type (e.g. noun 1, noun 2, verb 1, verb 2), not to the entire list
+	id = 0
+	dictionary = ""
+	word_type = ""
+	definition = ""
+	synonyms = []
+
+	def __init__(self, id, dictionary, word_type, definition, synonyms, antonyms):
+		self.id = id
+		self.dictionary = dictionary
+		self.word_type = word_type
+		self.definition = definition
+		self.synonyms = synonyms
+
+def get_xml():
+	api_url = "http://services.aonaware.com/DictService/DictService.asmx/DefineInDict?dictId=wn&word=red"
+
+	xml = ET.parse("http://services.aonaware.com/DictService/DictService.asmx/DefineInDict?dictId=wn&word=red")
+
+	#The root element's tag is WordDefinition, which is the same tag as the element that contains the definition text
+	#That's awkward, so let's just use the first Definitions element as the root instead
+	for element in xml.iter("{http://services.aonaware.com/webservices/}Definitions"):
+		root = element
+		break;
+
+	for element in root.iter("{http://services.aonaware.com/webservices/}WordDefinition"):
+		print element.text
+
+	return root
+
+def parse_xml(xml):
+
+	#Only worrying about one definition for now, add multiples later
+	for element in xml.iter("{http://services.aonaware.com/webservices/}WordDefinition"):
+		definition = element.text
+		break;
+
+	definition_lines = definition.split("\n")
+
+	items = []
+	synonyms = []
+	antonyms = []
+	id = 0
+	syn_line = False
+	ant_line = False
+	for line in definition_lines[1:]:
+		extra_parts = line.split("[")
+		line_parts = extra_parts[0].split(":")
+
+		#Work out if this is a line that includes identifying information (word type or id)
+		#This is done by checking the indentation - if the leading characters are all spaces, it's not
+		#(note: the slice below checks the first 7 characters, not 8). It's a bit hacky, maybe change this later
+
+		id_line = False
+		for char in line_parts[0][0:7]:
+			if char != " ":
+				id_line = True
+
+		if id_line:
+			if id > 0:
+				#Add the previous definition to the list of definitions, if there is one
+				item = Definition(id, "wn", word_type, definition, [], [])
+				items.append(item)
+				synonyms = []
+				antonyms = []
+				syn_line = False
+				ant_line = False
+
+			id_parts = line_parts[0].strip().split(" ")
+
+			if id_parts[0] == "[also":
+				#This line comes at the end, don't worry about it for now
+				break;
+
+			if id_parts[0].isdigit() != True:
+				word_type = id_parts[0]
+				id = id_parts[1]
+				definition = line_parts[1].strip() + " "
+			else:
+				id = id_parts[0]
+				definition = line_parts[1].strip() + " "
+
+			for word in line_parts[2:]:
+				definition += word + " "
+
+		else:
+			definition += line_parts[0].strip() + " "
+			if len(line_parts) > 1:
+				for part in line_parts[1:-1]:
+					definition += part + ":"
+				definition += line_parts[-1]
+
+		if len(extra_parts) > 0:
+			for part in extra_parts:
+				extra_words = part.split(" ")
+				if extra_words[0] == "syn:":
+					for syn in extra_words[1:]:
+						print syn.strip("{},]")
+						synonyms.append(syn.strip("{}"))
+						syn_line = True
+				elif extra_words[0] == "ant:":
+					for ant in extra_words[1:]:
+						antonyms.append(ant.strip("{},]"))
+						ant_line = True
+
+		print definition
+		print "\n"
+
+	return items
+
+		
+
+
+
+def main():
+	print "test"
+	xml = get_xml()
+
+	items = parse_xml(xml)
+
+	print len(items)
+	for item in items:
+		print item.definition
+		print "\n"
+if __name__ == "__main__":
+	main()
+\ No newline at end of file
author	Joe Robinson <joe@sol.home>	2014-09-15 23:26:08 +0100
committer	Joe Robinson <joe@sol.home>	2014-09-15 23:26:08 +0100
commit	699addc3f843616ba50275f6b0c54b05a15fa2f2 (patch)
tree	57b2477875c970b90cb721f8a4ea15af84ac08da /bladictionary.py