summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoe Robinson <joe@sol.home>2014-09-15 23:26:08 +0100
committerJoe Robinson <joe@sol.home>2014-09-15 23:26:08 +0100
commit699addc3f843616ba50275f6b0c54b05a15fa2f2 (patch)
tree57b2477875c970b90cb721f8a4ea15af84ac08da
Initial commit, working on parsing the XML
-rw-r--r--bladictionary.py132
1 files changed, 132 insertions, 0 deletions
diff --git a/bladictionary.py b/bladictionary.py
new file mode 100644
index 0000000..df5d0fc
--- /dev/null
+++ b/bladictionary.py
@@ -0,0 +1,132 @@
import re

import requests
from lxml import etree as ET
+
# Version string of this module.
version = "2.0.1b"


class Definition(object):
    """One sense of a word, as parsed from a dictionary entry.

    Attributes:
        id: Sense number. It is relative to the word type (noun 1, noun 2,
            verb 1, verb 2, ...), not to the entire list of senses.
        dictionary: Identifier of the dictionary the sense came from.
        word_type: Word type label of the sense (e.g. "n", "adj").
        definition: The definition text.
        synonyms: List of synonym strings.
        antonyms: List of antonym strings.
    """

    # Class-level defaults. Only immutable values are used here so that no
    # mutable list can be shared between the class and its instances.
    id = 0
    dictionary = ""
    word_type = ""
    definition = ""
    synonyms = ()
    antonyms = ()

    # NOTE: the parameter name `id` shadows the builtin, but it is part of
    # the constructor's public signature and is kept for compatibility.
    def __init__(self, id, dictionary, word_type, definition, synonyms, antonyms):
        """Store the fields of a single sense.

        Args:
            id: Sense number relative to the word type.
            dictionary: Identifier of the source dictionary.
            word_type: Word type label.
            definition: Definition text.
            synonyms: Iterable of synonyms (``None`` is treated as empty).
            antonyms: Iterable of antonyms (``None`` is treated as empty).
        """
        self.id = id
        self.dictionary = dictionary
        self.word_type = word_type
        self.definition = definition
        # Copy the sequences so later changes to the caller's lists do not
        # silently alter this instance.
        self.synonyms = list(synonyms or [])
        self.antonyms = list(antonyms or [])

    def __repr__(self):
        return "Definition(%r, %r, %r, %r, %r, %r)" % (
            self.id,
            self.dictionary,
            self.word_type,
            self.definition,
            self.synonyms,
            self.antonyms,
        )
+
def get_xml(word="red", dict_id="wn"):
    """Fetch a word's entry from the Aonaware dictionary web service.

    Args:
        word: The word to look up. Defaults to "red", the word that was
            previously hard-coded in the request URL.
        dict_id: Identifier of the dictionary to query. Defaults to "wn",
            the value that was previously hard-coded (presumably WordNet;
            verify against the service's dictionary list).

    Returns:
        The ``Definitions`` element of the response document.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            service answers with an error status.
        ValueError: If the response contains no ``Definitions`` element.
    """
    api_url = (
        "http://services.aonaware.com/DictService/DictService.asmx"
        "/DefineInDict"
    )
    namespace = "{http://services.aonaware.com/webservices/}"

    # Let requests build the query string so the word is URL-encoded, and
    # bound the wait so a stalled service cannot hang the caller forever.
    response = requests.get(
        api_url,
        params=[("dictId", dict_id), ("word", word)],
        timeout=10,
    )
    response.raise_for_status()
    document = ET.fromstring(response.content)

    # The root element's tag is WordDefinition, which is the same tag as the
    # elements that hold the definition text. That is awkward to search, so
    # hand back the Definitions container instead of the document root.
    definitions = next(iter(document.iter(namespace + "Definitions")), None)
    if definitions is None:
        raise ValueError(
            "dictionary service response has no Definitions element"
        )
    return definitions
+
_WORD_DEFINITION_TAG = "{http://services.aonaware.com/webservices/}WordDefinition"

# Matches a complete bracketed relation list such as "[syn: {a}, {b}]".
_RELATION_RE = re.compile(r"\[(syn|ant):([^\]]*)\]")

# A line with a non-space character within its first 7 columns carries
# identifying information (word type and/or sense number); deeper-indented
# lines continue the previous sense. It is a layout heuristic, not a grammar.
_HEADER_INDENT = 7


def _split_relations(text):
    """Split a sense's text into (definition, synonyms, antonyms)."""
    relations = {"syn": [], "ant": []}
    for kind, body in _RELATION_RE.findall(text):
        for term in body.split(","):
            term = term.strip(" {}")
            if term:
                relations[kind].append(term)
    # Whatever is left once the relation lists are removed is the definition.
    gloss = " ".join(_RELATION_RE.sub(" ", text).split())
    return gloss, relations["syn"], relations["ant"]


def _group_senses(text):
    """Group the entry's lines into (word_type, sense_id, fragments) tuples."""
    senses = []
    word_type = ""
    # The first line is skipped, as the original implementation did
    # (presumably it holds the headword).
    for line in text.split("\n")[1:]:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith("[also"):
            # This list comes at the end of the entry and is not parsed yet.
            break

        is_header = (
            not stripped.startswith("[")
            and bool(line[:_HEADER_INDENT].strip(" "))
        )
        if is_header:
            label, colon, rest = stripped.partition(":")
            tokens = label.split()
            if colon and tokens:
                if tokens[0].isdigit():
                    # Bare number: another sense of the current word type.
                    sense_id = tokens[0]
                else:
                    word_type = tokens[0]
                    # An entry with a single sense may omit the number.
                    sense_id = tokens[1] if len(tokens) > 1 else "1"
                senses.append((word_type, sense_id, [rest]))
                continue

        # Continuation line: belongs to the most recent sense, if any.
        if senses:
            senses[-1][2].append(stripped)
    return senses


def parse_xml(xml):
    """Parse the first WordDefinition of a dictionary response into senses.

    Only the first ``WordDefinition`` element is handled for now. Its text
    is assumed to be laid out as lightly indented header lines of the form
    ``<word type> <number>: <text>`` or ``<number>: <text>``, followed by
    more deeply indented continuation lines. Bracketed ``[syn: ...]`` and
    ``[ant: ...]`` lists, which may wrap across lines, are removed from the
    definition text and returned as synonyms and antonyms.

    Args:
        xml: An element (or tree) exposing ``iter(tag)``, such as the value
            returned by ``get_xml``.

    Returns:
        A list of ``Definition`` objects, one per sense, in document order.
        The sense ``id`` is the number as it appears in the text (a string).
        The list is empty when there is no definition text.
    """
    element = next(iter(xml.iter(_WORD_DEFINITION_TAG)), None)
    if element is None or not element.text:
        return []

    items = []
    for word_type, sense_id, fragments in _group_senses(element.text):
        # Join the wrapped lines first so a relation list that was split
        # across lines is seen as one bracketed group.
        text = " ".join(" ".join(fragments).split())
        gloss, synonyms, antonyms = _split_relations(text)
        items.append(
            Definition(sense_id, "wn", word_type, gloss, synonyms, antonyms)
        )
    return items
+
+
+
+
+
def main():
    """Fetch the dictionary entry, parse it, and print each definition."""
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("test")
    xml = get_xml()

    items = parse_xml(xml)

    print(len(items))
    for item in items:
        print(item.definition)
        print("\n")


if __name__ == "__main__":
    main()