diff options
author | Joe Robinson <joe@sol.home> | 2014-09-15 23:26:08 +0100 |
---|---|---|
committer | Joe Robinson <joe@sol.home> | 2014-09-15 23:26:08 +0100 |
commit | 699addc3f843616ba50275f6b0c54b05a15fa2f2 (patch) | |
tree | 57b2477875c970b90cb721f8a4ea15af84ac08da /bladictionary.py |
Initial commit, working on parsing the XML
Diffstat (limited to 'bladictionary.py')
-rw-r--r-- | bladictionary.py | 132 |
1 files changed, 132 insertions, 0 deletions
# Contents of bladictionary.py as added by this commit
# (diff --git a/bladictionary.py b/bladictionary.py, new file mode 100644,
# index 0000000..df5d0fc).
"""Fetch a dictionary entry from the Aonaware DictService and parse it.

The service returns XML whose ``WordDefinition`` element holds a plain-text
entry.  ``parse_xml`` turns that text into a list of ``Definition`` objects,
one per numbered sense.

The code runs unchanged on Python 2.7 and Python 3: every ``print`` call
takes a single argument, which both versions print identically.
"""

import re

try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote  # Python 2

# The third-party imports are guarded so that the pure-text parser below can
# be imported (and tested) on a machine without them; only get_xml() needs
# lxml, and it reports the missing dependency when it is called.
try:
    import requests  # noqa: F401 -- imported by the module but not used yet
except ImportError:
    requests = None

try:
    from lxml import etree as ET
except ImportError:
    ET = None

version = "2.0.1b"

# Endpoint and XML namespace of the dictionary web service.
API_URL = ("http://services.aonaware.com/DictService/DictService.asmx"
           "/DefineInDict")
NAMESPACE = "{http://services.aonaware.com/webservices/}"

# A line starts a new sense when it has a non-space character within its
# first _ID_INDENT characters; deeper-indented lines continue the current
# sense.  It's a bit hacky, maybe change this later.
_ID_INDENT = 7

# "[label: ...]" annotation (closing bracket optional, in case it is cut off).
_BRACKET_RE = re.compile(r"\[([^\]]*)\]?")
# "{term}" inside an annotation.
_TERM_RE = re.compile(r"\{([^}]*)\}")


class Definition(object):
    """One numbered sense of a word.

    Attributes:
        id: sense number as found in the entry text (a string such as "1").
            It is relative to the word type, e.g. noun 1, noun 2, verb 1,
            verb 2 -- not to the entire list.
        dictionary: identifier of the dictionary the sense came from.
        word_type: word-type label that precedes the sense number in the
            entry text (for example "adj" or "n").
        definition: the definition text, without bracketed annotations.
        synonyms: list of synonym strings.
        antonyms: list of antonym strings.
    """

    def __init__(self, id, dictionary, word_type, definition,
                 synonyms=None, antonyms=None):
        self.id = id
        self.dictionary = dictionary
        self.word_type = word_type
        self.definition = definition
        # Copy the lists so that instances never share state with the caller
        # or with each other.
        self.synonyms = list(synonyms or [])
        self.antonyms = list(antonyms or [])

    def __repr__(self):
        return "Definition(%r, %r, %r, %r, %r, %r)" % (
            self.id, self.dictionary, self.word_type, self.definition,
            self.synonyms, self.antonyms)


def get_xml(word="red", dict_id="wn"):
    """Download the entry for ``word`` and return its ``Definitions`` element.

    The root element tag is WordDefinition, which is the same as the tag of
    the element that contains the definition text.  That's awkward, so the
    first ``Definitions`` element is returned instead and callers search
    below it.

    Args:
        word: the word to look up.
        dict_id: value of the service's ``dictId`` query parameter.

    Returns:
        The first ``Definitions`` element of the response.

    Raises:
        ImportError: if lxml is not installed.
        ValueError: if the response contains no ``Definitions`` element.
    """
    if ET is None:
        raise ImportError("lxml is required to download definitions")

    api_url = "%s?dictId=%s&word=%s" % (API_URL, quote(dict_id), quote(word))
    document = ET.parse(api_url)

    for element in document.iter(NAMESPACE + "Definitions"):
        return element
    raise ValueError("no Definitions element in the response for %r" % word)


def _extract_terms(body):
    """Return the terms listed in the body of a syn/ant annotation."""
    terms = _TERM_RE.findall(body)
    if not terms:
        # No {braces}: fall back to a plain comma-separated list.
        terms = body.split(",")
    return [term.strip() for term in terms if term.strip()]


def _build_definition(sense_id, word_type, fragments, dictionary):
    """Build a Definition from the collected text fragments of one sense."""
    # Join the lines first so that an annotation wrapped over several lines
    # is seen as a whole.
    text = " ".join(fragment.strip() for fragment in fragments)

    synonyms = []
    antonyms = []
    for note in _BRACKET_RE.findall(text):
        label, _, body = note.partition(":")
        label = label.strip()
        if label == "syn":
            synonyms.extend(_extract_terms(body))
        elif label == "ant":
            antonyms.extend(_extract_terms(body))

    # The definition proper is whatever is left once the annotations are
    # removed, with runs of whitespace collapsed.
    gloss = " ".join(_BRACKET_RE.sub(" ", text).split())
    return Definition(sense_id, dictionary, word_type, gloss,
                      synonyms, antonyms)


def parse_xml(xml, dictionary="wn"):
    """Parse the first ``WordDefinition`` below ``xml`` into Definitions.

    Only one WordDefinition element is handled for now; add multiples later.

    Args:
        xml: an element (or tree) offering ``iter(tag)``, as returned by
            ``get_xml``.
        dictionary: value stored in each result's ``dictionary`` attribute.

    Returns:
        A list of ``Definition`` objects in the order the senses appear;
        empty when there is no WordDefinition element or it has no text.
    """
    text = None
    for element in xml.iter(NAMESPACE + "WordDefinition"):
        text = element.text
        break
    if not text:
        return []

    items = []
    word_type = ""
    current = None  # [sense id, word type, text fragments] of the open sense

    # The first line of the entry is skipped; the senses start after it.
    for line in text.split("\n")[1:]:
        starts_entry = False
        if line[:_ID_INDENT].strip():
            stripped = line.lstrip()
            if stripped.startswith("[also"):
                # This line comes at the end, don't worry about it for now.
                break
            head, colon, rest = line.partition(":")
            tokens = head.split()
            # A shallow line without "label:" (or one that is itself an
            # annotation) cannot open a sense; it is kept as continuation
            # text instead of crashing the parser.
            starts_entry = bool(colon and tokens
                                and not stripped.startswith("["))

        if starts_entry:
            if current is not None:
                items.append(_build_definition(current[0], current[1],
                                               current[2], dictionary))
            if tokens[0].isdigit():
                # "2: ..." -- same word type as the previous sense.
                sense_id = tokens[0]
            else:
                # "adj 1: ..." -- a new word type; a missing number
                # ("n : ...") is treated as sense 1.
                word_type = tokens[0]
                sense_id = tokens[1] if len(tokens) > 1 else "1"
            current = [sense_id, word_type, [rest]]
        elif current is not None:
            current[2].append(line)

    # Flush the sense that was still open when the text ended.
    if current is not None:
        items.append(_build_definition(current[0], current[1],
                                       current[2], dictionary))
    return items


def main():
    """Look up the default word and print each parsed definition."""
    print("test")
    xml = get_xml()

    items = parse_xml(xml)

    print(len(items))
    for item in items:
        print(item.definition)
        print("\n")


if __name__ == "__main__":
    main()
\ No newline at end of file |