"""Fetch a WordNet entry from the Aonaware DictService and parse its senses."""
import re

import requests
from lxml import etree as ET

version = "2.0.1b"

# XML namespace carried by every element of a DictService response.
_NAMESPACE = "{http://services.aonaware.com/webservices/}"
_API_URL = "http://services.aonaware.com/DictService/DictService.asmx/DefineInDict"
_REQUEST_TIMEOUT = 10  # seconds

# A line starts a new sense ("adj 1: ...", "2: ...") when it has text within
# its first 7 columns; continuation lines are indented further than that.
_HEADER_COLUMNS = 7

# "[syn: {a}, {b}]" / "[ant: {c}]" sections embedded in a sense's text.
_SECTION_RE = re.compile(r"\[\s*(\w+)\s*:([^\]]*)\]?")
# One "{word}" reference inside such a section.
_BRACED_RE = re.compile(r"\{([^}]*)\}")


class Definition(object):
    """One sense of a word as listed by a dictionary.

    Attributes:
        id: Sense number. It is relative to the word type (noun 1, noun 2,
            verb 1, verb 2, ...), not to the entire list of senses.
        dictionary: Identifier of the dictionary the sense came from.
        word_type: Part-of-speech label exactly as the dictionary prints it.
        definition: The definition text.
        synonyms: List of synonym strings.
        antonyms: List of antonym strings.
    """

    id = 0
    dictionary = ""
    word_type = ""
    definition = ""

    def __init__(self, id, dictionary, word_type, definition, synonyms, antonyms):
        self.id = id
        self.dictionary = dictionary
        self.word_type = word_type
        self.definition = definition
        # Lists are per-instance; a class-level list would be shared.
        self.synonyms = synonyms
        self.antonyms = antonyms

    def __repr__(self):
        return "Definition(id=%r, dictionary=%r, word_type=%r, definition=%r)" % (
            self.id, self.dictionary, self.word_type, self.definition)


def get_xml(word="red", dict_id="wn"):
    """Download the definition of ``word`` and return its ``Definitions`` element.

    Args:
        word: Word to look up. Defaults to "red", the word the script
            originally queried.
        dict_id: DictService dictionary identifier. Defaults to "wn".

    Returns:
        The first ``Definitions`` element of the response.

    Raises:
        requests.RequestException: If the HTTP request fails, times out or
            returns an error status.
        ValueError: If the response contains no ``Definitions`` element.
    """
    response = requests.get(
        _API_URL,
        params={"dictId": dict_id, "word": word},
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    document = ET.fromstring(response.content)

    # The root element tag is WordDefinition, which is also the tag of the
    # elements that hold the definition text. That is awkward, so hand back
    # the Definitions container instead.
    root = next(iter(document.iter(_NAMESPACE + "Definitions")), None)
    if root is None:
        raise ValueError("response contains no Definitions element")

    for element in root.iter(_NAMESPACE + "WordDefinition"):
        print(element.text)
    return root


def _group_senses(lines):
    """Join each sense header line with its continuation lines into one string."""
    groups = []
    current = None
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if line[:_HEADER_COLUMNS].strip():
            if stripped.startswith("["):
                # Trailing note such as "[also: ...]": it is not a sense, and
                # any lines wrapped from it must not leak into the previous one.
                current = None
                continue
            current = [stripped]
            groups.append(current)
        elif current is not None:
            current.append(stripped)
    return [" ".join(group) for group in groups]


def _split_words(section):
    """Return the words listed in the body of a ``[syn: ...]``-style section."""
    words = _BRACED_RE.findall(section)
    if not words:
        # No "{word}" markup; fall back to a plain comma separated list.
        words = section.split(",")
    return [word.strip() for word in words if word.strip()]


def _parse_sense(text, default_word_type):
    """Build a Definition from one joined sense such as ``"adj 1: text [syn: {x}]"``."""
    header, colon, body = text.partition(":")
    if not colon:
        # No "type id:" prefix at all; keep the text rather than dropping it.
        header, body = "", text

    tokens = header.split()
    word_type = default_word_type
    sense_id = 1  # single-sense entries ("n : ...") carry no number
    if tokens and not tokens[0].isdigit():
        word_type = tokens[0]
        tokens = tokens[1:]
    if tokens and tokens[0].isdigit():
        sense_id = int(tokens[0])

    synonyms = []
    antonyms = []
    for match in _SECTION_RE.finditer(body):
        label = match.group(1)
        if label == "syn":
            synonyms.extend(_split_words(match.group(2)))
        elif label == "ant":
            antonyms.extend(_split_words(match.group(2)))

    # Drop the bracketed sections and collapse the whitespace left behind by
    # line wrapping.
    definition = " ".join(_SECTION_RE.sub(" ", body).split())
    return Definition(sense_id, "wn", word_type, definition, synonyms, antonyms)


def parse_xml(xml):
    """Parse the senses contained in the first ``WordDefinition`` element.

    Args:
        xml: Element (or tree) whose ``iter()`` yields ``WordDefinition``
            elements, e.g. the value returned by ``get_xml()``.

    Returns:
        A list of Definition objects, one per sense, in document order. The
        list is empty when there is no ``WordDefinition`` element or it has
        no text.
    """
    # Only worrying about one definition for now, add multiples later.
    element = next(iter(xml.iter(_NAMESPACE + "WordDefinition")), None)
    if element is None or not element.text:
        return []

    # The first line only repeats the headword.
    lines = element.text.split("\n")[1:]

    items = []
    word_type = ""
    for sense in _group_senses(lines):
        item = _parse_sense(sense, word_type)
        # Headers like "2: ..." omit the word type and inherit the previous one.
        word_type = item.word_type
        items.append(item)
    return items


def main():
    """Fetch the default word and print every parsed definition."""
    print("test")
    xml = get_xml()
    items = parse_xml(xml)
    print(len(items))
    for item in items:
        print(item.definition)
        print("\n")


if __name__ == "__main__":
    main()