# -*- coding: utf-8 -*-
# Reconstructed from the whitespace-collapsed patch "Work in progress on
# parsing Oxford English Dictionary file" (bladictionary.py). Only the two
# definitions that the patch shows in full -- parse_args and parse_oed -- are
# reproduced here; the Definition, parse_xml and main hunks are partial in the
# patch and are left to the real source file.
#
# NOTE(review): the same patch changes Definition.__init__ to take `word` as
# its first argument, yet the parse_xml call shown in the patch is
# Definition(id, "bla", "wn", ...), i.e. `id` lands in the `word` slot.
# Verify that call site against the full file.
import shlex
import sys

# Command-line tokens recognised as a word type.
_WORD_TYPES = frozenset((
    "n", "noun", "v", "verb", "adj", "adjective", "adv", "adverb",
))

# Command-line tokens recognised as a dictionary name.
_DICTIONARIES = frozenset(("wn", "wordnet", "oed"))

# Tokens inside an oed.txt entry that start a new word type.
_OED_TYPE_MARKERS = frozenset((
    "n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv.",
))


def parse_args():
    """Return ``(word, word_type, word_dict)`` taken from the command line.

    Arguments are read from ``sys.argv``; when none are given, standard
    input is read and split with shell-like quoting rules instead. The
    first argument is the word to look up. Each later argument is compared
    against the known word types and dictionary names: unrecognised
    arguments are ignored, and when several match the last one wins.
    ``word_type`` and ``word_dict`` are ``""`` when not supplied.

    Raises:
        SystemExit: if neither ``sys.argv`` nor standard input supplies a
            word (previously this surfaced as a bare IndexError).
    """
    args = sys.argv[1:] or shlex.split(sys.stdin.read())
    if not args:
        sys.exit("no word given: expected WORD [TYPE] [DICTIONARY]")

    word = args[0]
    word_type = ""
    word_dict = ""
    for arg in args[1:]:
        if arg in _WORD_TYPES:
            word_type = arg
        elif arg in _DICTIONARIES:
            word_dict = arg

    return word, word_type, word_dict


def _split_oed_senses(tokens):
    """Group the tokens of one entry into (sense_id, word_type, text) tuples."""
    senses = []
    word_type = ""
    sense_id = 0
    words = []
    prev = ""

    for token in tokens:
        is_type = token in _OED_TYPE_MARKERS
        # A bare number only starts a new sense when the previous token
        # closed a sentence or was a type marker (both end with ".").
        is_number = token.isdigit() and prev.endswith(".")

        if is_type or is_number:
            # Close the sense collected so far; nothing is emitted when a
            # number directly follows a type marker.
            if words:
                senses.append((sense_id or 1, word_type, " ".join(words)))
                words = []
            if is_type:
                word_type = token
                # Ids are relative to the word type, so restart numbering.
                sense_id = 0
            else:
                sense_id = int(token)
        else:
            words.append(token)

        prev = token

    # The final sense has no following marker to close it.
    if words:
        senses.append((sense_id or 1, word_type, " ".join(words)))

    return senses


def parse_oed(word):
    """Look up ``word`` in ``oed.txt`` and return its parsed definitions.

    The file is scanned for the first line whose leading space-delimited
    token equals ``word`` exactly. The rest of that line is split into
    senses at word-type markers (``n.``, ``—v.``, ...) and at sense numbers,
    and one ``Definition`` is built per sense with dictionary ``"oed"``.
    A sense that carries no number is given id 1.

    The word, word type and text of every definition are printed, as are
    the debug marker ``test`` and the raw matching line.

    Returns:
        A list of ``Definition`` objects; empty when the word is not found.

    Raises:
        IOError: if ``oed.txt`` cannot be opened.
    """
    print("test")

    entry = None
    with open("oed.txt") as oed_file:
        for line in oed_file:
            headword, _, rest = line.partition(" ")
            if headword.strip() == word:
                entry = line
                break

    if entry is None:
        return []

    print(entry)

    # Definition takes the word first, then the sense id.
    items = [
        Definition(word, sense_id, "oed", word_type, text, [], [])
        for sense_id, word_type, text in _split_oed_senses(rest.split())
    ]

    for item in items:
        print(item.word)
        print(item.word_type)
        print(item.definition)

    return items