summary refs log tree commit diff
path: root/bladictionary.py
diff options
context:
space:
mode:
author Joe Robinson <joe@mumsnet.com> 2014-09-18 10:40:29 +0100
committer Joe Robinson <joe@mumsnet.com> 2014-09-18 10:40:29 +0100
commit 80ea16da36729475643c9dc591a09f516bb5473c (patch)
tree 3dde7d858c3583a0fa89dc0b0c8e7f54f017bbf9 /bladictionary.py
parent adc06d3e9905fcde60d90134f3e4a38241d54546 (diff)
Work in progress on parsing Oxford English Dictionary file
Diffstat (limited to 'bladictionary.py')
-rw-r--r-- bladictionary.py 82
1 files changed, 79 insertions, 3 deletions
diff --git a/bladictionary.py b/bladictionary.py
index cbe6b22..8e1c7b9 100644
--- a/bladictionary.py
+++ b/bladictionary.py
@@ -1,4 +1,9 @@
+#!/usr/bin/python2
+# -*- coding: utf-8 -*-
+
import requests
+import sys
+import shlex
from lxml import etree
version = "2.0.1b"
@@ -6,12 +11,14 @@ class Definition(object):
#ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list
id = 0
+ word = ""
dictionary = ""
word_type = ""
definition = ""
synonyms = []
- def __init__(self, id, dictionary, word_type, definition, synonyms, antonyms):
+ def __init__(self, word, id, dictionary, word_type, definition, synonyms, antonyms):
+ self.word = word
self.id = id
self.dictionary = dictionary
self.word_type = word_type
@@ -69,7 +76,7 @@ def parse_xml(xml):
if id_line:
if id > 0:
#Add a previous line to the array of definitions, if there is one
- item = Definition(id, "wn", word_type, definition, [], [])
+ item = Definition(id, "bla", "wn", word_type, definition, [], [])
items.append(item)
synonyms = []
antonyms = []
@@ -106,7 +113,7 @@ def parse_xml(xml):
if extra_words[0] == "syn:":
for syn in extra_words[1:]:
print syn.strip("{},]")
- synonyms.append(syn.strip("{}"))
+ synonyms.append(syn.strip("{},]"))
syn_line = True
elif extra_words[0] == "ant:":
for ant in extra_words[1:]:
@@ -118,11 +125,80 @@ def parse_xml(xml):
return items
def parse_args():
    """Read the lookup request from the command line, falling back to stdin.

    The first token is the word to look up.  Any later token that names a
    known word type or dictionary sets that option; unrecognised tokens are
    ignored.  If an option is given more than once, the last one wins.

    Returns:
        A ``(word, word_type, word_dict)`` tuple of strings.  ``word_type``
        and ``word_dict`` are ``""`` when not supplied.

    Raises:
        SystemExit: if no word was supplied on the command line or on stdin.
    """
    args = sys.argv[1:]

    if not args:
        # No command-line arguments: read shell-style tokens from stdin.
        args = shlex.split(sys.stdin.read())

    if not args:
        # Previously this fell through to args[0] and died with IndexError.
        sys.exit("usage: bladictionary.py WORD [WORD_TYPE] [DICTIONARY]")

    types = ("n", "noun", "v", "verb", "adj", "adjective", "adv", "adverb")
    dicts = ("wn", "wordnet", "oed")

    word = args[0]
    word_type = ""
    word_dict = ""
    for arg in args[1:]:
        if arg in types:
            word_type = arg
        elif arg in dicts:
            word_dict = arg

    return word, word_type, word_dict
+
def _find_oed_entry(word, path="oed.txt"):
    """Return the first line of *path* whose leading token is *word*, or None."""
    with open(path) as oed_file:
        for line in oed_file:
            # Strip the line ending so a one-token line can still match.
            if line.rstrip("\r\n").split(" ")[0] == word:
                return line
    return None


def _parse_oed_senses(word, entry):
    """Split one dictionary line into a list of Definition objects.

    Tokens after the headword are scanned left to right.  A word-type marker
    or a sense number (a bare number following a token that ends in ".")
    closes the sense being accumulated; every other token is definition text.
    """
    types = ("n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv.")
    items = []
    words = []
    sense_id = 0
    word_type = ""
    prev_part = ""

    def flush():
        # Emit the accumulated text as one sense; skip empty runs such as
        # the gap between a type marker and the first sense number.
        text = " ".join(words).strip()
        if text:
            items.append(
                Definition(word, sense_id, "oed", word_type, text, [], []))
        del words[:]

    for part in entry.rstrip("\r\n").split(" ")[1:]:
        if part in types:
            flush()
            word_type = part
            # Sense ids are numbered per word type, so start over.
            sense_id = 0
        elif part.isdigit() and prev_part.endswith("."):
            flush()
            sense_id = int(part)
        else:
            words.append(part)
        prev_part = part

    # The final sense has no following marker to close it.
    flush()
    return items


def parse_oed(word):
    """Look up *word* in ``oed.txt`` and print its parsed definitions.

    Each line of ``oed.txt`` is treated as one entry whose first
    space-separated token is the headword.  The matching line is printed,
    followed by the word, word type and definition text of every sense
    found on it.  Nothing further is printed when the word is absent.

    Returns:
        None.
    """
    print("test")  # work-in-progress trace output, kept as-is

    word_line = _find_oed_entry(word)
    if word_line is None:
        return

    print(word_line)

    for item in _parse_oed_senses(word, word_line):
        print(item.word)
        print(item.word_type)
        print(item.definition)
def main():
+ word, word_type, word_dict = parse_args()
+ print word_dict
+ if word_dict == "oed":
+ parse_oed(word)
+
xml = get_xml()
if xml is None: