diff options
author | Joe Robinson <joe@mumsnet.com> | 2014-09-18 15:20:46 +0100 |
---|---|---|
committer | Joe Robinson <joe@mumsnet.com> | 2014-09-18 15:20:46 +0100 |
commit | e6206b9511c1284c5406a5af21147ddbda406006 (patch) | |
tree | 3605fdb266e9b95d0daab121f4fdd411d4184630 /bladictionary.py | |
parent | 778b673dc2e1d1249d303ef28893e4338699291b (diff) |
Fixes for OED parsing
Diffstat (limited to 'bladictionary.py')
-rwxr-xr-x | bladictionary.py | 18 |
1 file changed, 11 insertions, 7 deletions
diff --git a/bladictionary.py b/bladictionary.py index f7fcaec..ec5aff9 100755 --- a/bladictionary.py +++ b/bladictionary.py @@ -6,7 +6,7 @@ import sys import shlex from lxml import etree -version = "2.0.5b" +version = "2.0.7b" class Definition(object): #ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list @@ -160,6 +160,7 @@ def parse_args(): def parse_oed(word): types = ["n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv."] file = open("oed.txt") + word_line = "" for line in file: word_part = line.split(" ")[0] @@ -168,11 +169,10 @@ def parse_oed(word): word_line = line break; - if word_line is None: + if word_line == "": return print line - prev_part = "" items = [] id = 0 @@ -182,14 +182,14 @@ def parse_oed(word): if part in types: if id > 0: - item = Definition(id, word, "oed", word_type, definition, [], []) + item = Definition(word, id, "oed", word_type, definition, [], []) items.append(item) definition = "" word_type = part - elif part.isdigit() and prev_part is not None and prev_part[-1] == ".": - item = Definition(id, word, "oed", word_type, definition, [], []) + elif part.isdigit() and prev_part is not None and (prev_part[-1] == "." or prev_part[-1] == ")"): + item = Definition(word, id, "oed", word_type, definition, [], []) items.append(item) definition = "" id = part @@ -198,6 +198,10 @@ def parse_oed(word): prev_part = part + if id == 0: + id = 1 + item = Definition(word, id, "oed", word_type, definition, [], []) + items.append(item) return items @@ -226,7 +230,7 @@ def main(): if suppress_print: num_more+=1 else: - definition = item.word_type + " " + item.id + ": " + item.definition + definition = item.word_type + " " + str(item.id) + ": " + item.definition if line_length + len(definition) > 455: suppress_print = True num_more+= 1 |