summary refs log tree commit diff
diff options
context:
space:
mode:
author Joe Robinson <joe@mumsnet.com> 2014-09-18 15:20:46 +0100
committer Joe Robinson <joe@mumsnet.com> 2014-09-18 15:20:46 +0100
commit e6206b9511c1284c5406a5af21147ddbda406006 (patch)
tree 3605fdb266e9b95d0daab121f4fdd411d4184630
parent 778b673dc2e1d1249d303ef28893e4338699291b (diff)
Fixes for OED parsing
-rwxr-xr-x bladictionary.py 18
1 files changed, 11 insertions, 7 deletions
diff --git a/bladictionary.py b/bladictionary.py
index f7fcaec..ec5aff9 100755
--- a/bladictionary.py
+++ b/bladictionary.py
@@ -6,7 +6,7 @@ import sys
import shlex
from lxml import etree
-version = "2.0.5b"
+version = "2.0.7b"
class Definition(object):
#ID is relative to the word type, eg noun 1, noun 2, verb 1, verb 2, not to the entire list
@@ -160,6 +160,7 @@ def parse_args():
def parse_oed(word):
types = ["n.", "—n.", "v.", "—v.", "adj.", "—adj.", "adv.", "—adv."]
file = open("oed.txt")
+ word_line = ""
for line in file:
word_part = line.split(" ")[0]
@@ -168,11 +169,10 @@ def parse_oed(word):
word_line = line
break;
- if word_line is None:
+ if word_line == "":
return
print line
-
prev_part = ""
items = []
id = 0
@@ -182,14 +182,14 @@ def parse_oed(word):
if part in types:
if id > 0:
- item = Definition(id, word, "oed", word_type, definition, [], [])
+ item = Definition(word, id, "oed", word_type, definition, [], [])
items.append(item)
definition = ""
word_type = part
- elif part.isdigit() and prev_part is not None and prev_part[-1] == ".":
- item = Definition(id, word, "oed", word_type, definition, [], [])
+ elif part.isdigit() and prev_part is not None and (prev_part[-1] == "." or prev_part[-1] == ")"):
+ item = Definition(word, id, "oed", word_type, definition, [], [])
items.append(item)
definition = ""
id = part
@@ -198,6 +198,10 @@ def parse_oed(word):
prev_part = part
+ if id == 0:
+ id = 1
+ item = Definition(word, id, "oed", word_type, definition, [], [])
+ items.append(item)
return items
@@ -226,7 +230,7 @@ def main():
if suppress_print:
num_more+=1
else:
- definition = item.word_type + " " + item.id + ": " + item.definition
+ definition = item.word_type + " " + str(item.id) + ": " + item.definition
if line_length + len(definition) > 455:
suppress_print = True
num_more+= 1