import nltk
import re
class Context:
    """Tokenized, part-of-speech-tagged text whose words can be swapped out.

    The input is split into tokens with ``nltk.word_tokenize`` and tagged
    with ``nltk.pos_tag``.  Individual tokens can be read or replaced by
    position, and ``generate`` joins the current tokens back into a
    readable string.
    """

    # Ordered (pattern, replacement) rules that undo the spacing introduced
    # by joining tokens with single spaces.  They are applied top to bottom.
    # The contraction rules ignore case so that sentence-initial or
    # upper-case tokens ("Gon na", "DO N'T") are rejoined as well.
    _CLEANUP_RULES = (
        # "do n't" -> "don't", "it 's" -> "it's"
        (re.compile(r" (n\'t|'\w+)\b", re.IGNORECASE), r'\1'),
        # Opening quote token: becomes '"' and hugs the following word.
        (re.compile(r"`` ?"), '"'),
        # Closing quote token: becomes '"' and hugs the preceding word.
        (re.compile(r" ?''"), '"'),
        # Punctuation that attaches to the word on its left.
        (re.compile(r' ([.,?!)])'), r'\1'),
        # Opening parenthesis attaches to the word on its right.
        (re.compile(r'([(]) '), r'\1'),
        # "gon na" / "wan na" -> "gonna" / "wanna"
        (re.compile(r'\b(gon|wan) na\b', re.IGNORECASE), r'\1na'),
    )

    def __init__(self, contents: str) -> None:
        """Tokenize and tag *contents*.

        Args:
            contents: The raw text to work on.
        """
        self._words = nltk.word_tokenize(contents)
        self._tagged_words = nltk.pos_tag(self._words)

    def generate(self) -> str:
        """Return the current tokens joined back into readable text.

        Tokens are joined with single spaces, then the clean-up rules
        re-attach contractions, quotes and punctuation.  Finally only the
        first character is upper-cased; the rest of the text keeps its
        original casing (``str.capitalize`` would lowercase it, mangling
        proper nouns, "I" and later sentence starts).

        Returns:
            The reconstructed text, or ``''`` when there are no tokens.
        """
        text = ' '.join(self._words)
        for pattern, replacement in self._CLEANUP_RULES:
            text = pattern.sub(replacement, text)
        # Slicing keeps this safe for the empty string.
        return text[:1].upper() + text[1:]

    def get(self, position: int):
        """Return the ``(word, tag)`` pair stored at *position*.

        Raises:
            IndexError: If *position* is out of range.
        """
        return self._tagged_words[position]

    def set(self, position: int, value: str) -> None:
        """Replace the word at *position* with *value* and re-tag the text.

        All words are passed to ``nltk.pos_tag`` again after the
        replacement, so every stored tag is refreshed, not just the one at
        *position*.

        Raises:
            IndexError: If *position* is out of range.
        """
        self._words[position] = value
        self._tagged_words = nltk.pos_tag(self._words)

    def nouns(self):
        """Yield a ``Token`` for every word whose tag starts with ``'N'``.

        With NLTK's default Penn Treebank tagset these are presumably the
        noun tags (NN, NNS, NNP, NNPS).  This is a generator: it walks the
        tag list that was current when iteration started, because ``set``
        rebinds ``_tagged_words`` to a new list rather than mutating it.
        """
        for index, (_word, tag) in enumerate(self._tagged_words):
            # startswith() is safe for an empty tag, unlike tag[0].
            if tag.startswith('N'):
                yield Token(self, index)
class Token:
    """Handle on one position inside a context object.

    A token stores the owning context and a position.  Reads and writes
    are forwarded to the context's ``get(position)`` and
    ``set(position, value)`` methods.
    """

    def __init__(self, context, position):
        """Remember the owning *context* and the *position* within it."""
        self._context, self._position = context, position

    def get(self):
        """Return whatever the context's ``get`` yields for this position."""
        owner, index = self._context, self._position
        return owner.get(index)

    def set(self, value):
        """Store *value* at this position via the context's ``set``.

        Returns:
            The result of the context's ``set`` call, passed through as is.
        """
        owner, index = self._context, self._position
        return owner.set(index, value)