Commit 760bcddc by Vik Paruchuri

adding markup for grammatical errors

parent e2a4b13c
...@@ -82,10 +82,14 @@ class FeatureExtractor(object): ...@@ -82,10 +82,14 @@ class FeatureExtractor(object):
""" """
word_counts = [max(len(t),1) for t in tokens] word_counts = [max(len(t),1) for t in tokens]
good_pos_tags = [] good_pos_tags = []
min_pos_seq=2
max_pos_seq=4
for i in xrange(0, len(text)): for i in xrange(0, len(text)):
pos_seq = [tag[1] for tag in pos[i]] pos_seq = [tag[1] for tag in pos[i]]
pos_ngrams = util_functions.ngrams(pos_seq, 2, 4) pos_ngrams = util_functions.ngrams(pos_seq, min_pos_seq, max_pos_seq)
overlap_ngrams = [i for i in pos_ngrams if i in self._good_pos_ngrams] long_pos_ngrams=[z for z in pos_ngrams if z.count(' ')==(max_pos_seq-1)]
bad_pos_positions=[z for z in xrange(0,len(long_pos_ngrams)) if long_pos_ngrams[z] not in self._good_pos_ngrams]
overlap_ngrams = [z for z in pos_ngrams if z in self._good_pos_ngrams]
good_pos_tags.append(len(overlap_ngrams)) good_pos_tags.append(len(overlap_ngrams))
return good_pos_tags return good_pos_tags
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment