Commit d2ec5c55 by Vik Paruchuri

put grammar detection in a function; add grammar errors per word to the feature extractor
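
The refactor below moves the POS-based grammar check into a helper (_get_grammar_errors): for each essay it takes POS-tag n-grams of length 2 to 4, counts how many appear in the whitelist of "good" POS n-grams, and divides by the essay's word count; at training time the mean of those per-essay proportions is stored. The following is a minimal standalone sketch of that computation, not the project's code: pos_ngrams is a local stand-in for util_functions.ngrams, and `good` is a made-up whitelist in place of the pickled good_pos_ngrams list.

    # Standalone sketch (assumptions: stubbed n-gram helper, hypothetical whitelist).
    def pos_ngrams(pos_seq, min_n=2, max_n=4):
        # All n-grams of the POS sequence for n = min_n..max_n, space-joined.
        grams = []
        for n in range(min_n, max_n + 1):
            for i in range(len(pos_seq) - n + 1):
                grams.append(" ".join(pos_seq[i:i + n]))
        return grams

    def good_pos_tag_proportions(pos, tokens, good_pos_ngrams):
        # pos: one list of (word, tag) pairs per essay; tokens: one token list per essay.
        # Returns, per essay, the count of whitelisted POS n-grams divided by the word count.
        proportions = []
        for essay_pos, essay_tokens in zip(pos, tokens):
            word_count = max(len(essay_tokens), 1)
            seq = [tag for _, tag in essay_pos]
            overlap = [g for g in pos_ngrams(seq) if g in good_pos_ngrams]
            proportions.append(len(overlap) / float(word_count))
        return proportions

    good = {"DT NN", "DT NN VBZ", "NN VBZ"}          # hypothetical whitelist
    pos = [[("the", "DT"), ("dog", "NN"), ("runs", "VBZ")],
           [("runs", "VBZ"), ("dog", "NN")]]
    tokens = [["the", "dog", "runs"], ["runs", "dog"]]
    props = good_pos_tag_proportions(pos, tokens, good)
    print(props)                           # [1.0, 0.0] per-essay proportions
    print(sum(props) / float(len(props)))  # 0.5, the kind of scalar stored as _grammar_errors_per_word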

parent 9b0033c9
@@ -40,9 +40,10 @@ class FeatureExtractor(object):
                 self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
                 self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
                 self.dict_initialized = True
-                self._mean_spelling_errors=sum(e_set._spelling_errors)/len(e_set._spelling_errors)
-                self._spell_errors_per_word=sum(e_set._spelling_errors)/sum([len(t) for t in e_set._text])
-                self._grammar_errors_per_word=[]
+                self._mean_spelling_errors=sum(e_set._spelling_errors)/float(len(e_set._spelling_errors))
+                self._spell_errors_per_word=sum(e_set._spelling_errors)/float(sum([len(t) for t in e_set._text]))
+                self._grammar_errors_per_word=sum(self._get_grammar_errors(
+                    e_set._pos,e_set._text,e_set._tokens))/float(len(e_set._text))
                 ret = "ok"
             else:
                 raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")
@@ -64,6 +65,17 @@ class FeatureExtractor(object):
         pickle.dump(good_pos_ngrams, open(base_path + 'good_pos_ngrams.p', 'wb'))
         return good_pos_ngrams

+    def _get_grammar_errors(self,pos,text,tokens):
+        word_counts = [max(len(t),1) for t in tokens]
+        good_pos_tags = []
+        for i in xrange(0, len(text)):
+            pos_seq = [tag[1] for tag in pos[i]]
+            pos_ngrams = util_functions.ngrams(pos_seq, 2, 4)
+            overlap_ngrams = [i for i in pos_ngrams if i in self._good_pos_ngrams]
+            good_pos_tags.append(len(overlap_ngrams))
+        good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))]
+        return good_pos_tag_prop
+
     def gen_length_feats(self, e_set):
         """
         Generates length based features from an essay set
@@ -77,14 +89,7 @@ class FeatureExtractor(object):
         ap_count = [e.count("'") for e in text]
         punc_count = [e.count(".") + e.count("?") + e.count("!") for e in text]
         chars_per_word = [lengths[m] / float(word_counts[m]) for m in xrange(0, len(text))]
-        good_pos_tags = []
-        for i in xrange(0, len(text)):
-            pos_seq = [tag[1] for tag in e_set._pos[i]]
-            pos_ngrams = util_functions.ngrams(pos_seq, 2, 4)
-            overlap_ngrams = [i for i in pos_ngrams if i in self._good_pos_ngrams]
-            good_pos_tags.append(len(overlap_ngrams))
-        good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))]
-        self._grammar_errors_per_word=[1-good_pos_tag_prop[m] for m in xrange(0,len(text))]
+        good_pos_tag_prop = self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
         length_arr = numpy.array((
             lengths, word_counts, comma_count, ap_count, punc_count, chars_per_word, good_pos_tags,
...
-in order to replicate the experiment .you would need to know how much vinegar to pour ,you need to know how much distilled water and you aso need to know how your going to weigh it .
\ No newline at end of file
+in order to replicate this experiment , we would need to know the temperature of the vinegar as well as how much vinegar to put in . both of these could vary and therefore change the result of the experiment .
\ No newline at end of file
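
The float() casts added in the first hunk matter because this code runs on Python 2 (note the xrange calls), where dividing one int by another truncates toward zero. A quick standalone illustration of the two spelling-error normalisations, using made-up counts; the names mirror, but are not, e_set._spelling_errors and e_set._text:

    spelling_errors = [1, 0, 4]                      # hypothetical per-essay spelling error counts
    texts = ["first toy essay", "second essay", "third"]

    print(sum(spelling_errors) / len(spelling_errors))               # 1 under Python 2: int/int truncates
    print(sum(spelling_errors) / float(len(spelling_errors)))        # 1.666... with the float() cast
    print(sum(spelling_errors) / float(sum(len(t) for t in texts)))  # 5 / 32.0 = 0.15625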