Commit d2ec5c55 by Vik Paruchuri

put grammar detection in a function; add grammar errors per word to the feature extractor
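
The refactor below moves the POS-based grammar check into a helper (_get_grammar_errors): for each essay it takes POS-tag n-grams of length 2 to 4, counts how many appear in the whitelist of "good" POS n-grams, and divides by the essay's word count; at training time the mean of those per-essay proportions is stored. The following is a minimal standalone sketch of that computation, not the project's code: pos_ngrams is a local stand-in for util_functions.ngrams, and `good` is a made-up whitelist in place of the pickled good_pos_ngrams list.

    # Standalone sketch (assumptions: stubbed n-gram helper, hypothetical whitelist).
    def pos_ngrams(pos_seq, min_n=2, max_n=4):
        # All n-grams of the POS sequence for n = min_n..max_n, space-joined.
        grams = []
        for n in range(min_n, max_n + 1):
            for i in range(len(pos_seq) - n + 1):
                grams.append(" ".join(pos_seq[i:i + n]))
        return grams

    def good_pos_tag_proportions(pos, tokens, good_pos_ngrams):
        # pos: one list of (word, tag) pairs per essay; tokens: one token list per essay.
        # Returns, per essay, the count of whitelisted POS n-grams divided by the word count.
        proportions = []
        for essay_pos, essay_tokens in zip(pos, tokens):
            word_count = max(len(essay_tokens), 1)
            seq = [tag for _, tag in essay_pos]
            overlap = [g for g in pos_ngrams(seq) if g in good_pos_ngrams]
            proportions.append(len(overlap) / float(word_count))
        return proportions

    good = {"DT NN", "DT NN VBZ", "NN VBZ"}          # hypothetical whitelist
    pos = [[("the", "DT"), ("dog", "NN"), ("runs", "VBZ")],
           [("runs", "VBZ"), ("dog", "NN")]]
    tokens = [["the", "dog", "runs"], ["runs", "dog"]]
    props = good_pos_tag_proportions(pos, tokens, good)
    print(props)                           # [1.0, 0.0] per-essay proportions
    print(sum(props) / float(len(props)))  # 0.5, the kind of scalar stored as _grammar_errors_per_word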

parent 9b0033c9
@@ -40,9 +40,10 @@ class FeatureExtractor(object):
                 self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
                 self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
                 self.dict_initialized = True
-                self._mean_spelling_errors=sum(e_set._spelling_errors)/len(e_set._spelling_errors)
-                self._spell_errors_per_word=sum(e_set._spelling_errors)/sum([len(t) for t in e_set._text])
-                self._grammar_errors_per_word=[]
+                self._mean_spelling_errors=sum(e_set._spelling_errors)/float(len(e_set._spelling_errors))
+                self._spell_errors_per_word=sum(e_set._spelling_errors)/float(sum([len(t) for t in e_set._text]))
+                self._grammar_errors_per_word=sum(self._get_grammar_errors(
+                    e_set._pos,e_set._text,e_set._tokens))/float(len(e_set._text))
                 ret = "ok"
             else:
                 raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")
@@ -64,6 +65,17 @@ class FeatureExtractor(object):
         pickle.dump(good_pos_ngrams, open(base_path + 'good_pos_ngrams.p', 'wb'))
         return good_pos_ngrams

+    def _get_grammar_errors(self,pos,text,tokens):
+        word_counts = [max(len(t),1) for t in tokens]
+        good_pos_tags = []
+        for i in xrange(0, len(text)):
+            pos_seq = [tag[1] for tag in pos[i]]
+            pos_ngrams = util_functions.ngrams(pos_seq, 2, 4)
+            overlap_ngrams = [i for i in pos_ngrams if i in self._good_pos_ngrams]
+            good_pos_tags.append(len(overlap_ngrams))
+        good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))]
+        return good_pos_tag_prop
+
     def gen_length_feats(self, e_set):
         """
         Generates length based features from an essay set
@@ -77,14 +89,7 @@ class FeatureExtractor(object):
         ap_count = [e.count("'") for e in text]
         punc_count = [e.count(".") + e.count("?") + e.count("!") for e in text]
         chars_per_word = [lengths[m] / float(word_counts[m]) for m in xrange(0, len(text))]
-        good_pos_tags = []
-        for i in xrange(0, len(text)):
-            pos_seq = [tag[1] for tag in e_set._pos[i]]
-            pos_ngrams = util_functions.ngrams(pos_seq, 2, 4)
-            overlap_ngrams = [i for i in pos_ngrams if i in self._good_pos_ngrams]
-            good_pos_tags.append(len(overlap_ngrams))
-        good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))]
-        self._grammar_errors_per_word=[1-good_pos_tag_prop[m] for m in xrange(0,len(text))]
+        good_pos_tag_prop = self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
         length_arr = numpy.array((
             lengths, word_counts, comma_count, ap_count, punc_count, chars_per_word, good_pos_tags,
...
-in order to replicate the experiment .you would need to know how much vinegar to pour ,you need to know how much distilled water and you aso need to know how your going to weigh it .
\ No newline at end of file
+in order to replicate this experiment , we would need to know the temperature of the vinegar as well as how much vinegar to put in . both of these could vary and therefore change the result of the experiment .
\ No newline at end of file
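
The float() casts added in the first hunk matter because this code runs on Python 2 (note the xrange calls), where dividing one int by another truncates toward zero. A quick standalone illustration of the two spelling-error normalisations, using made-up counts; the names mirror, but are not, e_set._spelling_errors and e_set._text:

    spelling_errors = [1, 0, 4]                      # hypothetical per-essay spelling error counts
    texts = ["first toy essay", "second essay", "third"]

    print(sum(spelling_errors) / len(spelling_errors))               # 1 under Python 2: int/int truncates
    print(sum(spelling_errors) / float(len(spelling_errors)))        # 1.666... with the float() cast
    print(sum(spelling_errors) / float(sum(len(t) for t in texts)))  # 5 / 32.0 = 0.15625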