Commit 9b0033c9 by Vik Paruchuri

add in spell errors per word and grammar errors per word to feature extractor

parent 9beeab71
...@@ -36,6 +36,7 @@ class EssaySet(object): ...@@ -36,6 +36,7 @@ class EssaySet(object):
self._clean_stem_text=[] self._clean_stem_text=[]
self._generated = [] self._generated = []
self._prompt = "" self._prompt = ""
self._spelling_errors=[]
def add_essay(self, essay_text, essay_score, essay_generated=0): def add_essay(self, essay_text, essay_score, essay_generated=0):
""" """
...@@ -61,7 +62,9 @@ class EssaySet(object): ...@@ -61,7 +62,9 @@ class EssaySet(object):
cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH] cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay) self._text.append(cleaned_essay)
# Spell correct text using aspell # Spell correct text using aspell
self._clean_text.append(util_functions.spell_correct(self._text[len(self._text) - 1])) cleaned_text,spell_errors=util_functions.spell_correct(self._text[len(self._text) - 1])
self._clean_text.append(cleaned_text)
self._spelling_errors.append(spell_errors)
# Tokenize text # Tokenize text
self._tokens.append(nltk.word_tokenize(self._clean_text[len(self._clean_text) - 1])) self._tokens.append(nltk.word_tokenize(self._clean_text[len(self._clean_text) - 1]))
# Part of speech tag text # Part of speech tag text
......
...@@ -10,6 +10,7 @@ from sklearn.feature_extraction.text import CountVectorizer ...@@ -10,6 +10,7 @@ from sklearn.feature_extraction.text import CountVectorizer
import pickle import pickle
import os import os
from itertools import chain from itertools import chain
import copy
base_path = os.path.dirname(__file__) base_path = os.path.dirname(__file__)
sys.path.append(base_path) sys.path.append(base_path)
...@@ -39,6 +40,9 @@ class FeatureExtractor(object): ...@@ -39,6 +40,9 @@ class FeatureExtractor(object):
self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab) self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab) self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
self.dict_initialized = True self.dict_initialized = True
self._mean_spelling_errors=sum(e_set._spelling_errors)/len(e_set._spelling_errors)
self._spell_errors_per_word=sum(e_set._spelling_errors)/sum([len(t) for t in e_set._text])
self._grammar_errors_per_word=[]
ret = "ok" ret = "ok"
else: else:
raise util_functions.InputError(e_set, "needs to be an essay set of the train type.") raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")
...@@ -80,6 +84,7 @@ class FeatureExtractor(object): ...@@ -80,6 +84,7 @@ class FeatureExtractor(object):
overlap_ngrams = [i for i in pos_ngrams if i in self._good_pos_ngrams] overlap_ngrams = [i for i in pos_ngrams if i in self._good_pos_ngrams]
good_pos_tags.append(len(overlap_ngrams)) good_pos_tags.append(len(overlap_ngrams))
good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))] good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))]
self._grammar_errors_per_word=[1-good_pos_tag_prop[m] for m in xrange(0,len(text))]
length_arr = numpy.array(( length_arr = numpy.array((
lengths, word_counts, comma_count, ap_count, punc_count, chars_per_word, good_pos_tags, lengths, word_counts, comma_count, ap_count, punc_count, chars_per_word, good_pos_tags,
......
...@@ -87,7 +87,7 @@ def spell_correct(string): ...@@ -87,7 +87,7 @@ def spell_correct(string):
sub_pat = r"\b" + incorrect_words[i] + r"\b" sub_pat = r"\b" + incorrect_words[i] + r"\b"
sub_comp = re.compile(sub_pat) sub_comp = re.compile(sub_pat)
newstring = re.sub(sub_comp, correct_spelling[i], newstring) newstring = re.sub(sub_comp, correct_spelling[i], newstring)
return newstring return newstring,len(incorrect)
def ngrams(tokens, min_n, max_n): def ngrams(tokens, min_n, max_n):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment