Commit 9b0033c9 by Vik Paruchuri

add in spell errors per word and grammar errors per word to feature extractor

parent 9beeab71
...@@ -36,6 +36,7 @@ class EssaySet(object): ...@@ -36,6 +36,7 @@ class EssaySet(object):
self._clean_stem_text=[] self._clean_stem_text=[]
self._generated = [] self._generated = []
self._prompt = "" self._prompt = ""
self._spelling_errors=[]
def add_essay(self, essay_text, essay_score, essay_generated=0): def add_essay(self, essay_text, essay_score, essay_generated=0):
""" """
...@@ -61,7 +62,9 @@ class EssaySet(object): ...@@ -61,7 +62,9 @@ class EssaySet(object):
cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH] cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay) self._text.append(cleaned_essay)
# Spell correct text using aspell # Spell correct text using aspell
self._clean_text.append(util_functions.spell_correct(self._text[len(self._text) - 1])) cleaned_text,spell_errors=util_functions.spell_correct(self._text[len(self._text) - 1])
self._clean_text.append(cleaned_text)
self._spelling_errors.append(spell_errors)
# Tokenize text # Tokenize text
self._tokens.append(nltk.word_tokenize(self._clean_text[len(self._clean_text) - 1])) self._tokens.append(nltk.word_tokenize(self._clean_text[len(self._clean_text) - 1]))
# Part of speech tag text # Part of speech tag text
......
...@@ -10,6 +10,7 @@ from sklearn.feature_extraction.text import CountVectorizer ...@@ -10,6 +10,7 @@ from sklearn.feature_extraction.text import CountVectorizer
import pickle import pickle
import os import os
from itertools import chain from itertools import chain
import copy
base_path = os.path.dirname(__file__) base_path = os.path.dirname(__file__)
sys.path.append(base_path) sys.path.append(base_path)
...@@ -39,6 +40,9 @@ class FeatureExtractor(object): ...@@ -39,6 +40,9 @@ class FeatureExtractor(object):
self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab) self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab) self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
self.dict_initialized = True self.dict_initialized = True
self._mean_spelling_errors=sum(e_set._spelling_errors)/len(e_set._spelling_errors)
self._spell_errors_per_word=sum(e_set._spelling_errors)/sum([len(t) for t in e_set._text])
self._grammar_errors_per_word=[]
ret = "ok" ret = "ok"
else: else:
raise util_functions.InputError(e_set, "needs to be an essay set of the train type.") raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")
...@@ -80,6 +84,7 @@ class FeatureExtractor(object): ...@@ -80,6 +84,7 @@ class FeatureExtractor(object):
overlap_ngrams = [i for i in pos_ngrams if i in self._good_pos_ngrams] overlap_ngrams = [i for i in pos_ngrams if i in self._good_pos_ngrams]
good_pos_tags.append(len(overlap_ngrams)) good_pos_tags.append(len(overlap_ngrams))
good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))] good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))]
self._grammar_errors_per_word=[1-good_pos_tag_prop[m] for m in xrange(0,len(text))]
length_arr = numpy.array(( length_arr = numpy.array((
lengths, word_counts, comma_count, ap_count, punc_count, chars_per_word, good_pos_tags, lengths, word_counts, comma_count, ap_count, punc_count, chars_per_word, good_pos_tags,
......
...@@ -87,7 +87,7 @@ def spell_correct(string): ...@@ -87,7 +87,7 @@ def spell_correct(string):
sub_pat = r"\b" + incorrect_words[i] + r"\b" sub_pat = r"\b" + incorrect_words[i] + r"\b"
sub_comp = re.compile(sub_pat) sub_comp = re.compile(sub_pat)
newstring = re.sub(sub_comp, correct_spelling[i], newstring) newstring = re.sub(sub_comp, correct_spelling[i], newstring)
return newstring return newstring,len(incorrect)
def ngrams(tokens, min_n, max_n): def ngrams(tokens, min_n, max_n):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment