Commit 1e575001 by gradyward

Cleaned up the essay_set.py file and propagated changes through other files.

parent da78277e
...@@ -23,21 +23,35 @@ MAXIMUM_ESSAY_LENGTH = 20000 ...@@ -23,21 +23,35 @@ MAXIMUM_ESSAY_LENGTH = 20000
class EssaySet(object): class EssaySet(object):
def __init__(self, essaytype="train"): """
The essay set object which encapsulates essays into sets for two purposes:
Testing
Training
Additionally, the addition of essays into one of these sets performs all spell/grammar
checking, tokenization of the essay, and stemming.
Essays in an essay set can be assumed to have these properties.
"""
def __init__(self, essay_type="train"):
""" """
Initialize variables and check essay set type Initialize variables and check essay set type
Args:
essay_type (string): Either 'train' or 'test', defines the type of the essay set.
If not recognized, we default to "train"
""" """
if (essaytype != "train" and essaytype != "test"):
essaytype = "train" if essay_type != "train" and essay_type != "test":
essay_type = "train"
self._type = essaytype
self._score = [] self._type = essay_type
self._text = [] self._scores = []
self._id = [] self._cleaned_essays = []
self._clean_text = [] self._ids = []
self._cleaned_spelled_essays = []
self._tokens = [] self._tokens = []
self._pos = [] self._pos_tags = []
self._clean_stem_text = [] self._cleaned_stem_essays = []
self._generated = [] self._generated = []
self._prompt = "" self._prompt = ""
self._spelling_errors = [] self._spelling_errors = []
...@@ -45,105 +59,150 @@ class EssaySet(object): ...@@ -45,105 +59,150 @@ class EssaySet(object):
def add_essay(self, essay_text, essay_score, essay_generated=0): def add_essay(self, essay_text, essay_score, essay_generated=0):
""" """
Add new (essay_text,essay_score) pair to the essay set. Adds a new pair of (essay_text, essay_score) to the essay set.
essay_text must be a string.
essay_score must be an int. In the context of training, this occurs when a human creates another example
for the AI assessment to be based on
NOTE:
essay_generated should not be changed by the user. essay_generated should not be changed by the user.
Returns a confirmation that essay was added.
Args:
essay_text (string): The text of the essay
essay_score (int): The score assigned to the essay by a human.
Kwargs:
essay_generated (int):
Returns:
A string confirmation that essay was added.
""" """
# Get maximum current essay id, or set to 0 if this is the first essay added
if (len(self._id) > 0): # Get maximum current essay id (the newest essay), or set to 0 if this is the first essay added
max_id = max(self._id) if len(self._ids) > 0:
max_id = max(self._ids)
else: else:
max_id = 0 max_id = 0
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
# Encodes the essay into ascii. Note that un-recognized characters will be ignored
# Also note that if we first fail to encode, we will try to decode from utf-8 then encode.
try: try:
essay_text = essay_text.encode('ascii', 'ignore') essay_text = essay_text.encode('ascii', 'ignore')
if len(essay_text) < 5: except UnicodeError:
essay_text = "Invalid essay." try:
except: essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
except UnicodeError:
log.exception("Could not parse essay into ascii.") log.exception("Could not parse essay into ascii.")
raise
# Validates that score is an integer and essay_text is a string.
try: try:
# Try conversion of types
essay_score = int(essay_score) essay_score = int(essay_score)
essay_text = str(essay_text) essay_text = str(essay_text)
except: essay_generated = int(essay_generated)
# Nothing needed here, will return error in any case. except TypeError:
log.exception( log.exception(
"Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))) "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))
raise
if isinstance(essay_score, int) and isinstance(essay_text, basestring) \ # Validates that essay generated is 0 or 1
and (essay_generated == 0 or essay_generated == 1): if essay_generated != 0 and essay_generated != 1:
self._id.append(max_id + 1) ex = "Invalid value for essay_generated ({}). Value must be 0 or 1.".format(essay_generated)
self._score.append(essay_score) log.exception(ex)
# Clean text by removing non digit/work/punctuation characters raise util_functions.InputError(ex)
try:
essay_text = str(essay_text.encode('ascii', 'ignore')) # Validates to make sure that the essay is at least five characters long.
except: if len(essay_text) < 5:
essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore') essay_text = "Invalid essay."
# If we reach this point, we are not going to raise an exception beyond it, so we can add any and all
# variables to our lists while maintaining internal consistency. This is a new fix as of 6-12-14 GBW
# Assigns a new ID to the essay, adds fields passed in.
self._ids.append(max_id + 1)
self._scores.append(essay_score)
self._generated.append(essay_generated)
# Cleans text by removing non digit/word/punctuation characters
cleaned_essay = util_functions.sub_chars(essay_text).lower() cleaned_essay = util_functions.sub_chars(essay_text).lower()
if (len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH): # Checks to see if the essay is longer than we allow. Truncates if longer
if len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH:
cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH] cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay) self._cleaned_essays.append(cleaned_essay)
# Spell correct text using aspell # Spell correct text using aspell
cleaned_text, spell_errors, markup_text = util_functions.spell_correct(self._text[len(self._text) - 1]) cleaned_spelled_essay, spell_errors, markup_text = util_functions.spell_correct(cleaned_essay)
self._clean_text.append(cleaned_text) self._cleaned_spelled_essays.append(cleaned_spelled_essay)
self._spelling_errors.append(spell_errors) self._spelling_errors.append(spell_errors)
self._markup_text.append(markup_text) self._markup_text.append(markup_text)
# Tokenize text
self._tokens.append(nltk.word_tokenize(self._clean_text[len(self._clean_text) - 1])) # Create tokens for the text and part of speech tags
# Part of speech tag text tokens = nltk.word_tokenize(cleaned_spelled_essay)
self._pos.append(nltk.pos_tag(self._clean_text[len(self._clean_text) - 1].split(" "))) pos_tags = nltk.pos_tag(cleaned_spelled_essay.split(" "))
self._generated.append(essay_generated) self._tokens.append(tokens)
# Stem spell corrected text self._pos_tags.append(pos_tags)
# Applies Porter stemming algorithm, a process for removing the commoner morphological and inflexional endings
# from words in English.
porter = nltk.PorterStemmer() porter = nltk.PorterStemmer()
por_toks = " ".join([porter.stem(w) for w in self._tokens[len(self._tokens) - 1]]) porter_tokens = " ".join([porter.stem(token) for token in tokens])
self._clean_stem_text.append(por_toks) self._cleaned_stem_essays.append(porter_tokens)
ret = "text: " + self._text[len(self._text) - 1] + " score: " + str(essay_score) return "Essay Added. Text: " + cleaned_essay + " Score: " + str(essay_score)
else:
raise util_functions.InputError(essay_text, "arguments need to be in format "
"(text,score). text needs to be string,"
" score needs to be int.")
def update_prompt(self, prompt_text): def update_prompt(self, prompt_text):
""" """
Update the default prompt string, which is "". Updates the default prompt (an empty string) to a user specified string
prompt_text should be a string.
Returns the prompt as a confirmation. Args:
prompt_text (str): the value to set the prompt to
Returns:
(str): The prompt, if it was stored successfully.
""" """
if (isinstance(prompt_text, basestring)): if (isinstance(prompt_text, basestring)):
self._prompt = util_functions.sub_chars(prompt_text) self._prompt = util_functions.sub_chars(prompt_text)
ret = self._prompt
else: else:
raise util_functions.InputError(prompt_text, "Invalid prompt. Need to enter a string value.") raise util_functions.InputError(prompt_text, "Invalid prompt. Need to enter a string value.")
return ret return self._prompt
def generate_additional_essays(self, e_text, e_score, dictionary=None, max_syns=3): def generate_additional_essays(self, original_essay, original_score, to_generate=3):
""" """
Substitute synonyms to generate extra essays from existing ones. Generates and adds additional essays to the essay set from a base essay by substituting synonyms.
This is done to increase the amount of training data.
Should only be used with lowest scoring essays. Args:
e_text is the text of the original essay. original_essay (str): The original essay to generate off of.
e_score is the score of the original essay. original_score (int): The integer score assigned to the input essay.
dictionary is a fixed dictionary (list) of words to replace.
max_syns defines the maximum number of additional essays to generate. Do not set too high. Kwargs:
FEATURE REMOVED (GBW): dictionary (dict): A static dictionary of words to replace. Defaults to none.
Feature was removed because it was not implemented fully to begin with.
to_generate (int): The number of additional essays to generate based on synonym substitution
""" """
e_toks = nltk.word_tokenize(e_text)
all_syns = [] original_tokens = nltk.word_tokenize(original_essay)
for word in e_toks: synonym_matrix = []
# Iterates through the words in the original essay
for word in original_tokens:
synonyms = util_functions.get_wordnet_syns(word) synonyms = util_functions.get_wordnet_syns(word)
if (len(synonyms) > max_syns): # Only substitute on a token if one could generate N=max_syns unique essays on that token.
synonyms = random.sample(synonyms, max_syns) if len(synonyms) > to_generate:
all_syns.append(synonyms) # Adds one word on to the list of synonyms, one for each of the new essays
synonyms = random.sample(synonyms, to_generate)
synonym_matrix.append(synonyms)
new_essays = [] new_essays = []
for i in range(0, max_syns): # Generates each essay
syn_toks = e_toks for i in range(0, to_generate):
for z in range(0, len(e_toks)): # Start out from the same base essay
if len(all_syns[z]) > i and (dictionary == None or e_toks[z] in dictionary): new_tokens = original_tokens
syn_toks[z] = all_syns[z][i] for z in range(0, len(original_tokens)):
new_essays.append(" ".join(syn_toks)) # Replace a given token ONLY if it is not the first token in the dictionary??!?!?!!?!
for z in xrange(0, len(new_essays)): if len(synonym_matrix[z]) > i:
self.add_essay(new_essays[z], e_score, 1) new_tokens[z] = synonym_matrix[z][i]
new_essays.append(" ".join(new_tokens))
# Adds each new essay to the list of essays in this essay set
for i in xrange(0, len(new_essays)):
self.add_essay(new_essays[i], original_score, 1)
...@@ -43,7 +43,7 @@ def grade(grader_data, submission): ...@@ -43,7 +43,7 @@ def grade(grader_data, submission):
results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0} results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
has_error = False has_error = False
grader_set = EssaySet(essaytype="test") grader_set = EssaySet(essay_type="test")
feedback = {} feedback = {}
model, extractor = get_classifier_and_ext(grader_data) model, extractor = get_classifier_and_ext(grader_data)
......
...@@ -82,7 +82,7 @@ def create_essay_set(text, score, prompt_string, generate_additional=True): ...@@ -82,7 +82,7 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
for i in xrange(0, len(text)): for i in xrange(0, len(text)):
x.add_essay(text[i], score[i]) x.add_essay(text[i], score[i])
if score[i] == min(score) and generate_additional == True: if score[i] == min(score) and generate_additional == True:
x.generate_additional_essays(x._clean_text[len(x._clean_text) - 1], score[i]) x.generate_additional_essays(x._cleaned_spelled_essays[len(x._cleaned_spelled_essays) - 1], score[i])
x.update_prompt(prompt_string) x.update_prompt(prompt_string)
......
...@@ -85,7 +85,7 @@ class PredictorSet(object): ...@@ -85,7 +85,7 @@ class PredictorSet(object):
#Create essay sets for textual features if needed #Create essay sets for textual features if needed
if len(self._textual_features) == 0: if len(self._textual_features) == 0:
for i in xrange(0, len(textual_features)): for i in xrange(0, len(textual_features)):
self._essay_sets.append(essay_set.EssaySet(essaytype=self._type)) self._essay_sets.append(essay_set.EssaySet(essay_type=self._type))
#Add numeric and textual features #Add numeric and textual features
self._numeric_features.append(numeric_features) self._numeric_features.append(numeric_features)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment