Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
ease
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
ease
Commits
1e575001
Commit
1e575001
authored
Jun 12, 2014
by
gradyward
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Cleaned up the essay_set.py file and propagated changes through other files.
parent
da78277e
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
140 additions
and
81 deletions
+140
-81
ease/essay_set.py
+137
-78
ease/grade.py
+1
-1
ease/model_creator.py
+1
-1
ease/predictor_set.py
+1
-1
No files found.
ease/essay_set.py
View file @
1e575001
...
...
@@ -23,21 +23,35 @@ MAXIMUM_ESSAY_LENGTH = 20000
class
EssaySet
(
object
):
def
__init__
(
self
,
essaytype
=
"train"
):
"""
The essay set object which encapsulates essays into sets for two purposes:
Testing
Training
Additionally, the addition of essays into one of these sets performs all spell/grammar
checking, tokenization of the essay, and stemming.
Essays in an essay set can be assumed to have these properties.
"""
def
__init__
(
self
,
essay_type
=
"train"
):
"""
Initialize variables and check essay set type
Args:
essay_type (string): Either 'train' or 'test', defines the type of the essay set.
If not recognized, we default to "train"
"""
if
(
essaytype
!=
"train"
and
essaytype
!=
"test"
):
essaytype
=
"train"
self
.
_type
=
essaytype
self
.
_score
=
[]
self
.
_text
=
[]
self
.
_id
=
[]
self
.
_clean_text
=
[]
if
essay_type
!=
"train"
and
essay_type
!=
"test"
:
essay_type
=
"train"
self
.
_type
=
essay_type
self
.
_scores
=
[]
self
.
_cleaned_essays
=
[]
self
.
_ids
=
[]
self
.
_cleaned_spelled_essays
=
[]
self
.
_tokens
=
[]
self
.
_pos
=
[]
self
.
_clean
_stem_text
=
[]
self
.
_pos
_tags
=
[]
self
.
_clean
ed_stem_essays
=
[]
self
.
_generated
=
[]
self
.
_prompt
=
""
self
.
_spelling_errors
=
[]
...
...
@@ -45,105 +59,150 @@ class EssaySet(object):
def
add_essay
(
self
,
essay_text
,
essay_score
,
essay_generated
=
0
):
"""
Add new (essay_text,essay_score) pair to the essay set.
essay_text must be a string.
essay_score must be an int.
Adds a new pair of (essay_text, essay_score) to the essay set.
In the context of training, this occurs when a human creates another example
for the AI assessment to be based on
NOTE:
essay_generated should not be changed by the user.
Returns a confirmation that essay was added.
Args:
essay_text (string): The text of the essay
essay_score (int): The score assigned to the essay by a human.
Kwargs:
essay_generated (int): Must be 0 or 1. Essays created by generate_additional_essays are added with 1; the default is 0.
Returns:
A string confirmation that essay was added.
"""
# Get maximum current essay id, or set to 0 if this is the first essay added
if
(
len
(
self
.
_id
)
>
0
):
max_id
=
max
(
self
.
_id
)
# Get maximum current essay id (the newest essay), or set to 0 if this is the first essay added
if
len
(
self
.
_ids
)
>
0
:
max_id
=
max
(
self
.
_ids
)
else
:
max_id
=
0
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
# Encodes the essay into ascii. Note that un-recognized characters will be ignored
# Also note that if we first fail to encode, we will try to decode from utf-8 then encode.
try
:
essay_text
=
essay_text
.
encode
(
'ascii'
,
'ignore'
)
if
len
(
essay_text
)
<
5
:
essay_text
=
"Invalid essay."
except
:
except
UnicodeError
:
try
:
essay_text
=
(
essay_text
.
decode
(
'utf-8'
,
'replace'
))
.
encode
(
'ascii'
,
'ignore'
)
except
UnicodeError
:
log
.
exception
(
"Could not parse essay into ascii."
)
raise
# Validates that score is an integer and essay_text is a string.
try
:
# Try conversion of types
essay_score
=
int
(
essay_score
)
essay_text
=
str
(
essay_text
)
except
:
# Nothing needed here, will return error in any case.
essay_generated
=
int
(
essay_generated
)
except
TypeError
:
log
.
exception
(
"Invalid type for essay score : {0} or essay text : {1}"
.
format
(
type
(
essay_score
),
type
(
essay_text
)))
raise
if
isinstance
(
essay_score
,
int
)
and
isinstance
(
essay_text
,
basestring
)
\
and
(
essay_generated
==
0
or
essay_generated
==
1
):
self
.
_id
.
append
(
max_id
+
1
)
self
.
_score
.
append
(
essay_score
)
# Clean text by removing non digit/word/punctuation characters
try
:
essay_text
=
str
(
essay_text
.
encode
(
'ascii'
,
'ignore'
))
except
:
essay_text
=
(
essay_text
.
decode
(
'utf-8'
,
'replace'
))
.
encode
(
'ascii'
,
'ignore'
)
# Validates that essay generated is 0 or 1
if
essay_generated
!=
0
and
essay_generated
!=
1
:
ex
=
"Invalid value for essay_generated ({}). Value must be 0 or 1."
.
format
(
essay_generated
)
log
.
exception
(
ex
)
raise
util_functions
.
InputError
(
ex
)
# Validates to make sure that the essay is at least five characters long.
if
len
(
essay_text
)
<
5
:
essay_text
=
"Invalid essay."
# If we reach this point, we are not going to raise an exception beyond it, so we can add any and all
# variables to our lists while maintaining internal consistency. This is a new fix as of 6-12-14 GBW
# Assigns a new ID to the essay, adds fields passed in.
self
.
_ids
.
append
(
max_id
+
1
)
self
.
_scores
.
append
(
essay_score
)
self
.
_generated
.
append
(
essay_generated
)
# Cleans text by removing non digit/word/punctuation characters
cleaned_essay
=
util_functions
.
sub_chars
(
essay_text
)
.
lower
()
if
(
len
(
cleaned_essay
)
>
MAXIMUM_ESSAY_LENGTH
):
# Checks to see if the essay is longer than we allow. Truncates if longer
if
len
(
cleaned_essay
)
>
MAXIMUM_ESSAY_LENGTH
:
cleaned_essay
=
cleaned_essay
[
0
:
MAXIMUM_ESSAY_LENGTH
]
self
.
_text
.
append
(
cleaned_essay
)
self
.
_cleaned_essays
.
append
(
cleaned_essay
)
# Spell correct text using aspell
cleaned_text
,
spell_errors
,
markup_text
=
util_functions
.
spell_correct
(
self
.
_text
[
len
(
self
.
_text
)
-
1
]
)
self
.
_clean_text
.
append
(
cleaned_text
)
cleaned_spelled_essay
,
spell_errors
,
markup_text
=
util_functions
.
spell_correct
(
cleaned_essay
)
self
.
_cleaned_spelled_essays
.
append
(
cleaned_spelled_essay
)
self
.
_spelling_errors
.
append
(
spell_errors
)
self
.
_markup_text
.
append
(
markup_text
)
# Tokenize text
self
.
_tokens
.
append
(
nltk
.
word_tokenize
(
self
.
_clean_text
[
len
(
self
.
_clean_text
)
-
1
]))
# Part of speech tag text
self
.
_pos
.
append
(
nltk
.
pos_tag
(
self
.
_clean_text
[
len
(
self
.
_clean_text
)
-
1
]
.
split
(
" "
)))
self
.
_generated
.
append
(
essay_generated
)
# Stem spell corrected text
# Create tokens for the text and part of speech tags
tokens
=
nltk
.
word_tokenize
(
cleaned_spelled_essay
)
pos_tags
=
nltk
.
pos_tag
(
cleaned_spelled_essay
.
split
(
" "
))
self
.
_tokens
.
append
(
tokens
)
self
.
_pos_tags
.
append
(
pos_tags
)
# Applies Porter stemming algorithm, a process for removing the commoner morphological and inflexional endings
# from words in English.
porter
=
nltk
.
PorterStemmer
()
por_toks
=
" "
.
join
([
porter
.
stem
(
w
)
for
w
in
self
.
_tokens
[
len
(
self
.
_tokens
)
-
1
]
])
self
.
_clean_stem_text
.
append
(
por_tok
s
)
porter_tokens
=
" "
.
join
([
porter
.
stem
(
token
)
for
token
in
tokens
])
self
.
_cleaned_stem_essays
.
append
(
porter_token
s
)
ret
=
"text: "
+
self
.
_text
[
len
(
self
.
_text
)
-
1
]
+
" score: "
+
str
(
essay_score
)
else
:
raise
util_functions
.
InputError
(
essay_text
,
"arguments need to be in format "
"(text,score). text needs to be string,"
" score needs to be int."
)
return
"Essay Added. Text: "
+
cleaned_essay
+
" Score: "
+
str
(
essay_score
)
def
update_prompt
(
self
,
prompt_text
):
"""
Update the default prompt string, which is "".
prompt_text should be a string.
Returns the prompt as a confirmation.
Updates the default prompt (an empty string) to a user specified string
Args:
prompt_text (str): the value to set the prompt to
Returns:
(str): The prompt, if it was stored successfully.
"""
if
(
isinstance
(
prompt_text
,
basestring
)):
self
.
_prompt
=
util_functions
.
sub_chars
(
prompt_text
)
ret
=
self
.
_prompt
else
:
raise
util_functions
.
InputError
(
prompt_text
,
"Invalid prompt. Need to enter a string value."
)
return
re
t
return
self
.
_promp
t
def
generate_additional_essays
(
self
,
e_text
,
e_score
,
dictionary
=
None
,
max_syns
=
3
):
def
generate_additional_essays
(
self
,
original_essay
,
original_score
,
to_generate
=
3
):
"""
Substitute synonyms to generate extra essays from existing ones.
This is done to increase the amount of training data.
Should only be used with lowest scoring essays.
e_text is the text of the original essay.
e_score is the score of the original essay.
dictionary is a fixed dictionary (list) of words to replace.
max_syns defines the maximum number of additional essays to generate. Do not set too high.
Generates and adds additional essays to the essay set from a base essay by substituting synonyms.
Args:
original_essay (str): The original essay to generate off of.
original_score (int): The integer score assigned to the input essay.
Kwargs:
FEATURE REMOVED (GBW): dictionary (dict): A static dictionary of words to replace. Defaults to none.
Feature was removed because it was not implemented fully to begin with.
to_generate (int): The number of additional essays to generate based on synonym substitution
"""
e_toks
=
nltk
.
word_tokenize
(
e_text
)
all_syns
=
[]
for
word
in
e_toks
:
original_tokens
=
nltk
.
word_tokenize
(
original_essay
)
synonym_matrix
=
[]
# Iterates through the words in the original essay
for
word
in
original_tokens
:
synonyms
=
util_functions
.
get_wordnet_syns
(
word
)
if
(
len
(
synonyms
)
>
max_syns
):
synonyms
=
random
.
sample
(
synonyms
,
max_syns
)
all_syns
.
append
(
synonyms
)
# Cap each token's synonym list at to_generate entries (at most one synonym per generated essay); shorter lists are kept as-is.
if
len
(
synonyms
)
>
to_generate
:
# Adds one word on to the list of synonyms, one for each of the new essays
synonyms
=
random
.
sample
(
synonyms
,
to_generate
)
synonym_matrix
.
append
(
synonyms
)
new_essays
=
[]
for
i
in
range
(
0
,
max_syns
):
syn_toks
=
e_toks
for
z
in
range
(
0
,
len
(
e_toks
)):
if
len
(
all_syns
[
z
])
>
i
and
(
dictionary
==
None
or
e_toks
[
z
]
in
dictionary
):
syn_toks
[
z
]
=
all_syns
[
z
][
i
]
new_essays
.
append
(
" "
.
join
(
syn_toks
))
for
z
in
xrange
(
0
,
len
(
new_essays
)):
self
.
add_essay
(
new_essays
[
z
],
e_score
,
1
)
# Generates each essay
for
i
in
range
(
0
,
to_generate
):
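# Start out from the base essay tokens.
# NOTE(review): this binds the same list object rather than copying it, so substitutions made for one
# generated essay appear to carry over into the next - confirm whether a copy was intended.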
new_tokens
=
original_tokens
for
z
in
range
(
0
,
len
(
original_tokens
)):
# Replace a token only when it has an i-th synonym available (i.e. more than i synonyms were kept for it)
if
len
(
synonym_matrix
[
z
])
>
i
:
new_tokens
[
z
]
=
synonym_matrix
[
z
][
i
]
new_essays
.
append
(
" "
.
join
(
new_tokens
))
# Adds each new essay to the list of essays in this essay set
for
i
in
xrange
(
0
,
len
(
new_essays
)):
self
.
add_essay
(
new_essays
[
i
],
original_score
,
1
)
ease/grade.py
View file @
1e575001
...
...
@@ -43,7 +43,7 @@ def grade(grader_data, submission):
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'feedback'
:
""
,
'success'
:
False
,
'confidence'
:
0
}
has_error
=
False
grader_set
=
EssaySet
(
essaytype
=
"test"
)
grader_set
=
EssaySet
(
essay
_
type
=
"test"
)
feedback
=
{}
model
,
extractor
=
get_classifier_and_ext
(
grader_data
)
...
...
ease/model_creator.py
View file @
1e575001
...
...
@@ -82,7 +82,7 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
for
i
in
xrange
(
0
,
len
(
text
)):
x
.
add_essay
(
text
[
i
],
score
[
i
])
if
score
[
i
]
==
min
(
score
)
and
generate_additional
==
True
:
x
.
generate_additional_essays
(
x
.
_clean
_text
[
len
(
x
.
_clean_text
)
-
1
],
score
[
i
])
x
.
generate_additional_essays
(
x
.
_clean
ed_spelled_essays
[
len
(
x
.
_cleaned_spelled_essays
)
-
1
],
score
[
i
])
x
.
update_prompt
(
prompt_string
)
...
...
ease/predictor_set.py
View file @
1e575001
...
...
@@ -85,7 +85,7 @@ class PredictorSet(object):
#Create essay sets for textual features if needed
if
len
(
self
.
_textual_features
)
==
0
:
for
i
in
xrange
(
0
,
len
(
textual_features
)):
self
.
_essay_sets
.
append
(
essay_set
.
EssaySet
(
essaytype
=
self
.
_type
))
self
.
_essay_sets
.
append
(
essay_set
.
EssaySet
(
essay
_
type
=
self
.
_type
))
#Add numeric and textual features
self
.
_numeric_features
.
append
(
numeric_features
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment