Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
ease
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
ease
Commits
1e575001
Commit
1e575001
authored
Jun 12, 2014
by
gradyward
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Cleaned up the essay_set.py file and propagated changes through other files.
parent
da78277e
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
140 additions
and
81 deletions
+140
-81
ease/essay_set.py
+137
-78
ease/grade.py
+1
-1
ease/model_creator.py
+1
-1
ease/predictor_set.py
+1
-1
No files found.
ease/essay_set.py
View file @
1e575001
...
@@ -23,21 +23,35 @@ MAXIMUM_ESSAY_LENGTH = 20000
...
@@ -23,21 +23,35 @@ MAXIMUM_ESSAY_LENGTH = 20000
class
EssaySet
(
object
):
class
EssaySet
(
object
):
def
__init__
(
self
,
essaytype
=
"train"
):
"""
The essay set object which encapsulates essays into sets for two purposes:
Testing
Training
Additionally, the addition of essays into one of these sets performs all spell/grammar
checking, tokenization of the essay, and stemming.
Essays in an essay set can be assumed to have these properties.
"""
def
__init__
(
self
,
essay_type
=
"train"
):
"""
"""
Initialize variables and check essay set type
Initialize variables and check essay set type
Args:
essay_type (string): Either 'train' or 'test', defines the type of the essay set.
If not recognized, we default to "train"
"""
"""
if
(
essaytype
!=
"train"
and
essaytype
!=
"test"
):
essaytype
=
"train"
if
essay_type
!=
"train"
and
essay_type
!=
"test"
:
essay_type
=
"train"
self
.
_type
=
essaytype
self
.
_score
=
[]
self
.
_type
=
essay_type
self
.
_text
=
[]
self
.
_scores
=
[]
self
.
_id
=
[]
self
.
_cleaned_essays
=
[]
self
.
_clean_text
=
[]
self
.
_ids
=
[]
self
.
_cleaned_spelled_essays
=
[]
self
.
_tokens
=
[]
self
.
_tokens
=
[]
self
.
_pos
=
[]
self
.
_pos
_tags
=
[]
self
.
_clean
_stem_text
=
[]
self
.
_clean
ed_stem_essays
=
[]
self
.
_generated
=
[]
self
.
_generated
=
[]
self
.
_prompt
=
""
self
.
_prompt
=
""
self
.
_spelling_errors
=
[]
self
.
_spelling_errors
=
[]
...
@@ -45,105 +59,150 @@ class EssaySet(object):
...
@@ -45,105 +59,150 @@ class EssaySet(object):
def
add_essay
(
self
,
essay_text
,
essay_score
,
essay_generated
=
0
):
def
add_essay
(
self
,
essay_text
,
essay_score
,
essay_generated
=
0
):
"""
"""
Add new (essay_text,essay_score) pair to the essay set.
Adds a new pair of (essay_text, essay_score) to the essay set.
essay_text must be a string.
essay_score must be an int.
In the context of training, this occurs when a human creates another example
for the AI assessment to be based on
NOTE:
essay_generated should not be changed by the user.
essay_generated should not be changed by the user.
Returns a confirmation that essay was added.
Args:
essay_text (string): The text of the essay
essay_score (int): The score assigned to the essay by a human.
Kwargs:
essay_generated (int): 1 if the essay was produced by generate_additional_essays, 0 (the default) otherwise.
Returns:
A string confirmation that essay was added.
"""
"""
# Get maximum current essay id, or set to 0 if this is the first essay added
if
(
len
(
self
.
_id
)
>
0
):
# Get maximum current essay id (the newest essay), or set to 0 if this is the first essay added
max_id
=
max
(
self
.
_id
)
if
len
(
self
.
_ids
)
>
0
:
max_id
=
max
(
self
.
_ids
)
else
:
else
:
max_id
=
0
max_id
=
0
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
# Encodes the essay into ascii. Note that un-recognized characters will be ignored
# Also note that if we first fail to encode, we will try to decode from utf-8 then encode.
try
:
try
:
essay_text
=
essay_text
.
encode
(
'ascii'
,
'ignore'
)
essay_text
=
essay_text
.
encode
(
'ascii'
,
'ignore'
)
if
len
(
essay_text
)
<
5
:
except
UnicodeError
:
essay_text
=
"Invalid essay."
try
:
except
:
essay_text
=
(
essay_text
.
decode
(
'utf-8'
,
'replace'
))
.
encode
(
'ascii'
,
'ignore'
)
except
UnicodeError
:
log
.
exception
(
"Could not parse essay into ascii."
)
log
.
exception
(
"Could not parse essay into ascii."
)
raise
# Validates that score is an integer and essay_text is a string.
try
:
try
:
# Try conversion of types
essay_score
=
int
(
essay_score
)
essay_score
=
int
(
essay_score
)
essay_text
=
str
(
essay_text
)
essay_text
=
str
(
essay_text
)
except
:
essay_generated
=
int
(
essay_generated
)
# Nothing needed here, will return error in any case.
except
TypeError
:
log
.
exception
(
log
.
exception
(
"Invalid type for essay score : {0} or essay text : {1}"
.
format
(
type
(
essay_score
),
type
(
essay_text
)))
"Invalid type for essay score : {0} or essay text : {1}"
.
format
(
type
(
essay_score
),
type
(
essay_text
)))
raise
if
isinstance
(
essay_score
,
int
)
and
isinstance
(
essay_text
,
basestring
)
\
# Validates that essay generated is 0 or 1
and
(
essay_generated
==
0
or
essay_generated
==
1
):
if
essay_generated
!=
0
and
essay_generated
!=
1
:
self
.
_id
.
append
(
max_id
+
1
)
ex
=
"Invalid value for essay_generated ({}). Value must be 0 or 1."
.
format
(
essay_generated
)
self
.
_score
.
append
(
essay_score
)
log
.
exception
(
ex
)
# Clean text by removing non digit/word/punctuation characters
raise
util_functions
.
InputError
(
ex
)
try
:
essay_text
=
str
(
essay_text
.
encode
(
'ascii'
,
'ignore'
))
# Validates to make sure that the essay is at least five characters long.
except
:
if
len
(
essay_text
)
<
5
:
essay_text
=
(
essay_text
.
decode
(
'utf-8'
,
'replace'
))
.
encode
(
'ascii'
,
'ignore'
)
essay_text
=
"Invalid essay."
# If we reach this point, we are not going to raise an exception beyond it, so we can add any and all
# variables to our lists while maintaining internal consistency. This is a new fix as of 6-12-14 GBW
# Assigns a new ID to the essay, adds fields passed in.
self
.
_ids
.
append
(
max_id
+
1
)
self
.
_scores
.
append
(
essay_score
)
self
.
_generated
.
append
(
essay_generated
)
# Cleans text by removing non digit/word/punctuation characters
cleaned_essay
=
util_functions
.
sub_chars
(
essay_text
)
.
lower
()
cleaned_essay
=
util_functions
.
sub_chars
(
essay_text
)
.
lower
()
if
(
len
(
cleaned_essay
)
>
MAXIMUM_ESSAY_LENGTH
):
# Checks to see if the essay is longer than we allow. Truncates if longer
if
len
(
cleaned_essay
)
>
MAXIMUM_ESSAY_LENGTH
:
cleaned_essay
=
cleaned_essay
[
0
:
MAXIMUM_ESSAY_LENGTH
]
cleaned_essay
=
cleaned_essay
[
0
:
MAXIMUM_ESSAY_LENGTH
]
self
.
_text
.
append
(
cleaned_essay
)
self
.
_cleaned_essays
.
append
(
cleaned_essay
)
# Spell correct text using aspell
# Spell correct text using aspell
cleaned_text
,
spell_errors
,
markup_text
=
util_functions
.
spell_correct
(
self
.
_text
[
len
(
self
.
_text
)
-
1
]
)
cleaned_spelled_essay
,
spell_errors
,
markup_text
=
util_functions
.
spell_correct
(
cleaned_essay
)
self
.
_clean_text
.
append
(
cleaned_text
)
self
.
_cleaned_spelled_essays
.
append
(
cleaned_spelled_essay
)
self
.
_spelling_errors
.
append
(
spell_errors
)
self
.
_spelling_errors
.
append
(
spell_errors
)
self
.
_markup_text
.
append
(
markup_text
)
self
.
_markup_text
.
append
(
markup_text
)
# Tokenize text
self
.
_tokens
.
append
(
nltk
.
word_tokenize
(
self
.
_clean_text
[
len
(
self
.
_clean_text
)
-
1
]))
# Create tokens for the text and part of speech tags
# Part of speech tag text
tokens
=
nltk
.
word_tokenize
(
cleaned_spelled_essay
)
self
.
_pos
.
append
(
nltk
.
pos_tag
(
self
.
_clean_text
[
len
(
self
.
_clean_text
)
-
1
]
.
split
(
" "
)))
pos_tags
=
nltk
.
pos_tag
(
cleaned_spelled_essay
.
split
(
" "
))
self
.
_generated
.
append
(
essay_generated
)
self
.
_tokens
.
append
(
tokens
)
# Stem spell corrected text
self
.
_pos_tags
.
append
(
pos_tags
)
# Applies Porter stemming algorithm, a process for removing the commoner morphological and inflexional endings
# from words in English.
porter
=
nltk
.
PorterStemmer
()
porter
=
nltk
.
PorterStemmer
()
por_toks
=
" "
.
join
([
porter
.
stem
(
w
)
for
w
in
self
.
_tokens
[
len
(
self
.
_tokens
)
-
1
]
])
porter_tokens
=
" "
.
join
([
porter
.
stem
(
token
)
for
token
in
tokens
])
self
.
_clean_stem_text
.
append
(
por_tok
s
)
self
.
_cleaned_stem_essays
.
append
(
porter_token
s
)
ret
=
"text: "
+
self
.
_text
[
len
(
self
.
_text
)
-
1
]
+
" score: "
+
str
(
essay_score
)
return
"Essay Added. Text: "
+
cleaned_essay
+
" Score: "
+
str
(
essay_score
)
else
:
raise
util_functions
.
InputError
(
essay_text
,
"arguments need to be in format "
"(text,score). text needs to be string,"
" score needs to be int."
)
def
update_prompt
(
self
,
prompt_text
):
def
update_prompt
(
self
,
prompt_text
):
"""
"""
Update the default prompt string, which is "".
Updates the default prompt (an empty string) to a user specified string
prompt_text should be a string.
Returns the prompt as a confirmation.
Args:
prompt_text (str): the value to set the prompt to
Returns:
(str): The prompt, if it was stored successfully.
"""
"""
if
(
isinstance
(
prompt_text
,
basestring
)):
if
(
isinstance
(
prompt_text
,
basestring
)):
self
.
_prompt
=
util_functions
.
sub_chars
(
prompt_text
)
self
.
_prompt
=
util_functions
.
sub_chars
(
prompt_text
)
ret
=
self
.
_prompt
else
:
else
:
raise
util_functions
.
InputError
(
prompt_text
,
"Invalid prompt. Need to enter a string value."
)
raise
util_functions
.
InputError
(
prompt_text
,
"Invalid prompt. Need to enter a string value."
)
return
re
t
return
self
.
_promp
t
def
generate_additional_essays
(
self
,
e_text
,
e_score
,
dictionary
=
None
,
max_syns
=
3
):
def
generate_additional_essays
(
self
,
original_essay
,
original_score
,
to_generate
=
3
):
"""
"""
Substitute synonyms to generate extra essays from existing ones.
Generates and adds additional essays to the essay set from a base essay by substituting synonyms.
This is done to increase the amount of training data.
Should only be used with lowest scoring essays.
Args:
e_text is the text of the original essay.
original_essay (str): The original essay to generate off of.
e_score is the score of the original essay.
original_score (int): The integer score assigned to the input essay.
dictionary is a fixed dictionary (list) of words to replace.
max_syns defines the maximum number of additional essays to generate. Do not set too high.
Kwargs:
FEATURE REMOVED (GBW): dictionary (dict): A static dictionary of words to replace. Defaults to none.
Feature was removed because it was not implemented fully to begin with.
to_generate (int): The number of additional essays to generate based on synonym substitution
"""
"""
e_toks
=
nltk
.
word_tokenize
(
e_text
)
all_syns
=
[]
original_tokens
=
nltk
.
word_tokenize
(
original_essay
)
for
word
in
e_toks
:
synonym_matrix
=
[]
# Iterates through the words in the original essay
for
word
in
original_tokens
:
synonyms
=
util_functions
.
get_wordnet_syns
(
word
)
synonyms
=
util_functions
.
get_wordnet_syns
(
word
)
if
(
len
(
synonyms
)
>
max_syns
):
# Only substitute on a token if one could generate N=to_generate unique essays on that token.
synonyms
=
random
.
sample
(
synonyms
,
max_syns
)
if
len
(
synonyms
)
>
to_generate
:
all_syns
.
append
(
synonyms
)
# Adds one word on to the list of synonyms, one for each of the new essays
synonyms
=
random
.
sample
(
synonyms
,
to_generate
)
synonym_matrix
.
append
(
synonyms
)
new_essays
=
[]
new_essays
=
[]
for
i
in
range
(
0
,
max_syns
):
# Generates each essay
syn_toks
=
e_toks
for
i
in
range
(
0
,
to_generate
):
for
z
in
range
(
0
,
len
(
e_toks
)):
# Start out from the same base essay
if
len
(
all_syns
[
z
])
>
i
and
(
dictionary
==
None
or
e_toks
[
z
]
in
dictionary
):
new_tokens
=
original_tokens
syn_toks
[
z
]
=
all_syns
[
z
][
i
]
for
z
in
range
(
0
,
len
(
original_tokens
)):
new_essays
.
append
(
" "
.
join
(
syn_toks
))
# Replace a given token ONLY if it is not the first token in the dictionary??!?!?!!?!
for
z
in
xrange
(
0
,
len
(
new_essays
)):
if
len
(
synonym_matrix
[
z
])
>
i
:
self
.
add_essay
(
new_essays
[
z
],
e_score
,
1
)
new_tokens
[
z
]
=
synonym_matrix
[
z
][
i
]
new_essays
.
append
(
" "
.
join
(
new_tokens
))
# Adds each new essay to the list of essays in this essay set
for
i
in
xrange
(
0
,
len
(
new_essays
)):
self
.
add_essay
(
new_essays
[
i
],
original_score
,
1
)
ease/grade.py
View file @
1e575001
...
@@ -43,7 +43,7 @@ def grade(grader_data, submission):
...
@@ -43,7 +43,7 @@ def grade(grader_data, submission):
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'feedback'
:
""
,
'success'
:
False
,
'confidence'
:
0
}
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'feedback'
:
""
,
'success'
:
False
,
'confidence'
:
0
}
has_error
=
False
has_error
=
False
grader_set
=
EssaySet
(
essaytype
=
"test"
)
grader_set
=
EssaySet
(
essay
_
type
=
"test"
)
feedback
=
{}
feedback
=
{}
model
,
extractor
=
get_classifier_and_ext
(
grader_data
)
model
,
extractor
=
get_classifier_and_ext
(
grader_data
)
...
...
ease/model_creator.py
View file @
1e575001
...
@@ -82,7 +82,7 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
...
@@ -82,7 +82,7 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
for
i
in
xrange
(
0
,
len
(
text
)):
for
i
in
xrange
(
0
,
len
(
text
)):
x
.
add_essay
(
text
[
i
],
score
[
i
])
x
.
add_essay
(
text
[
i
],
score
[
i
])
if
score
[
i
]
==
min
(
score
)
and
generate_additional
==
True
:
if
score
[
i
]
==
min
(
score
)
and
generate_additional
==
True
:
x
.
generate_additional_essays
(
x
.
_clean
_text
[
len
(
x
.
_clean_text
)
-
1
],
score
[
i
])
x
.
generate_additional_essays
(
x
.
_clean
ed_spelled_essays
[
len
(
x
.
_cleaned_spelled_essays
)
-
1
],
score
[
i
])
x
.
update_prompt
(
prompt_string
)
x
.
update_prompt
(
prompt_string
)
...
...
ease/predictor_set.py
View file @
1e575001
...
@@ -85,7 +85,7 @@ class PredictorSet(object):
...
@@ -85,7 +85,7 @@ class PredictorSet(object):
#Create essay sets for textual features if needed
#Create essay sets for textual features if needed
if
len
(
self
.
_textual_features
)
==
0
:
if
len
(
self
.
_textual_features
)
==
0
:
for
i
in
xrange
(
0
,
len
(
textual_features
)):
for
i
in
xrange
(
0
,
len
(
textual_features
)):
self
.
_essay_sets
.
append
(
essay_set
.
EssaySet
(
essaytype
=
self
.
_type
))
self
.
_essay_sets
.
append
(
essay_set
.
EssaySet
(
essay
_
type
=
self
.
_type
))
#Add numeric and textual features
#Add numeric and textual features
self
.
_numeric_features
.
append
(
numeric_features
)
self
.
_numeric_features
.
append
(
numeric_features
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment