edx / ease · Commit 515420a9

Authored Jun 13, 2014 by gradyward
Parent: 4b8ef34b

Refactored the feature_extractor file to include just what we want it to, in a logical manner.

Showing 3 changed files with 87 additions and 194 deletions:

    ease/essay_set.py            +2   -2
    ease/feature_extractor.py    +84  -170
    ease/grade.py                +1   -22
ease/essay_set.py

@@ -52,7 +52,7 @@ class EssaySet(object):
         self._cleaned_spelled_essays = []
         self._tokens = []
         self._pos_tags = []
-        self._cleaned_stem_essays = []
+        self._cleaned_stemmed_essays = []
         self._generated = []
         self._prompt = ""
         self._spelling_errors = []
@@ -147,7 +147,7 @@ class EssaySet(object):
         # from words in English.
         porter = nltk.PorterStemmer()
         porter_tokens = " ".join([porter.stem(token) for token in tokens])
-        self._cleaned_stem_essays.append(porter_tokens)
+        self._cleaned_stemmed_essays.append(porter_tokens)
         return "Essay Added. Text: " + cleaned_essay + " Score: " + str(essay_score)
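After the rename, `_cleaned_stemmed_essays` reads as what it holds: one space-joined string of Porter-stemmed tokens per essay. A minimal standalone sketch of the stemming step shown above (the sample tokens are invented):

import nltk

# Mirror of the porter_tokens line in the diff above.
porter = nltk.PorterStemmer()
tokens = ["studies", "showed", "that", "essays", "varied"]  # invented sample
porter_tokens = " ".join([porter.stem(token) for token in tokens])
print(porter_tokens)  # e.g. "studi show that essay vari"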
ease/feature_extractor.py

@@ -35,7 +35,6 @@ class FeatureExtractor(object):
     features from an object which will allow object classification.
     """

     def __init__(self, essay_set, max_features_pass_2=200):
         """
         Initializes requisite dictionaries/statistics before the feature extraction can occur.
@@ -50,7 +49,6 @@ class FeatureExtractor(object):
             max_features_pass_2: The maximum number of features we consider on the second pass of vocabulary grooming
         """
         if hasattr(essay_set, '_type'):
             if essay_set._type == "train":
                 # Finds vocabulary which differentiates good/high scoring essays from bad/low scoring essays.
@@ -60,7 +58,7 @@ class FeatureExtractor(object):
                 # Finds vocab (same criteria as above), but with essays that have been porter stemmed
-                stemmed_vocab = util_functions.get_vocab(essay_set._clean_stem_text, essay_set._scores,
+                stemmed_vocab = util_functions.get_vocab(essay_set._cleaned_stemmed_essays, essay_set._scores,
                                                          max_features_pass_2=max_features_pass_2)
                 # Constructs dictionaries trained based on the important vocabularies
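The two `get_vocab` passes above do the vocabulary grooming; the function itself lives in `util_functions`, which this commit does not touch, so its internals are not shown here. As a rough, hypothetical stand-in only (the name `pick_vocab` and the correlation criterion are assumptions, not EASE's actual selection logic), grooming a score-discriminating vocabulary might look like:

import numpy

def pick_vocab(essays, scores, max_features=200):
    # Hypothetical stand-in for util_functions.get_vocab: keep the tokens
    # whose per-essay counts correlate most strongly, in absolute value,
    # with the essay scores.
    vocab = sorted(set(w for e in essays for w in e.split()))
    counts = numpy.array([[e.split().count(w) for w in vocab] for e in essays],
                         dtype=float)
    scores = numpy.asarray(scores, dtype=float)
    centered = counts - counts.mean(axis=0)
    s_centered = scores - scores.mean()
    denom = numpy.sqrt((centered ** 2).sum(axis=0) * (s_centered ** 2).sum()) + 1e-12
    corr = numpy.abs(centered.T.dot(s_centered)) / denom
    top = numpy.argsort(corr)[::-1][:max_features]
    return [vocab[i] for i in top]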
@@ -81,101 +79,55 @@ class FeatureExtractor(object):
                                                                             essay_set._pos_tags,
                                                                             essay_set._cleaned_essays,
                                                                             essay_set._tokens)
                 # NOTE!!! Here, I changed the definition from utilizing good grammar ratios to using the counts of
-                # gramatical errors. Though this was not what the original author used, it is clearly what his code
+                # grammatical errors. Though this was not what the original author used, it is clearly what his code
                 # implies, as if this is intended to be a true "grammar errors per character", we should have that
                 # exact number. The replaced call is included for posterity.
                 # self._grammar_errors_per_character =
                 #       (sum(good_pos_tags) / float(sum([len(t) for t in essay_set._text])))
                 total_grammar_errors = sum(len(l) for l in bad_pos_positions)
-                total_characters = float(sum([len(t) for t in essay_set._text]))
+                total_characters = float(sum([len(t) for t in essay_set._cleaned_essays]))
                 self._grammar_errors_per_character = total_grammar_errors / total_characters
                 # Generates a bag of vocabulary features
-                vocabulary_features = self.generate_vocabulary_features(essay_set)
+                vocabulary_features = self._generate_vocabulary_features(essay_set)
                 # Sum of a row of bag of words features (topical words in an essay)
                 feature_row_sum = numpy.sum(vocabulary_features[:, :])
                 # Average index of how "topical" essays are
-                self._mean_f_prop = feature_row_sum / float(sum([len(t) for t in essay_set._text]))
+                self._mean_topical_index = feature_row_sum / float(sum([len(t) for t in essay_set._cleaned_essays]))
             else:
                 raise util_functions.InputError(essay_set, "needs to be an essay set of the train type.")
         else:
             raise util_functions.InputError(essay_set, "wrong input. need an essay set object.")
-        self._good_pos_ngrams = self.get_good_pos_ngrams()
-        self.dict_initialized = False
+        self._good_pos_ngrams = self._get_good_pos_ngrams()
         self._spell_errors_per_character = 0
         self._grammar_errors_per_character = 0

-    def get_good_pos_ngrams(self):
-        """
-        Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
-        Returns the list and caches the file
-        Returns:
-            A list of all grammatically correct parts of speech.
-        """
-        if (os.path.isfile(NGRAM_PATH)):
-            good_pos_ngrams = pickle.load(open(NGRAM_PATH, 'rb'))
-        else:
-            #Hard coded an incomplete list in case the needed files cannot be found
-            good_pos_ngrams = ['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
-                               '. DT NNP', '. DT NNP NNP', 'DT NNP', 'DT NNP NNP', 'DT NNP NNP NNP', 'NNP NNP',
-                               'NNP NNP NNP', 'NNP NNP NNP NNP', 'NNP NNP NNP .', 'NNP NNP .', 'NNP NNP . TO',
-                               'NNP .', 'NNP . TO', 'NNP . TO NNP', '. TO', '. TO NNP', '. TO NNP NNP', 'TO NNP',
-                               'TO NNP NNP']
-        return good_pos_ngrams
-
-    def _get_grammar_errors(self, pos, essays, tokens):
+    def generate_features(self, essay_set):
         """
-        Internal function to get the number of grammar errors in given text
+        Generates bag of words, length, and prompt features from an essay set object
         Args:
-            pos: list of pos values for an essay set
-            essays: list of essay texts
-            tokens: list of the lists of the tokens in each essay
+            essay_set (EssaySet): the essay set to extract features for
         Returns:
-            Tuple of the form (good_grammar_ratios, bad_pos_positions)
-            The former is a list of each essay's "good grammar ratio", which is not very well defined
-            The latter is a list of lists of each essay's grammatical mistakes as a location in its tokens
+            Array of features with the following included:
+                - Length Features
+                - Vocabulary Features (both Normal and Stemmed Vocabulary)
+                - Prompt Features
         """
-        word_counts = [max(len(t), 1) for t in tokens]
-        good_grammar_ratios = []
-        min_pos_seq = 2
-        max_pos_seq = 4
-        bad_pos_positions = []
-        for i in xrange(0, len(essays)):
-            pos_seq = [tag[1] for tag in pos[i]]
-            pos_ngrams = util_functions.ngrams(pos_seq, min_pos_seq, max_pos_seq)
-            long_pos_ngrams = [z for z in pos_ngrams if z.count(' ') == (max_pos_seq - 1)]
-            bad_pos_tuples = [[z, z + max_pos_seq] for z in xrange(0, len(long_pos_ngrams))
-                              if long_pos_ngrams[z] not in self._good_pos_ngrams]
-            bad_pos_tuples.sort(key=operator.itemgetter(1))
-            to_delete = []
-            for m in reversed(xrange(len(bad_pos_tuples) - 1)):
-                start, end = bad_pos_tuples[m]
-                for j in xrange(m + 1, len(bad_pos_tuples)):
-                    lstart, lend = bad_pos_tuples[j]
-                    if lstart >= start and lstart <= end:
-                        bad_pos_tuples[m][1] = bad_pos_tuples[j][1]
-                        to_delete.append(j)
-            fixed_bad_pos_tuples = [bad_pos_tuples[z] for z in xrange(0, len(bad_pos_tuples)) if z not in to_delete]
-            bad_pos_positions.append(fixed_bad_pos_tuples)
-            overlap_ngrams = [z for z in pos_ngrams if z in self._good_pos_ngrams]
-            if (len(pos_ngrams) - len(overlap_ngrams)) > 0:
-                divisor = len(pos_ngrams) / len(pos_seq)
-            else:
-                divisor = 1
-            if divisor == 0:
-                divisor = 1
-            good_grammar_ratio = (len(pos_ngrams) - len(overlap_ngrams)) / divisor
-            good_grammar_ratios.append(good_grammar_ratio)
-        return good_grammar_ratios, bad_pos_positions
+        vocabulary_features = self._generate_vocabulary_features(essay_set)
+        length_features = self._generate_length_features(essay_set)
+        prompt_features = self._generate_prompt_features(essay_set)
+        # Lumps them all together, copies to solidify, and returns
+        overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1)
+        overall_features = overall_features.copy()
+        return overall_features

-    def generate_length_features(self, essay_set):
+    def _generate_length_features(self, essay_set):
         """
         Generates length based features from an essay set
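The NOTE in the hunk above is the substantive change here: `_grammar_errors_per_character` now divides an explicit count of flagged spans by the total character count of the cleaned essays, instead of the old good-POS-tag sum. A small worked example of that arithmetic, with invented inputs:

# Invented inputs: per-essay lists of [start, end] token spans that failed
# the good-POS-ngram check, plus the cleaned essay texts.
bad_pos_positions = [[[0, 4], [7, 11]], [[2, 6]]]  # essay 1: 2 errors, essay 2: 1
cleaned_essays = ["first essay text here", "second essay"]

total_grammar_errors = sum(len(l) for l in bad_pos_positions)           # 3
total_characters = float(sum([len(t) for t in cleaned_essays]))         # 33.0
grammar_errors_per_character = total_grammar_errors / total_characters  # ~0.09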
@@ -196,7 +148,8 @@ class FeatureExtractor(object):
         chars_per_word = [lengths[m] / float(word_counts[m]) for m in xrange(0, len(essays))]
-        good_grammar_ratios, bad_pos_positions = self._get_grammar_errors(essay_set._pos, essay_set._text,
-                                                                          essay_set._tokens)
+        # SEE COMMENT AROUND LINE 85
+        good_grammar_ratios, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags, essay_set._cleaned_essays,
+                                                                          essay_set._tokens)
         good_pos_tag_proportion = [len(bad_pos_positions[m]) / float(word_counts[m]) for m in xrange(0, len(essays))]
         length_array = numpy.array((
@@ -205,7 +158,7 @@ class FeatureExtractor(object):
         return length_array.copy()

-    def generate_vocabulary_features(self, essay_set):
+    def _generate_vocabulary_features(self, essay_set):
         """
         Generates a bag of words features from an essay set and a trained FeatureExtractor (self)
@@ -217,36 +170,14 @@ class FeatureExtractor(object):
             An array of features to be used for extraction
         """
         # Calculates Stem and Normal features
-        stem_features = self._stem_dict.transform(essay_set._cleaned_stem_essays)
+        stem_features = self._stem_dict.transform(essay_set._cleaned_stemmed_essays)
         normal_features = self._normal_dict.transform(essay_set._cleaned_essays)
         # Mushes them together and returns
         bag_features = numpy.concatenate((stem_features.toarray(), normal_features.toarray()), axis=1)
         return bag_features.copy()

-    def generate_features(self, essay_set):
-        """
-        Generates bag of words, length, and prompt features from an essay set object
-        Args:
-            essay_set (EssaySet): the essay set to extract features for
-        Returns:
-            Array of features with the following included:
-                - Length Features
-                - Vocabulary Features (both Normal and Stemmed Vocabulary)
-                - Prompt Features
-        """
-        vocabulary_features = self.generate_vocabulary_features(essay_set)
-        length_features = self.generate_length_features(essay_set)
-        prompt_features = self.generate_prompt_features(essay_set)
-        # Lumps them all together, copies to solidify, and returns
-        overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1)
-        overall_features = overall_features.copy()
-        return overall_features

-    def generate_prompt_features(self, essay_set):
+    def _generate_prompt_features(self, essay_set):
         """
         Generates prompt based features from an essay set object and internal prompt variable.
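The `.transform(...).toarray()` calls above suggest that `_stem_dict` and `_normal_dict` behave like scikit-learn `CountVectorizer` objects pinned to the groomed vocabularies; their construction happens earlier in `__init__` and is an assumption here, not shown in this hunk. A minimal sketch of the bag-of-words step under that assumption:

import numpy
from sklearn.feature_extraction.text import CountVectorizer

# Assumed construction: vectorizers fixed to the trained vocabularies.
normal_dict = CountVectorizer(vocabulary=["essay", "score", "topic"])
stem_dict = CountVectorizer(vocabulary=["essai", "stem"])

cleaned_essays = ["the essay is on topic", "score the essay"]
cleaned_stemmed_essays = ["the essai is on stem topic", "score the essai"]

# Mirror of the transform-and-concatenate step: one row per essay,
# stemmed-vocabulary counts followed by normal-vocabulary counts.
stem_features = stem_dict.transform(cleaned_stemmed_essays)
normal_features = normal_dict.transform(cleaned_essays)
bag_features = numpy.concatenate((stem_features.toarray(), normal_features.toarray()), axis=1)
print(bag_features.shape)  # (2, 5)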
@@ -285,85 +216,68 @@ class FeatureExtractor(object):
         return prompt_arr.copy()

-    def generate_feedback(self, essay_set, features=None):
+    def _get_grammar_errors(self, pos, essays, tokens):
         """
-        Generates feedback for a given set of essays
+        Internal function to get the number of grammar errors in given text
         Args:
-            essay_set (EssaySet): The essay set that will have feedback assigned to it.
-        Kwargs:
-            features (list of feature): optionally, a matrix of features extracted from e_set using FeatureExtractor
+            pos: list of pos values for an essay set
+            essays: list of essay texts
+            tokens: list of the lists of the tokens in each essay
         Returns:
+            Tuple of the form (good_grammar_ratios, bad_pos_positions)
+            The former is a list of each essay's "good grammar ratio", which is not very well defined
+            The latter is a list of lists of each essay's grammatical mistakes as a location in its tokens
         """
-        # Set ratio to modify thresholds for grammar/spelling errors
-        modifier_ratio = 1.05
-        # GBW TODO: This might be wrong.
-        # Calc number of grammar and spelling errors per character
-        set_grammar, bad_pos_positions = self._get_grammar_errors(essay_set._pos, essay_set._text, essay_set._tokens)
-        set_grammar_per_character = [set_grammar[m] / float(len(essay_set._cleaned_essays[m]) + .1)
-                                     for m in xrange(0, len(essay_set._cleaned_essays))]
-        set_spell_errors_per_character = [essay_set._spelling_errors[m] / float(len(essay_set._cleaned_essays[m]) + .1)
-                                          for m in xrange(0, len(essay_set._cleaned_essays))]
-        # Iterate through essays and create a feedback dictionary for each
-        all_feedback = []
-        for m in xrange(0, len(essay_set._text)):
-            #Be very careful about changing these messages!
-            individual_feedback = {'grammar': "Grammar: Ok.",
-                                   'spelling': "Spelling: Ok.",
-                                   'markup_text': "",
-                                   'grammar_per_char': set_grammar_per_character[m],
-                                   'spelling_per_char': set_spell_errors_per_character[m],
-                                   'too_similar_to_prompt': False,
-                                   }
-            markup_tokens = essay_set._markup_text[m].split(" ")
-            # This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
-            # disjointed
-            bad_pos_starts = [z[0] for z in bad_pos_positions[m]]
-            bad_pos_ends = [z[1] - 1 for z in bad_pos_positions[m]]
-            for z in xrange(0, len(markup_tokens)):
-                if z in bad_pos_starts:
-                    markup_tokens[z] = '<bg>' + markup_tokens[z]
-                elif z in bad_pos_ends:
-                    markup_tokens[z] += "</bg>"
-            if len(bad_pos_ends) > 0 and len(bad_pos_starts) > 0 and len(markup_tokens) > 1:
-                if max(bad_pos_ends) > (len(markup_tokens) - 1) and max(bad_pos_starts) < (len(markup_tokens) - 1):
-                    markup_tokens[len(markup_tokens) - 1] += "</bg>"
-            # Display messages if grammar/spelling errors greater than average in training set
-            if set_grammar_per_character[m] > (self._grammar_errors_per_character * modifier_ratio):
-                individual_feedback['grammar'] = "Grammar: More grammar errors than average."
-            if set_spell_errors_per_character[m] > (self._spell_errors_per_character * modifier_ratio):
-                individual_feedback['spelling'] = "Spelling: More spelling errors than average."
-            # Test topicality by calculating # of on topic words per character and comparing to the training set
-            # mean. Requires features to be passed in
-            if features is not None:
-                f_row_sum = numpy.sum(features[m, 12:])
-                f_row_prop = f_row_sum / len(essay_set._text[m])
-                if f_row_prop < (self._mean_f_prop / 1.5) or len(essay_set._text[m]) < 20:
-                    individual_feedback['topicality'] = "Topicality: Essay may be off topic."
-                if (features[m, 9] > .6):
-                    individual_feedback['prompt_overlap'] = "Prompt Overlap: Too much overlap with prompt."
-                    individual_feedback['too_similar_to_prompt'] = True
-                    log.debug(features[m, 9])
-            # Create string representation of markup text
-            markup_string = " ".join(markup_tokens)
-            individual_feedback['markup_text'] = markup_string
-            all_feedback.append(individual_feedback)
-        return all_feedback
+        good_grammar_ratios = []
+        min_pos_seq = 2
+        max_pos_seq = 4
+        bad_pos_positions = []
+        for i in xrange(0, len(essays)):
+            pos_seq = [tag[1] for tag in pos[i]]
+            pos_ngrams = util_functions.ngrams(pos_seq, min_pos_seq, max_pos_seq)
+            long_pos_ngrams = [z for z in pos_ngrams if z.count(' ') == (max_pos_seq - 1)]
+            bad_pos_tuples = [[z, z + max_pos_seq] for z in xrange(0, len(long_pos_ngrams))
+                              if long_pos_ngrams[z] not in self._good_pos_ngrams]
+            bad_pos_tuples.sort(key=operator.itemgetter(1))
+            to_delete = []
+            for m in reversed(xrange(len(bad_pos_tuples) - 1)):
+                start, end = bad_pos_tuples[m]
+                for j in xrange(m + 1, len(bad_pos_tuples)):
+                    lstart, lend = bad_pos_tuples[j]
+                    if lstart >= start and lstart <= end:
+                        bad_pos_tuples[m][1] = bad_pos_tuples[j][1]
+                        to_delete.append(j)
+            fixed_bad_pos_tuples = [bad_pos_tuples[z] for z in xrange(0, len(bad_pos_tuples)) if z not in to_delete]
+            bad_pos_positions.append(fixed_bad_pos_tuples)
+            overlap_ngrams = [z for z in pos_ngrams if z in self._good_pos_ngrams]
+            if (len(pos_ngrams) - len(overlap_ngrams)) > 0:
+                divisor = len(pos_ngrams) / len(pos_seq)
+            else:
+                divisor = 1
+            if divisor == 0:
+                divisor = 1
+            good_grammar_ratio = (len(pos_ngrams) - len(overlap_ngrams)) / divisor
+            good_grammar_ratios.append(good_grammar_ratio)
+        return good_grammar_ratios, bad_pos_positions

+    def _get_good_pos_ngrams(self):
+        """
+        Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
+        Returns the list and caches the file
+        Returns:
+            A list of all grammatically correct parts of speech.
+        """
+        #TODO This is still bad.
+        if os.path.isfile(NGRAM_PATH):
+            good_pos_ngrams = pickle.load(open(NGRAM_PATH, 'rb'))
+        else:
+            # Hard coded an incomplete list in case the needed files cannot be found
+            good_pos_ngrams = ['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
+                               '. DT NNP', '. DT NNP NNP', 'DT NNP', 'DT NNP NNP', 'DT NNP NNP NNP', 'NNP NNP',
+                               'NNP NNP NNP', 'NNP NNP NNP NNP', 'NNP NNP NNP .', 'NNP NNP .', 'NNP NNP . TO',
+                               'NNP .', 'NNP . TO', 'NNP . TO NNP', '. TO', '. TO NNP', '. TO NNP NNP', 'TO NNP',
+                               'TO NNP NNP']
+        return good_pos_ngrams
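`util_functions.ngrams` is defined outside this file; the `z.count(' ') == (max_pos_seq - 1)` filter in `_get_grammar_errors` implies it returns space-joined tag sequences of every length from `min_pos_seq` to `max_pos_seq`. A hypothetical equivalent, together with the whitelist test against the hard-coded list above:

def pos_ngrams(pos_seq, min_n, max_n):
    # Hypothetical equivalent of util_functions.ngrams: every space-joined
    # tag n-gram with min_n <= n <= max_n, in order of appearance.
    grams = []
    for n in range(min_n, max_n + 1):
        for i in range(len(pos_seq) - n + 1):
            grams.append(" ".join(pos_seq[i:i + n]))
    return grams

# Invented example: POS tags for a two-sentence snippet.
tags = ["NNP", "VBD", ".", "DT", "NN", "VBD", "."]
good = {"NNP .", ". DT", "DT NNP"}  # tiny subset of the whitelist above
four_grams = [g for g in pos_ngrams(tags, 2, 4) if g.count(" ") == 3]
bad_spans = [g for g in four_grams if g not in good]  # candidate error spans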
ease/grade.py

@@ -66,36 +66,15 @@ def grade(grader_data, submission):
     grader_features = None
     try:
         grader_features = extractor.generate_features(grader_set)
-        feedback = extractor.generate_feedback(grader_set, grader_features)[0]
         results['score'] = int(model.predict(grader_features)[0])
     except:
         error_message = "Could not extract features and score essay."
         log.exception(error_message)
         results['errors'].append(error_message)

-    # We have gotten through without an error, so we have been successful
+    # If we have errors above, we do not continue here, but return the dictionary of failure
     if len(results['errors']) < 0:
+        # We have gotten through without an error, so we have been successful
         results['success'] = True
-        # If the essay is just a copy of the prompt (or too similar), return a 0 as the score
-        if 'too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']:
-            results['score'] = 0
-        # Generate feedback, identifying a number of explicable problem areas
-        results['feedback'] = {
-            'spelling': feedback['spelling'],
-            'grammar': feedback['grammar'],
-            'markup-text': feedback['markup_text'],
-        }
-        if 'topicality' in feedback and 'prompt_overlap' in feedback:
-            results['feedback'].update({
-                'topicality': feedback['topicality'],
-                'prompt-overlap': feedback['prompt_overlap'],
-            })
+    # If we get here, that means there was 1+ error above. Set success to false and return
     else:
         results['success'] = False
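Worth noting when reading this hunk: with `generate_feedback` removed, `results` now carries only the score, the error list, and the success flag, and the success branch is guarded by `len(results['errors']) < 0`, a condition that can never be true as written; the surrounding comments imply an empty-list test was intended. A minimal sketch of the flow those comments describe (the `== 0` comparison is an inference from the comments, not what this commit ships):

results = {'errors': [], 'score': 0, 'success': False}
# ... generate_features() and model.predict() run inside the try block above ...

# What the comments describe: succeed only when no error was recorded.
if len(results['errors']) == 0:   # the committed code compares < 0, which never fires
    results['success'] = True
else:
    results['success'] = False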