edx / ease

Commit 8bc96cb8, authored Jun 12, 2014 by gradyward

Finished first round changes to feature_extractor.py
Still some serious logical work needs to be done.

parent 2a2972f8
Showing 2 changed files with 72 additions and 50 deletions

ease/feature_extractor.py   +71 -49
ease/grade.py               +1  -1
ease/feature_extractor.py (view file @ 8bc96cb8)

...
@@ -91,7 +91,7 @@ class FeatureExtractor(object):
             self._grammar_errors_per_character = total_grammar_errors / total_characters
         # Generates a bag of vocabulary features
-        vocabulary_features = self.gen_vocabulary_features(essay_set)
+        vocabulary_features = self.generate_vocabulary_features(essay_set)
         # Sum of a row of bag of words features (topical words in an essay)
         feature_row_sum = numpy.sum(vocabulary_features[:, :])
...
@@ -175,31 +175,37 @@ class FeatureExtractor(object):
             good_grammar_ratios.append(good_grammar_ratio)
         return good_grammar_ratios, bad_pos_positions

-    def gen_length_feats(self, e_set):
+    def generate_length_features(self, essay_set):
         """
         Generates length based features from an essay set
-        Generally an internal function called by gen_feats
-        Returns an array of length features
-        e_set - EssaySet object
+        An exclusively internal function, called by generate_features
+        Args:
+            essay_set (EssaySet): the essay set to extract length features from
+        Returns:
+            An array of features that have been extracted based on length
         """
-        text = e_set._text
-        lengths = [len(e) for e in text]
-        word_counts = [max(len(t), 1) for t in e_set._tokens]
-        comma_count = [e.count(",") for e in text]
-        ap_count = [e.count("'") for e in text]
-        punc_count = [e.count(".") + e.count("?") + e.count("!") for e in text]
-        chars_per_word = [lengths[m] / float(word_counts[m]) for m in xrange(0, len(text))]
+        essays = essay_set._cleaned_essays
+        lengths = [len(e) for e in essays]
+        word_counts = [max(len(t), 1) for t in essay_set._tokens]
+        comma_count = [e.count(",") for e in essays]
+        apostrophe_count = [e.count("'") for e in essays]
+        punctuation_count = [e.count(".") + e.count("?") + e.count("!") for e in essays]
+        chars_per_word = [lengths[m] / float(word_counts[m]) for m in xrange(0, len(essays))]
-        good_pos_tags, bad_pos_positions = self._get_grammar_errors(e_set._pos, e_set._text, e_set._tokens)
-        good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))]
+        # SEE COMMENT AROUND LINE 85
+        good_grammar_ratios, bad_pos_positions = self._get_grammar_errors(essay_set._pos, essay_set._text, essay_set._tokens)
+        good_pos_tag_proportion = [len(bad_pos_positions[m]) / float(word_counts[m]) for m in xrange(0, len(essays))]
-        length_arr = numpy.array((lengths, word_counts, comma_count, ap_count, punc_count, chars_per_word, good_pos_tags, good_pos_tag_prop)).transpose()
+        length_array = numpy.array((lengths, word_counts, comma_count, apostrophe_count, punctuation_count, chars_per_word, good_grammar_ratios, good_pos_tag_proportion)).transpose()
-        return length_arr.copy()
+        return length_array.copy()
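For readers skimming this hunk: the matrix assembled by generate_length_features has one row per essay and one column per surface feature. A minimal, self-contained sketch with made-up essays and tokens (the two grammar-derived columns are left out because they depend on self._get_grammar_errors):

    import numpy

    # Toy inputs standing in for essay_set._cleaned_essays and essay_set._tokens.
    essays = ["I like cats.", "Dogs, cats and fish!"]
    tokens = [["I", "like", "cats"], ["Dogs", "cats", "and", "fish"]]

    lengths = [len(e) for e in essays]
    word_counts = [max(len(t), 1) for t in tokens]
    comma_count = [e.count(",") for e in essays]
    apostrophe_count = [e.count("'") for e in essays]
    punctuation_count = [e.count(".") + e.count("?") + e.count("!") for e in essays]
    chars_per_word = [lengths[m] / float(word_counts[m]) for m in range(len(essays))]

    # One row per essay, one column per length-based feature.
    length_array = numpy.array((lengths, word_counts, comma_count, apostrophe_count,
                                punctuation_count, chars_per_word)).transpose()
    print(length_array.shape)  # (2, 6) for this toy set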
-    def gen_vocabulary_features(self, essay_set):
+    def generate_vocabulary_features(self, essay_set):
         """
         Generates a bag of words features from an essay set and a trained FeatureExtractor (self)
...
@@ -231,23 +237,28 @@ class FeatureExtractor(object):
         - Vocabulary Features (both Normal and Stemmed Vocabulary)
         - Prompt Features
         """
-        vocabulary_features = self.gen_vocabulary_features(essay_set)
-        length_features = self.gen_length_feats(essay_set)
-        prompt_features = self.gen_prompt_feats(essay_set)
+        vocabulary_features = self.generate_vocabulary_features(essay_set)
+        length_features = self.generate_length_features(essay_set)
+        prompt_features = self.generate_prompt_features(essay_set)
         # Lumps them all together, copies to solidify, and returns
         overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1)
         overall_features = overall_features.copy()
         return overall_features
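A quick sketch of what the numpy.concatenate call above does, using toy feature blocks in place of the real generate_* outputs (the shapes here are illustrative, not taken from the commit):

    import numpy

    # Two essays; the real blocks come from generate_length_features,
    # generate_prompt_features and generate_vocabulary_features.
    length_features = numpy.array([[12.0, 3.0], [20.0, 4.0]])
    prompt_features = numpy.array([[0.1], [0.4]])
    vocabulary_features = numpy.array([[1.0, 0.0, 2.0], [0.0, 3.0, 1.0]])

    # axis=1 keeps one row per essay and stacks the feature columns side by side.
    overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1)
    print(overall_features.shape)  # (2, 6)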
-    def gen_prompt_feats(self, e_set):
+    def generate_prompt_features(self, essay_set):
         """
         Generates prompt based features from an essay set object and internal prompt variable.
-        Generally called internally by gen_feats
-        Returns an array of prompt features
-        e_set - EssaySet object
+        Called internally by generate_features
+        Args:
+            essay_set (EssaySet): an essay set object that is manipulated to generate prompt features
+        Returns:
+            an array of prompt features
         """
-        prompt_toks = nltk.word_tokenize(e_set._prompt)
+        prompt_toks = nltk.word_tokenize(essay_set._prompt)
         expand_syns = []
         for word in prompt_toks:
             synonyms = util_functions.get_wordnet_syns(word)
...
@@ -255,7 +266,7 @@ class FeatureExtractor(object):
         expand_syns = list(chain.from_iterable(expand_syns))
         prompt_overlap = []
         prompt_overlap_prop = []
-        for j in e_set._tokens:
+        for j in essay_set._tokens:
             tok_length = len(j)
             if(tok_length == 0):
                 tok_length = 1
...
@@ -263,7 +274,7 @@ class FeatureExtractor(object):
             prompt_overlap_prop.append(prompt_overlap[len(prompt_overlap) - 1] / float(tok_length))
         expand_overlap = []
         expand_overlap_prop = []
-        for j in e_set._tokens:
+        for j in essay_set._tokens:
             tok_length = len(j)
             if(tok_length == 0):
                 tok_length = 1
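For context, the proportion computed in this region is essentially "tokens shared with the prompt, divided by the essay's token count". A rough stand-alone illustration of that idea (not the commit's code; the real version also expands the prompt with WordNet synonyms via util_functions.get_wordnet_syns):

    # Illustrative only: toy prompt and essay tokens.
    prompt_toks = ["describe", "your", "favorite", "animal"]
    essay_tokens = ["my", "favorite", "animal", "is", "a", "cat"]

    tok_length = max(len(essay_tokens), 1)  # guard against empty essays, as the diff does
    overlap = sum(1 for tok in essay_tokens if tok in prompt_toks)
    overlap_prop = overlap / float(tok_length)
    print(overlap, overlap_prop)  # 2 0.333...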
...
@@ -274,29 +285,40 @@ class FeatureExtractor(object):
         return prompt_arr.copy()

-    def gen_feedback(self, e_set, features=None):
+    def generate_feedback(self, essay_set, features=None):
         """
-        Generate feedback for a given set of essays
-        e_set - EssaySet object
-        features - optionally, pass in a matrix of features extracted from e_set using FeatureExtractor
-        in order to get off topic feedback.
-        Returns a list of lists (one list per essay in e_set)
-        e_set - EssaySet object
+        Generates feedback for a given set of essays
+        Args:
+            essay_set (EssaySet): The essay set that will have feedback assigned to it.
+        Kwargs:
+            features (list of feature): optionally, a matrix of features extracted from e_set using FeatureExtractor
         """
         #TODO This is still bad.
         #Set ratio to modify thresholds for grammar/spelling errors
         modifier_ratio = 1.05
         #GBW TODO: This might be wrong.
         #Calc number of grammar and spelling errors per character
-        set_grammar, bad_pos_positions = self._get_grammar_errors(e_set._pos, e_set._text, e_set._tokens)
-        set_grammar_per_character = [set_grammar[m] / float(len(e_set._text[m]) + .1) for m in xrange(0, len(e_set._text))]
-        set_spell_errors_per_character = [e_set._spelling_errors[m] / float(len(e_set._text[m]) + .1) for m in xrange(0, len(e_set._text))]
+        set_grammar, bad_pos_positions = self._get_grammar_errors(essay_set._pos, essay_set._text, essay_set._tokens)
+        set_grammar_per_character = [set_grammar[m] / float(len(essay_set._cleaned_essays[m]) + .1) for m in xrange(0, len(essay_set._cleaned_essays))]
+        set_spell_errors_per_character = [essay_set._spelling_errors[m] / float(len(essay_set._cleaned_essays[m]) + .1) for m in xrange(0, len(essay_set._cleaned_essays))]
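The + .1 in the denominators above is a guard against empty essays. A small sketch of the same per-character calculation with toy numbers (variable names here are illustrative):

    # Dividing by (length + .1) avoids a ZeroDivisionError when a cleaned essay is empty.
    cleaned_essays = ["The cat sat.", ""]
    grammar_errors = [1, 0]
    spelling_errors = [2, 0]

    grammar_per_char = [grammar_errors[m] / float(len(cleaned_essays[m]) + .1)
                        for m in range(len(cleaned_essays))]
    spelling_per_char = [spelling_errors[m] / float(len(cleaned_essays[m]) + .1)
                         for m in range(len(cleaned_essays))]
    print(grammar_per_char)   # [~0.083, 0.0]
    print(spelling_per_char)  # [~0.165, 0.0]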
-        #Iterate through essays and create a feedback dict for each
+        #Iterate through essays and create a feedback dictionary for each
         all_feedback = []
-        for m in xrange(0, len(e_set._text)):
+        for m in xrange(0, len(essay_set._text)):
             #Be very careful about changing these messages!
             individual_feedback = {'grammar': "Grammar: Ok.",
                                    'spelling': "Spelling: Ok.",
...
@@ -305,7 +327,7 @@ class FeatureExtractor(object):
                                    'spelling_per_char': set_spell_errors_per_character[m],
                                    'too_similar_to_prompt': False,
                                    }
-            markup_tokens = e_set._markup_text[m].split(" ")
+            markup_tokens = essay_set._markup_text[m].split(" ")
             #This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
             #disjointed
...
@@ -315,8 +337,8 @@ class FeatureExtractor(object):
                 if z in bad_pos_starts:
                     markup_tokens[z] = '<bg>' + markup_tokens[z]
                 elif z in bad_pos_ends:
-                    markup_tokens[z] = markup_tokens[z] + "</bg>"
-            if (len(bad_pos_ends) > 0 and len(bad_pos_starts) > 0 and len(markup_tokens) > 1):
+                    markup_tokens[z] += "</bg>"
+            if len(bad_pos_ends) > 0 and len(bad_pos_starts) > 0 and len(markup_tokens) > 1:
                 if max(bad_pos_ends) > (len(markup_tokens) - 1) and max(bad_pos_starts) < (len(markup_tokens) - 1):
                     markup_tokens[len(markup_tokens) - 1] += "</bg>"
...
@@ -330,8 +352,8 @@ class FeatureExtractor(object):
             #mean. Requires features to be passed in
             if features is not None:
                 f_row_sum = numpy.sum(features[m, 12:])
-                f_row_prop = f_row_sum / len(e_set._text[m])
-                if f_row_prop < (self._mean_f_prop / 1.5) or len(e_set._text[m]) < 20:
+                f_row_prop = f_row_sum / len(essay_set._text[m])
+                if f_row_prop < (self._mean_f_prop / 1.5) or len(essay_set._text[m]) < 20:
                     individual_feedback['topicality'] = "Topicality: Essay may be off topic."
                 if (features[m, 9] > .6):
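The off-topic heuristic above compares the sum of an essay's bag-of-words columns, normalised by essay length, against the set's mean proportion. A sketch with invented numbers (it treats the columns after index 12 as the vocabulary block, mirroring the diff; mean_f_prop stands in for self._mean_f_prop and is not a value from the commit):

    import numpy

    # One toy feature row with 16 columns; only the columns from index 12 onward matter here.
    features = numpy.array([[5, 3, 1, 0.2, 0.9, 1, 2, 0, 1, 0.7, 0, 1, 4, 2, 0, 1]])
    essay_text = "A short response about cats."
    mean_f_prop = 0.5  # illustrative stand-in for self._mean_f_prop

    f_row_sum = numpy.sum(features[0, 12:])
    f_row_prop = f_row_sum / len(essay_text)
    if f_row_prop < (mean_f_prop / 1.5) or len(essay_text) < 20:
        print("Topicality: Essay may be off topic.")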
...

ease/grade.py (view file @ 8bc96cb8)

...
@@ -72,7 +72,7 @@ def grade(grader_data, submission):
     grader_features = None
     try:
         grader_features = extractor.generate_features(grader_set)
-        feedback = extractor.gen_feedback(grader_set, grader_features)[0]
+        feedback = extractor.generate_feedback(grader_set, grader_features)[0]
         results['score'] = int(model.predict(grader_features)[0])
     except:
         error_message = "Could not extract features and score essay."
...