Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
ease
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
ease
Commits
b32d5674
Commit
b32d5674
authored
Jun 12, 2014
by
gradyward
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Changed the predictor_extractor.py file, adding clarity and integrating previous changes.
parent
8bc96cb8
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
51 additions
and
45 deletions
+51
-45
ease/feature_extractor.py
+10
-10
ease/grade.py
+0
-4
ease/model_creator.py
+2
-3
ease/predictor_extractor.py
+39
-28
No files found.
ease/feature_extractor.py
View file @
b32d5674
...
...
@@ -299,10 +299,10 @@ class FeatureExtractor(object):
"""
#TODO This is still bad.
#Set ratio to modify thresholds for grammar/spelling errors
#
Set ratio to modify thresholds for grammar/spelling errors
modifier_ratio
=
1.05
#GBW TODO: This might be wrong.
#Calc number of grammar and spelling errors per character
#
GBW TODO: This might be wrong.
#
Calc number of grammar and spelling errors per character
set_grammar
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
essay_set
.
_pos
,
essay_set
.
_text
,
essay_set
.
_tokens
)
set_grammar_per_character
=
[
set_grammar
[
m
]
/
float
(
...
...
@@ -316,7 +316,7 @@ class FeatureExtractor(object):
)
]
#Iterate through essays and create a feedback dictionary for each
#
Iterate through essays and create a feedback dictionary for each
all_feedback
=
[]
for
m
in
xrange
(
0
,
len
(
essay_set
.
_text
)):
#Be very careful about changing these messages!
...
...
@@ -329,8 +329,8 @@ class FeatureExtractor(object):
}
markup_tokens
=
essay_set
.
_markup_text
[
m
]
.
split
(
" "
)
#This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
#disjointed
#
This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
#
disjointed
bad_pos_starts
=
[
z
[
0
]
for
z
in
bad_pos_positions
[
m
]]
bad_pos_ends
=
[
z
[
1
]
-
1
for
z
in
bad_pos_positions
[
m
]]
for
z
in
xrange
(
0
,
len
(
markup_tokens
)):
...
...
@@ -342,14 +342,14 @@ class FeatureExtractor(object):
if
max
(
bad_pos_ends
)
>
(
len
(
markup_tokens
)
-
1
)
and
max
(
bad_pos_starts
)
<
(
len
(
markup_tokens
)
-
1
):
markup_tokens
[
len
(
markup_tokens
)
-
1
]
+=
"</bg>"
#Display messages if grammar/spelling errors greater than average in training set
#
Display messages if grammar/spelling errors greater than average in training set
if
set_grammar_per_character
[
m
]
>
(
self
.
_grammar_errors_per_character
*
modifier_ratio
):
individual_feedback
[
'grammar'
]
=
"Grammar: More grammar errors than average."
if
set_spell_errors_per_character
[
m
]
>
(
self
.
_spell_errors_per_character
*
modifier_ratio
):
individual_feedback
[
'spelling'
]
=
"Spelling: More spelling errors than average."
#Test topicality by calculating # of on topic words per character and comparing to the training set
#mean. Requires features to be passed in
#
Test topicality by calculating # of on topic words per character and comparing to the training set
#
mean. Requires features to be passed in
if
features
is
not
None
:
f_row_sum
=
numpy
.
sum
(
features
[
m
,
12
:])
f_row_prop
=
f_row_sum
/
len
(
essay_set
.
_text
[
m
])
...
...
@@ -361,7 +361,7 @@ class FeatureExtractor(object):
individual_feedback
[
'too_similar_to_prompt'
]
=
True
log
.
debug
(
features
[
m
,
9
])
#Create string representation of markup text
#
Create string representation of markup text
markup_string
=
" "
.
join
(
markup_tokens
)
individual_feedback
[
'markup_text'
]
=
markup_string
all_feedback
.
append
(
individual_feedback
)
...
...
ease/grade.py
View file @
b32d5674
...
...
@@ -3,7 +3,6 @@ Functions to score specified data using specified ML models
"""
import
sys
import
pickle
import
os
import
numpy
import
logging
...
...
@@ -14,14 +13,11 @@ sys.path.append(base_path)
#Depend on base path to be imported
from
essay_set
import
EssaySet
import
predictor_extractor
import
predictor_set
import
util_functions
from
errors
import
*
#Imports needed to unpickle grader data
import
feature_extractor
import
sklearn.ensemble
import
math
log
=
logging
.
getLogger
(
__name__
)
...
...
ease/model_creator.py
View file @
b32d5674
...
...
@@ -142,10 +142,9 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
if
(
algorithm
not
in
[
util_functions
.
AlgorithmTypes
.
regression
,
util_functions
.
AlgorithmTypes
.
classification
]):
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
f
=
predictor_extractor
.
PredictorExtractor
()
f
.
initialize_dictionaries
(
predictor_set
)
f
=
predictor_extractor
.
PredictorExtractor
(
predictor_set
)
train_feats
=
f
.
gen
_feat
s
(
predictor_set
)
train_feats
=
f
.
gen
erate_feature
s
(
predictor_set
)
clf
,
clf2
=
get_algorithms
(
algorithm
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
predictor_set
.
_target
)
...
...
ease/predictor_extractor.py
View file @
b32d5674
...
...
@@ -29,58 +29,69 @@ log = logging.getLogger(__name__)
class
PredictorExtractor
(
object
):
def
__init__
(
self
):
self
.
_extractors
=
[]
self
.
_initialized
=
False
"""
Provides an interface for extracting features from a predictor set (as opposed to an essay set), and uses the
methods of the essay set feature extractor in order to maintain cohesion between the two different methods.
"""
def
initialize_dictionaries
(
self
,
p
_set
):
def
__init__
(
self
,
predictor
_set
):
"""
Initialize dictionaries with the textual inputs in the PredictorSet object
p_set - PredictorSet object that has had data fed in
Initializes dictionaries with the textual inputs in the PredictorSet object
Uses a predictor_set in the definition of the PredictorExtractor to train the extractor.
Args:
predictor_set (PredictorSet): PredictorSet object that has had data fed to it
"""
success
=
False
if
not
(
hasattr
(
p_set
,
'_type'
)):
if
not
(
hasattr
(
p
redictor
_set
,
'_type'
)):
error_message
=
"needs to be an essay set of the train type."
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
raise
util_functions
.
InputError
(
p
redictor
_set
,
error_message
)
if
not
(
p_set
.
_type
==
"train"
):
if
not
(
p
redictor
_set
.
_type
==
"train"
):
error_message
=
"needs to be an essay set of the train type."
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
raise
util_functions
.
InputError
(
p
redictor
_set
,
error_message
)
div_length
=
len
(
p_set
.
_essay_sets
)
div_length
=
len
(
p
redictor
_set
.
_essay_sets
)
if
div_length
==
0
:
div_length
=
1
#Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2
=
int
(
math
.
floor
(
200
/
div_length
))
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
self
.
_extractors
.
append
(
FeatureExtractor
())
self
.
_extractors
[
i
]
.
initialize_dictionaries
(
p_set
.
_essay_sets
[
i
],
max_features_pass_2
=
max_feats2
)
self
.
_extractors
=
[]
# Ensures that even with a large amount of input textual features, training time will stay reasonable
max_features_pass_2
=
int
(
math
.
floor
(
200
/
div_length
))
for
i
in
xrange
(
0
,
len
(
predictor_set
.
_essay_sets
)):
self
.
_extractors
.
append
(
FeatureExtractor
(
predictor_set
.
_essay_sets
[
i
])
)
self
.
_initialized
=
True
success
=
True
return
success
def
gen
_feats
(
self
,
p
_set
):
def
gen
erate_features
(
self
,
predictor
_set
):
"""
Generates features based on an input p_set
p_set - PredictorSet
Generates features given a predictor set containing the essays/data we want to extract from
Args:
predictor_set (PredictorSet): the wrapper which contains the prediction data we want to extract from
Returns:
an array of features
"""
if
self
.
_initialized
!=
True
:
error_message
=
"Dictionaries have not been initialized."
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
raise
util_functions
.
InputError
(
p
redictor
_set
,
error_message
)
textual_features
=
[]
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
textual_features
.
append
(
self
.
_extractors
[
i
]
.
generate_features
(
p_set
.
_essay_sets
[
i
]))
# Generates features by using the generate_features method from the essay set class
for
i
in
xrange
(
0
,
len
(
predictor_set
.
_essay_sets
)):
textual_features
.
append
(
self
.
_extractors
[
i
]
.
generate_features
(
predictor_set
.
_essay_sets
[
i
])
)
textual_matrix
=
numpy
.
concatenate
(
textual_features
,
axis
=
1
)
predictor_matrix
=
numpy
.
array
(
p_set
.
_numeric_features
)
predictor_matrix
=
numpy
.
array
(
p
redictor
_set
.
_numeric_features
)
print
textual_matrix
.
shape
print
predictor_matrix
.
shape
# Originally there were two calls here to print the shape of the feature matrices. GBW didn't think this was
# appropriate, and deleted them.
overall_matrix
=
numpy
.
concatenate
((
textual_matrix
,
predictor_matrix
),
axis
=
1
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment