Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
ease
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
ease
Commits
b32d5674
Commit
b32d5674
authored
Jun 12, 2014
by
gradyward
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Changed the predictor_extractor.py file, adding clarity and integrating previous changes.
parent
8bc96cb8
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
51 additions
and
45 deletions
+51
-45
ease/feature_extractor.py
+10
-10
ease/grade.py
+0
-4
ease/model_creator.py
+2
-3
ease/predictor_extractor.py
+39
-28
No files found.
ease/feature_extractor.py
View file @
b32d5674
...
@@ -299,10 +299,10 @@ class FeatureExtractor(object):
...
@@ -299,10 +299,10 @@ class FeatureExtractor(object):
"""
"""
#TODO This is still bad.
#TODO This is still bad.
#Set ratio to modify thresholds for grammar/spelling errors
#
Set ratio to modify thresholds for grammar/spelling errors
modifier_ratio
=
1.05
modifier_ratio
=
1.05
#GBW TODO: This might be wrong.
#
GBW TODO: This might be wrong.
#Calc number of grammar and spelling errors per character
#
Calc number of grammar and spelling errors per character
set_grammar
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
essay_set
.
_pos
,
essay_set
.
_text
,
essay_set
.
_tokens
)
set_grammar
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
essay_set
.
_pos
,
essay_set
.
_text
,
essay_set
.
_tokens
)
set_grammar_per_character
=
[
set_grammar_per_character
=
[
set_grammar
[
m
]
/
float
(
set_grammar
[
m
]
/
float
(
...
@@ -316,7 +316,7 @@ class FeatureExtractor(object):
...
@@ -316,7 +316,7 @@ class FeatureExtractor(object):
)
)
]
]
#Iterate through essays and create a feedback dictionary for each
#
Iterate through essays and create a feedback dictionary for each
all_feedback
=
[]
all_feedback
=
[]
for
m
in
xrange
(
0
,
len
(
essay_set
.
_text
)):
for
m
in
xrange
(
0
,
len
(
essay_set
.
_text
)):
#Be very careful about changing these messages!
#Be very careful about changing these messages!
...
@@ -329,8 +329,8 @@ class FeatureExtractor(object):
...
@@ -329,8 +329,8 @@ class FeatureExtractor(object):
}
}
markup_tokens
=
essay_set
.
_markup_text
[
m
]
.
split
(
" "
)
markup_tokens
=
essay_set
.
_markup_text
[
m
]
.
split
(
" "
)
#This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
#
This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
#disjointed
#
disjointed
bad_pos_starts
=
[
z
[
0
]
for
z
in
bad_pos_positions
[
m
]]
bad_pos_starts
=
[
z
[
0
]
for
z
in
bad_pos_positions
[
m
]]
bad_pos_ends
=
[
z
[
1
]
-
1
for
z
in
bad_pos_positions
[
m
]]
bad_pos_ends
=
[
z
[
1
]
-
1
for
z
in
bad_pos_positions
[
m
]]
for
z
in
xrange
(
0
,
len
(
markup_tokens
)):
for
z
in
xrange
(
0
,
len
(
markup_tokens
)):
...
@@ -342,14 +342,14 @@ class FeatureExtractor(object):
...
@@ -342,14 +342,14 @@ class FeatureExtractor(object):
if
max
(
bad_pos_ends
)
>
(
len
(
markup_tokens
)
-
1
)
and
max
(
bad_pos_starts
)
<
(
len
(
markup_tokens
)
-
1
):
if
max
(
bad_pos_ends
)
>
(
len
(
markup_tokens
)
-
1
)
and
max
(
bad_pos_starts
)
<
(
len
(
markup_tokens
)
-
1
):
markup_tokens
[
len
(
markup_tokens
)
-
1
]
+=
"</bg>"
markup_tokens
[
len
(
markup_tokens
)
-
1
]
+=
"</bg>"
#Display messages if grammar/spelling errors greater than average in training set
#
Display messages if grammar/spelling errors greater than average in training set
if
set_grammar_per_character
[
m
]
>
(
self
.
_grammar_errors_per_character
*
modifier_ratio
):
if
set_grammar_per_character
[
m
]
>
(
self
.
_grammar_errors_per_character
*
modifier_ratio
):
individual_feedback
[
'grammar'
]
=
"Grammar: More grammar errors than average."
individual_feedback
[
'grammar'
]
=
"Grammar: More grammar errors than average."
if
set_spell_errors_per_character
[
m
]
>
(
self
.
_spell_errors_per_character
*
modifier_ratio
):
if
set_spell_errors_per_character
[
m
]
>
(
self
.
_spell_errors_per_character
*
modifier_ratio
):
individual_feedback
[
'spelling'
]
=
"Spelling: More spelling errors than average."
individual_feedback
[
'spelling'
]
=
"Spelling: More spelling errors than average."
#Test topicality by calculating # of on topic words per character and comparing to the training set
#
Test topicality by calculating # of on topic words per character and comparing to the training set
#mean. Requires features to be passed in
#
mean. Requires features to be passed in
if
features
is
not
None
:
if
features
is
not
None
:
f_row_sum
=
numpy
.
sum
(
features
[
m
,
12
:])
f_row_sum
=
numpy
.
sum
(
features
[
m
,
12
:])
f_row_prop
=
f_row_sum
/
len
(
essay_set
.
_text
[
m
])
f_row_prop
=
f_row_sum
/
len
(
essay_set
.
_text
[
m
])
...
@@ -361,7 +361,7 @@ class FeatureExtractor(object):
...
@@ -361,7 +361,7 @@ class FeatureExtractor(object):
individual_feedback
[
'too_similar_to_prompt'
]
=
True
individual_feedback
[
'too_similar_to_prompt'
]
=
True
log
.
debug
(
features
[
m
,
9
])
log
.
debug
(
features
[
m
,
9
])
#Create string representation of markup text
#
Create string representation of markup text
markup_string
=
" "
.
join
(
markup_tokens
)
markup_string
=
" "
.
join
(
markup_tokens
)
individual_feedback
[
'markup_text'
]
=
markup_string
individual_feedback
[
'markup_text'
]
=
markup_string
all_feedback
.
append
(
individual_feedback
)
all_feedback
.
append
(
individual_feedback
)
...
...
ease/grade.py
View file @
b32d5674
...
@@ -3,7 +3,6 @@ Functions to score specified data using specified ML models
...
@@ -3,7 +3,6 @@ Functions to score specified data using specified ML models
"""
"""
import
sys
import
sys
import
pickle
import
os
import
os
import
numpy
import
numpy
import
logging
import
logging
...
@@ -14,14 +13,11 @@ sys.path.append(base_path)
...
@@ -14,14 +13,11 @@ sys.path.append(base_path)
#Depend on base path to be imported
#Depend on base path to be imported
from
essay_set
import
EssaySet
from
essay_set
import
EssaySet
import
predictor_extractor
import
predictor_set
import
predictor_set
import
util_functions
import
util_functions
from
errors
import
*
from
errors
import
*
#Imports needed to unpickle grader data
#Imports needed to unpickle grader data
import
feature_extractor
import
sklearn.ensemble
import
math
import
math
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
...
...
ease/model_creator.py
View file @
b32d5674
...
@@ -142,10 +142,9 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
...
@@ -142,10 +142,9 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
if
(
algorithm
not
in
[
util_functions
.
AlgorithmTypes
.
regression
,
util_functions
.
AlgorithmTypes
.
classification
]):
if
(
algorithm
not
in
[
util_functions
.
AlgorithmTypes
.
regression
,
util_functions
.
AlgorithmTypes
.
classification
]):
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
f
=
predictor_extractor
.
PredictorExtractor
()
f
=
predictor_extractor
.
PredictorExtractor
(
predictor_set
)
f
.
initialize_dictionaries
(
predictor_set
)
train_feats
=
f
.
gen
_feat
s
(
predictor_set
)
train_feats
=
f
.
gen
erate_feature
s
(
predictor_set
)
clf
,
clf2
=
get_algorithms
(
algorithm
)
clf
,
clf2
=
get_algorithms
(
algorithm
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
predictor_set
.
_target
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
predictor_set
.
_target
)
...
...
ease/predictor_extractor.py
View file @
b32d5674
...
@@ -29,58 +29,69 @@ log = logging.getLogger(__name__)
...
@@ -29,58 +29,69 @@ log = logging.getLogger(__name__)
class
PredictorExtractor
(
object
):
class
PredictorExtractor
(
object
):
def
__init__
(
self
):
"""
self
.
_extractors
=
[]
Provides an interface for extracting features from a predictor set (as opposed to an essay set), and uses the
self
.
_initialized
=
False
methods of the essay set feature extractor in order to maintain cohesion between the two different methods.
"""
def
initialize_dictionaries
(
self
,
p
_set
):
def
__init__
(
self
,
predictor
_set
):
"""
"""
Initialize dictionaries with the textual inputs in the PredictorSet object
Initializes dictionaries with the textual inputs in the PredictorSet object
p_set - PredictorSet object that has had data fed in
Uses a predictor_set in the definition of the PredictorExtractor to train the extractor.
Args:
predictor_set (PredictorSet): PredictorSet object that has had data fed to it
"""
"""
success
=
False
if
not
(
hasattr
(
p_set
,
'_type'
)):
if
not
(
hasattr
(
p
redictor
_set
,
'_type'
)):
error_message
=
"needs to be an essay set of the train type."
error_message
=
"needs to be an essay set of the train type."
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
raise
util_functions
.
InputError
(
p
redictor
_set
,
error_message
)
if
not
(
p_set
.
_type
==
"train"
):
if
not
(
p
redictor
_set
.
_type
==
"train"
):
error_message
=
"needs to be an essay set of the train type."
error_message
=
"needs to be an essay set of the train type."
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
raise
util_functions
.
InputError
(
p
redictor
_set
,
error_message
)
div_length
=
len
(
p_set
.
_essay_sets
)
div_length
=
len
(
p
redictor
_set
.
_essay_sets
)
if
div_length
==
0
:
if
div_length
==
0
:
div_length
=
1
div_length
=
1
#Ensures that even with a large amount of input textual features, training time stays reasonable
self
.
_extractors
=
[]
max_feats2
=
int
(
math
.
floor
(
200
/
div_length
))
# Ensures that even with a large amount of input textual features, training time will stay reasonable
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
max_features_pass_2
=
int
(
math
.
floor
(
200
/
div_length
))
self
.
_extractors
.
append
(
FeatureExtractor
())
for
i
in
xrange
(
0
,
len
(
predictor_set
.
_essay_sets
)):
self
.
_extractors
[
i
]
.
initialize_dictionaries
(
p_set
.
_essay_sets
[
i
],
max_features_pass_2
=
max_feats2
)
self
.
_extractors
.
append
(
FeatureExtractor
(
predictor_set
.
_essay_sets
[
i
])
)
self
.
_initialized
=
True
self
.
_initialized
=
True
success
=
True
return
success
def
gen
_feats
(
self
,
p
_set
):
def
gen
erate_features
(
self
,
predictor
_set
):
"""
"""
Generates features based on an iput p_set
Generates features given a predictor set containing the essays/data we want to extract from
p_set - PredictorSet
Args:
predictor_set (PredictorSet): the wrapper which contains the prediction data we want to extract from
Returns:
an array of features
"""
"""
if
self
.
_initialized
!=
True
:
if
self
.
_initialized
!=
True
:
error_message
=
"Dictionaries have not been initialized."
error_message
=
"Dictionaries have not been initialized."
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
raise
util_functions
.
InputError
(
p
redictor
_set
,
error_message
)
textual_features
=
[]
textual_features
=
[]
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
# Generates features by using the generate_features method from the essay set class
textual_features
.
append
(
self
.
_extractors
[
i
]
.
generate_features
(
p_set
.
_essay_sets
[
i
]))
for
i
in
xrange
(
0
,
len
(
predictor_set
.
_essay_sets
)):
textual_features
.
append
(
self
.
_extractors
[
i
]
.
generate_features
(
predictor_set
.
_essay_sets
[
i
])
)
textual_matrix
=
numpy
.
concatenate
(
textual_features
,
axis
=
1
)
textual_matrix
=
numpy
.
concatenate
(
textual_features
,
axis
=
1
)
predictor_matrix
=
numpy
.
array
(
p_set
.
_numeric_features
)
predictor_matrix
=
numpy
.
array
(
p
redictor
_set
.
_numeric_features
)
print
textual_matrix
.
shape
# Originally there were two calls here to print the shape of the feature matricies. GBW didn't think this was
print
predictor_matrix
.
shape
# appropriate, and deleted them.
overall_matrix
=
numpy
.
concatenate
((
textual_matrix
,
predictor_matrix
),
axis
=
1
)
overall_matrix
=
numpy
.
concatenate
((
textual_matrix
,
predictor_matrix
),
axis
=
1
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment