Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
ease
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
ease
Commits
aae59858
Commit
aae59858
authored
Feb 26, 2013
by
Vik Paruchuri
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Generate some documentation
parent
2c7214d8
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
34 additions
and
0 deletions
+34
-0
feature_extractor.py
+18
-0
grade.py
+1
-0
model_creator.py
+15
-0
No files found.
feature_extractor.py
View file @
aae59858
...
@@ -24,6 +24,7 @@ if not base_path.endswith("/"):
...
@@ -24,6 +24,7 @@ if not base_path.endswith("/"):
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
#Paths to needed data files
NGRAM_PATH
=
base_path
+
"data/good_pos_ngrams.p"
NGRAM_PATH
=
base_path
+
"data/good_pos_ngrams.p"
ESSAY_CORPUS_PATH
=
util_functions
.
ESSAY_CORPUS_PATH
ESSAY_CORPUS_PATH
=
util_functions
.
ESSAY_CORPUS_PATH
...
@@ -43,17 +44,26 @@ class FeatureExtractor(object):
...
@@ -43,17 +44,26 @@ class FeatureExtractor(object):
"""
"""
if
(
hasattr
(
e_set
,
'_type'
)):
if
(
hasattr
(
e_set
,
'_type'
)):
if
(
e_set
.
_type
==
"train"
):
if
(
e_set
.
_type
==
"train"
):
#normal text (unstemmed) useful words/bigrams
nvocab
=
util_functions
.
get_vocab
(
e_set
.
_text
,
e_set
.
_score
,
max_feats2
=
max_feats2
)
nvocab
=
util_functions
.
get_vocab
(
e_set
.
_text
,
e_set
.
_score
,
max_feats2
=
max_feats2
)
#stemmed and spell corrected vocab useful words/ngrams
svocab
=
util_functions
.
get_vocab
(
e_set
.
_clean_stem_text
,
e_set
.
_score
,
max_feats2
=
max_feats2
)
svocab
=
util_functions
.
get_vocab
(
e_set
.
_clean_stem_text
,
e_set
.
_score
,
max_feats2
=
max_feats2
)
#dictionary trained on proper vocab
self
.
_normal_dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
vocabulary
=
nvocab
)
self
.
_normal_dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
vocabulary
=
nvocab
)
#dictionary trained on proper vocab
self
.
_stem_dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
vocabulary
=
svocab
)
self
.
_stem_dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
vocabulary
=
svocab
)
self
.
dict_initialized
=
True
self
.
dict_initialized
=
True
#Average spelling errors in set. needed later for spelling detection
self
.
_mean_spelling_errors
=
sum
(
e_set
.
_spelling_errors
)
/
float
(
len
(
e_set
.
_spelling_errors
))
self
.
_mean_spelling_errors
=
sum
(
e_set
.
_spelling_errors
)
/
float
(
len
(
e_set
.
_spelling_errors
))
self
.
_spell_errors_per_character
=
sum
(
e_set
.
_spelling_errors
)
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
]))
self
.
_spell_errors_per_character
=
sum
(
e_set
.
_spelling_errors
)
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
]))
#Gets the number and positions of grammar errors
good_pos_tags
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
e_set
.
_pos
,
e_set
.
_text
,
e_set
.
_tokens
)
good_pos_tags
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
e_set
.
_pos
,
e_set
.
_text
,
e_set
.
_tokens
)
self
.
_grammar_errors_per_character
=
(
sum
(
good_pos_tags
)
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
])))
self
.
_grammar_errors_per_character
=
(
sum
(
good_pos_tags
)
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
])))
#Generate bag of words features
bag_feats
=
self
.
gen_bag_feats
(
e_set
)
bag_feats
=
self
.
gen_bag_feats
(
e_set
)
#Sum of a row of bag of words features (topical words in an essay)
f_row_sum
=
numpy
.
sum
(
bag_feats
[:,:])
f_row_sum
=
numpy
.
sum
(
bag_feats
[:,:])
#Average index of how "topical" essays are
self
.
_mean_f_prop
=
f_row_sum
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
]))
self
.
_mean_f_prop
=
f_row_sum
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
]))
ret
=
"ok"
ret
=
"ok"
else
:
else
:
...
@@ -87,6 +97,9 @@ class FeatureExtractor(object):
...
@@ -87,6 +97,9 @@ class FeatureExtractor(object):
def
_get_grammar_errors
(
self
,
pos
,
text
,
tokens
):
def
_get_grammar_errors
(
self
,
pos
,
text
,
tokens
):
"""
"""
Internal function to get the number of grammar errors in given text
Internal function to get the number of grammar errors in given text
pos - part of speech tagged text (list)
text - normal text (list)
tokens - list of lists of tokenized text
"""
"""
word_counts
=
[
max
(
len
(
t
),
1
)
for
t
in
tokens
]
word_counts
=
[
max
(
len
(
t
),
1
)
for
t
in
tokens
]
good_pos_tags
=
[]
good_pos_tags
=
[]
...
@@ -123,6 +136,7 @@ class FeatureExtractor(object):
...
@@ -123,6 +136,7 @@ class FeatureExtractor(object):
Generates length based features from an essay set
Generates length based features from an essay set
Generally an internal function called by gen_feats
Generally an internal function called by gen_feats
Returns an array of length features
Returns an array of length features
e_set - EssaySet object
"""
"""
text
=
e_set
.
_text
text
=
e_set
.
_text
lengths
=
[
len
(
e
)
for
e
in
text
]
lengths
=
[
len
(
e
)
for
e
in
text
]
...
@@ -146,6 +160,7 @@ class FeatureExtractor(object):
...
@@ -146,6 +160,7 @@ class FeatureExtractor(object):
Generates bag of words features from an input essay set and trained FeatureExtractor
Generates bag of words features from an input essay set and trained FeatureExtractor
Generally called by gen_feats
Generally called by gen_feats
Returns an array of features
Returns an array of features
e_set - EssaySet object
"""
"""
if
(
hasattr
(
self
,
'_stem_dict'
)):
if
(
hasattr
(
self
,
'_stem_dict'
)):
sfeats
=
self
.
_stem_dict
.
transform
(
e_set
.
_clean_stem_text
)
sfeats
=
self
.
_stem_dict
.
transform
(
e_set
.
_clean_stem_text
)
...
@@ -159,6 +174,7 @@ class FeatureExtractor(object):
...
@@ -159,6 +174,7 @@ class FeatureExtractor(object):
"""
"""
Generates bag of words, length, and prompt features from an essay set object
Generates bag of words, length, and prompt features from an essay set object
returns an array of features
returns an array of features
e_set - EssaySet object
"""
"""
bag_feats
=
self
.
gen_bag_feats
(
e_set
)
bag_feats
=
self
.
gen_bag_feats
(
e_set
)
length_feats
=
self
.
gen_length_feats
(
e_set
)
length_feats
=
self
.
gen_length_feats
(
e_set
)
...
@@ -173,6 +189,7 @@ class FeatureExtractor(object):
...
@@ -173,6 +189,7 @@ class FeatureExtractor(object):
Generates prompt based features from an essay set object and internal prompt variable.
Generates prompt based features from an essay set object and internal prompt variable.
Generally called internally by gen_feats
Generally called internally by gen_feats
Returns an array of prompt features
Returns an array of prompt features
e_set - EssaySet object
"""
"""
prompt_toks
=
nltk
.
word_tokenize
(
e_set
.
_prompt
)
prompt_toks
=
nltk
.
word_tokenize
(
e_set
.
_prompt
)
expand_syns
=
[]
expand_syns
=
[]
...
@@ -208,6 +225,7 @@ class FeatureExtractor(object):
...
@@ -208,6 +225,7 @@ class FeatureExtractor(object):
features - optionally, pass in a matrix of features extracted from e_set using FeatureExtractor
features - optionally, pass in a matrix of features extracted from e_set using FeatureExtractor
in order to get off topic feedback.
in order to get off topic feedback.
Returns a list of lists (one list per essay in e_set)
Returns a list of lists (one list per essay in e_set)
e_set - EssaySet object
"""
"""
#Set ratio to modify thresholds for grammar/spelling errors
#Set ratio to modify thresholds for grammar/spelling errors
...
...
grade.py
View file @
aae59858
...
@@ -174,6 +174,7 @@ def get_confidence_value(algorithm,model,grader_feats,score):
...
@@ -174,6 +174,7 @@ def get_confidence_value(algorithm,model,grader_feats,score):
min_score=min(numpy.asarray(score))
min_score=min(numpy.asarray(score))
max_score=max(numpy.asarray(score))
max_score=max(numpy.asarray(score))
if algorithm == util_functions.AlgorithmTypes.classification:
if algorithm == util_functions.AlgorithmTypes.classification:
#If classification, predict with probability, which gives you a matrix of confidences per score point
raw_confidence=model.predict_proba(grader_feats)[0,(score-min_score)]
raw_confidence=model.predict_proba(grader_feats)[0,(score-min_score)]
#TODO: Normalize confidence somehow here
#TODO: Normalize confidence somehow here
confidence=raw_confidence
confidence=raw_confidence
...
...
model_creator.py
View file @
aae59858
...
@@ -87,6 +87,12 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
...
@@ -87,6 +87,12 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return
x
return
x
def
get_cv_error
(
clf
,
feats
,
scores
):
def
get_cv_error
(
clf
,
feats
,
scores
):
"""
Gets cross validated error for a given classifier, set of features, and scores
clf - classifier
feats - features to feed into the classifier and cross validate over
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
"""
results
=
{
'success'
:
False
,
'kappa'
:
0
,
'mae'
:
0
}
results
=
{
'success'
:
False
,
'kappa'
:
0
,
'mae'
:
0
}
try
:
try
:
cv_preds
=
util_functions
.
gen_cv_preds
(
clf
,
feats
,
scores
)
cv_preds
=
util_functions
.
gen_cv_preds
(
clf
,
feats
,
scores
)
...
@@ -104,6 +110,10 @@ def get_cv_error(clf,feats,scores):
...
@@ -104,6 +110,10 @@ def get_cv_error(clf,feats,scores):
return
results
return
results
def
get_algorithms
(
type
):
def
get_algorithms
(
type
):
"""
Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
type - one of util_functions.AlgorithmTypes
"""
if
type
==
util_functions
.
AlgorithmTypes
.
classification
:
if
type
==
util_functions
.
AlgorithmTypes
.
classification
:
clf
=
sklearn
.
ensemble
.
GradientBoostingClassifier
(
n_estimators
=
100
,
learn_rate
=.
05
,
clf
=
sklearn
.
ensemble
.
GradientBoostingClassifier
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
...
@@ -118,6 +128,11 @@ def get_algorithms(type):
...
@@ -118,6 +128,11 @@ def get_algorithms(type):
def
extract_features_and_generate_model_predictors
(
predictor_set
,
type
=
util_functions
.
AlgorithmTypes
.
regression
):
def
extract_features_and_generate_model_predictors
(
predictor_set
,
type
=
util_functions
.
AlgorithmTypes
.
regression
):
"""
Extracts features and generates predictors based on a given predictor set
predictor_set - a PredictorSet object that has been initialized with data
type - one of util_functions.AlgorithmTypes
"""
if
(
algorithm
not
in
[
util_functions
.
AlgorithmTypes
.
regression
,
util_functions
.
AlgorithmTypes
.
classification
]):
if
(
algorithm
not
in
[
util_functions
.
AlgorithmTypes
.
regression
,
util_functions
.
AlgorithmTypes
.
classification
]):
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment