Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
ease
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
ease
Commits
b118aba5
Commit
b118aba5
authored
Jun 13, 2014
by
gradyward
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Finishing Touches
parent
6cac95c1
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
10 additions
and
86 deletions
+10
-86
ease/create.py
+5
-36
ease/essay_set.py
+2
-7
ease/feature_extractor.py
+3
-42
ease/grade.py
+0
-1
No files found.
ease/create.py
View file @
b118aba5
...
...
@@ -28,7 +28,7 @@ from ease import feature_extractor
from
ease.essay_set
import
EssaySet
def
create
(
examples
,
scores
,
prompt_string
,
dump_data
=
False
):
def
create
(
examples
,
scores
,
prompt_string
):
"""
Creates a machine learning model from basic inputs (essays, associated scores and a prompt) and trains the model.
...
...
@@ -39,9 +39,6 @@ def create(examples, scores, prompt_string, dump_data=False):
scores (list of int): the associated scores that correspond to the essays.
prompt_string (str): the common prompt for all of the example essays.
Kwargs:
dump_data (bool): whether or not the examples and scores should be dumped to a JSON data file
Returns:
(dict): Has the following keys:
'errors' (list of Exception): List of all errors that occurred during training
...
...
@@ -52,11 +49,7 @@ def create(examples, scores, prompt_string, dump_data=False):
'success' (bool): Whether or not the training of the classifier was successful.
"""
# If dump_data is true, then the examples and scores are dumped to a JSON file.
if
dump_data
:
_dump_input_data
(
examples
,
scores
)
# Selects the appropriate ML algorithm to use to train the classifier
# Selects the appropriate ML algorithm to use to train (Classification or Regression)
algorithm
=
_determine_algorithm
(
scores
)
#Initialize a results dictionary to return
...
...
@@ -114,7 +107,7 @@ def _determine_algorithm(score_list):
The ML algorithm used to train the classifier set and feature extractor
"""
#Count the number of unique score
point
s in the score list
#Count the number of unique score
value
s in the score list
if
len
(
set
(
score_list
))
>
5
:
return
util_functions
.
AlgorithmTypes
.
regression
else
:
...
...
@@ -249,33 +242,8 @@ def _get_cv_error(classifier, features, scores):
results
[
'success'
]
=
True
except
ValueError
as
ex
:
# If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
# TODO Figure out why this error would occur in the first place.
# TODO Figure out why this error would occur in the first place.
^^^ THIS IS NOT ACCEPTABLE
msg
=
u"Not enough classes (0,1,etc) in each cross validation fold: {ex}"
.
format
(
ex
=
ex
)
log
.
debug
(
msg
)
return
results
\ No newline at end of file
def _dump_input_data(essays, scores):
    """
    Serializes essays and their scores to a timestamped JSON file.

    Each essay is paired with the score at the same index and stored as an object of the
    form {'text': essay, 'score': score}. The list of these objects is written to a file
    named "test-case-<timestamp>.json" under base_path + "/tests/data/json_data/".

    Args:
        essays (list of str): A list of essays to dump
        scores (list of int): An associated list of scores

    Raises:
        CreateRequestError: if an IOError occurs while writing the file
    """
    dump_directory = base_path + "/tests/data/json_data/"
    # Timestamp is hour, minute, second, day, month, year with no separators
    timestamp = datetime.now().strftime("%H%M%S%d%m%Y")
    dump_name = "test-case-" + timestamp + ".json"

    try:
        # Indexing (rather than zip) keeps the failure mode when scores is shorter than essays
        records = [{'text': essays[idx], 'score': scores[idx]} for idx in xrange(0, len(essays))]
        with open(dump_directory + dump_name, 'w+') as dump_file:
            json.dump(records, dump_file)
    except IOError as ex:
        error = "An IO error occurred while trying to dump JSON data to a file: {ex}".format(ex=ex)
        log.exception(error)
        raise CreateRequestError(error)
ease/essay_set.py
View file @
b118aba5
...
...
@@ -102,22 +102,17 @@ class EssaySet(object):
log
.
exception
(
msg
)
raise
EssaySetRequestError
(
msg
)
# Validates that score is an integer and essay_text is a string.
# Validates that score is an integer and essay_text is a string
and essay_generated is a 0 or a 1
.
try
:
essay_score
=
int
(
essay_score
)
essay_text
=
str
(
essay_text
)
essay_generated
=
int
(
essay_generated
)
bool
(
essay_generated
)
except
TypeError
:
ex
=
"Invalid type for essay score : {0} or essay text : {1}"
.
format
(
type
(
essay_score
),
type
(
essay_text
))
log
.
exception
(
ex
)
raise
EssaySetRequestError
(
ex
)
# Validates that essay generated is 0 or 1
if
essay_generated
!=
0
and
essay_generated
!=
1
:
ex
=
"Invalid value for essay_generated ({}). Value must be 0 or 1."
.
format
(
essay_generated
)
log
.
exception
(
ex
)
raise
EssaySetRequestError
(
ex
)
# Validates to make sure that the essay is at least five characters long.
if
len
(
essay_text
)
<
5
:
essay_text
=
"Invalid essay."
...
...
ease/feature_extractor.py
View file @
b118aba5
...
...
@@ -117,19 +117,19 @@ class FeatureExtractor(object):
Array of features with the following included:
- Length Features
- Vocabulary Features (both Normal and Stemmed Vocabulary)
- Prompt Features
- EDIT: Prompt Features were being ignored (passed in an empty string), so for posterity we are ignoring
them.
"""
try
:
vocabulary_features
=
self
.
_generate_vocabulary_features
(
essay_set
)
length_features
=
self
.
_generate_length_features
(
essay_set
)
prompt_features
=
self
.
_generate_prompt_features
(
essay_set
)
except
Exception
as
ex
:
msg
=
"An unexpected error occurred during feature extraction: {}"
.
format
(
ex
)
log
.
exception
(
msg
)
raise
FeatureExtractionInternalError
(
msg
)
# Lumps them all together, copies to solidify, and returns
overall_features
=
numpy
.
concatenate
((
length_features
,
prompt_features
,
vocabulary_features
),
axis
=
1
)
overall_features
=
numpy
.
concatenate
((
length_features
,
vocabulary_features
),
axis
=
1
)
overall_features
=
overall_features
.
copy
()
return
overall_features
...
...
@@ -183,45 +183,6 @@ class FeatureExtractor(object):
bag_features
=
numpy
.
concatenate
((
stem_features
.
toarray
(),
normal_features
.
toarray
()),
axis
=
1
)
return
bag_features
.
copy
()
def _generate_prompt_features(self, essay_set):
    """
    Generates prompt based features from an essay set object and internal prompt variable.

    Called internally by generate_features

    For every essay in the set, four values are computed:
        - the number of essay tokens that also appear in the tokenized prompt
        - that count divided by the number of tokens in the essay
        - the number of essay tokens that appear among the WordNet synonyms of the prompt tokens
        - that count divided by the number of tokens in the essay

    Args:
        essay_set (EssaySet): an essay set object that is manipulated to generate prompt features

    Returns:
        an array of prompt features, one row per essay and one column per value listed above
    """
    prompt_toks = nltk.word_tokenize(essay_set._prompt)

    # Sets give constant-time membership checks; the counts below are unchanged because
    # only membership (not multiplicity) of a token in the prompt vocabulary matters.
    prompt_vocabulary = set(prompt_toks)
    synonym_vocabulary = set(
        chain.from_iterable(util_functions.get_wordnet_syns(word) for word in prompt_toks)
    )

    prompt_overlap = []
    prompt_overlap_prop = []
    expand_overlap = []
    expand_overlap_prop = []
    for essay_tokens in essay_set._tokens:
        # An empty essay is treated as length 1 so the proportions never divide by zero
        tok_length = len(essay_tokens) or 1

        prompt_hits = sum(1 for token in essay_tokens if token in prompt_vocabulary)
        prompt_overlap.append(prompt_hits)
        prompt_overlap_prop.append(prompt_hits / float(tok_length))

        synonym_hits = sum(1 for token in essay_tokens if token in synonym_vocabulary)
        expand_overlap.append(synonym_hits)
        expand_overlap_prop.append(synonym_hits / float(tok_length))

    # Stack the four per-essay lists as rows, then transpose so each essay becomes a row
    prompt_arr = numpy.array(
        (prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)
    ).transpose()

    return prompt_arr.copy()
def
_get_grammar_errors
(
self
,
pos
,
essays
):
"""
Internal function to get the number of grammar errors in given text
...
...
ease/grade.py
View file @
b118aba5
...
...
@@ -8,7 +8,6 @@ import logging
import
sys
# Append sys to base path to import the following modules
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment