Commit 09b8c904 by Vik Paruchuri

adding documentation

parent ead03fa6
Project to integrate machine-learning-based essay scoring with xserver. Aspell must be installed and added to the path to run. numpy, scipy, sklearn, and nltk also need to be installed.
Runnable files:
......
...@@ -4,14 +4,13 @@ import os ...@@ -4,14 +4,13 @@ import os
import sys import sys
import argparse import argparse
base_path = os.path.dirname( __file__ ) base_path = os.path.dirname(__file__)
sys.path.append(base_path) sys.path.append(base_path)
import model_creator import model_creator
def main(argv): def main(argv):
parser = argparse.ArgumentParser(description="Generate model from test data files") parser = argparse.ArgumentParser(description="Generate model from test data files")
parser.add_argument('train_file') parser.add_argument('train_file')
parser.add_argument('prompt_file') parser.add_argument('prompt_file')
...@@ -19,11 +18,11 @@ def main(argv): ...@@ -19,11 +18,11 @@ def main(argv):
args = parser.parse_args(argv) args = parser.parse_args(argv)
score,text=model_creator.read_in_test_data(args.train_file) score, text = model_creator.read_in_test_data(args.train_file)
prompt_string=model_creator.read_in_test_prompt(args.prompt_file) prompt_string = model_creator.read_in_test_prompt(args.prompt_file)
e_set=model_creator.create_essay_set(text,score,prompt_string) e_set = model_creator.create_essay_set(text, score, prompt_string)
feature_ext,classifier=model_creator.extract_features_and_generate_model(e_set) feature_ext, classifier = model_creator.extract_features_and_generate_model(e_set)
model_creator.dump_model_to_file(prompt_string,feature_ext,classifier,args.model_path) model_creator.dump_model_to_file(prompt_string, feature_ext, classifier, args.model_path)
if __name__=="__main__": if __name__ == "__main__":
main(sys.argv[1:]) main(sys.argv[1:])
...@@ -13,75 +13,94 @@ import util_functions ...@@ -13,75 +13,94 @@ import util_functions
class essay_set:
    """
    Holds essays together with their scores and the derived text forms of each essay.
    Every essay occupies the same index in the parallel lists _id, _score, _text, _clean_text,
    _tokens, _pos, _clean_stem_text and _generated.
    """

    def __init__(self, type="train"):
        """
        Initialize variables and check essay set type.
        type should be "train" or "test". Any other value falls back to "train".
        """
        if type != "train" and type != "test":
            type = "train"

        self._type = type
        self._score, self._text, self._id, self._clean_text = [], [], [], []
        self._tokens, self._pos, self._clean_stem_text, self._generated = [], [], [], []
        self._prompt = ""

    def add_essay(self, essay_text, essay_score, essay_generated=0):
        """
        Add new (essay_text,essay_score) pair to the essay set.
        essay_text must be a string.
        essay_score must be an int.
        essay_generated should not be changed by the user.
        Returns a confirmation that essay was added.
        Raises util_functions.InputError if the arguments have the wrong type or value.
        """
        #Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
        #bool is excluded explicitly because it is a subclass of int
        score_ok = isinstance(essay_score, int) and not isinstance(essay_score, bool)
        if not (score_ok and isinstance(essay_text, str) and essay_generated in (0, 1)):
            raise util_functions.InputError(essay_text, "arguments need to be in format "
                                                        "(text,score). text needs to be string,"
                                                        " score needs to be int.")

        #Get maximum current essay id, or set to 0 if this is the first essay added
        max_id = max(self._id) if self._id else 0

        #Derive every form of the text before touching the lists, so that a failure in
        #any step cannot leave the parallel lists with different lengths
        #Clean text by removing non digit/word/punctuation characters
        text = util_functions.sub_chars(essay_text).lower()
        #Spell correct text using aspell
        clean_text = util_functions.spell_correct(text)
        #Tokenize text
        tokens = nltk.word_tokenize(clean_text)
        #Part of speech tag text
        pos = nltk.pos_tag(tokens)
        #Stem spell corrected text
        porter = nltk.PorterStemmer()
        por_toks = " ".join([porter.stem(w) for w in tokens])

        self._id.append(max_id + 1)
        self._score.append(essay_score)
        self._text.append(text)
        self._clean_text.append(clean_text)
        self._tokens.append(tokens)
        self._pos.append(pos)
        self._generated.append(essay_generated)
        self._clean_stem_text.append(por_toks)

        return "text: " + text + " score: " + str(essay_score)

    def update_prompt(self, prompt_text):
        """
        Update the default prompt string, which is "".
        prompt_text should be a string.
        Returns the prompt as a confirmation.
        Raises util_functions.InputError if prompt_text is not a string.
        """
        if not isinstance(prompt_text, str):
            raise util_functions.InputError(prompt_text, "Invalid prompt. Need to enter a string value.")

        self._prompt = util_functions.sub_chars(prompt_text)
        return self._prompt

    def generate_additional_essays(self, e_text, e_score, dict=None, max_syns=3):
        """
        Substitute synonyms to generate extra essays from existing ones.
        This is done to increase the amount of training data.
        Should only be used with lowest scoring essays.
        e_text is the text of the original essay.
        e_score is the score of the original essay.
        dict is a fixed dictionary (list) of words to replace.
        max_syns defines the maximum number of additional essays to generate. Do not set too high.
        The generated essays are added to this set with essay_generated=1. Nothing is returned.
        """
        #Fixed seed so that the sampled synonyms are the same on every run
        random.seed(1)
        e_toks = nltk.word_tokenize(e_text)
        all_syns = []
        for word in e_toks:
            synonyms = util_functions.get_wordnet_syns(word)
            if len(synonyms) > max_syns:
                synonyms = random.sample(synonyms, max_syns)
            all_syns.append(synonyms)

        new_essays = []
        for i in range(max_syns):
            #Work on a copy: substituting into e_toks itself would carry the replacements of
            #one variant into the next and make the dict check test already replaced words
            syn_toks = list(e_toks)
            for z, word in enumerate(e_toks):
                if len(all_syns[z]) > i and (dict is None or word in dict):
                    syn_toks[z] = all_syns[z][i]
            new_essays.append(" ".join(syn_toks))

        for new_essay in new_essays:
            self.add_essay(new_essay, e_score, 1)
\ No newline at end of file \ No newline at end of file
...@@ -16,92 +16,93 @@ import util_functions ...@@ -16,92 +16,93 @@ import util_functions
class feature_extractor:
    """
    Turns an essay set into numeric feature arrays: length and punctuation counts,
    overlap with the prompt, and bag of words counts.
    """

    def __init__(self):
        """
        Load (or regenerate) the good part of speech ngrams. Dictionaries start uninitialized.
        """
        self._good_pos_ngrams = self.get_good_pos_ngrams()
        self.dict_initialized = False

    @staticmethod
    def _ratio(count, total):
        """
        Return count / total as a float, or 0.0 when total is zero (an essay with no tokens).
        """
        return count / float(total) if total else 0.0

    def initialize_dictionaries(self, e_set):
        """
        Build the normal and stemmed count vectorizers from a training essay set.
        e_set must be an essay set object of the train type.
        Returns "ok" on success.
        Raises util_functions.InputError if e_set has no _type attribute or is not of the train type.
        """
        if not hasattr(e_set, '_type'):
            raise util_functions.InputError(e_set, "wrong input. need an essay set object")
        if e_set._type != "train":
            raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")

        nvocab = util_functions.get_vocab(e_set._text, e_set._score)
        svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score)
        self._normal_dict = CountVectorizer(min_n=1, max_n=2, vocabulary=nvocab)
        self._stem_dict = CountVectorizer(min_n=1, max_n=2, vocabulary=svocab)
        self.dict_initialized = True
        return "ok"

    def get_good_pos_ngrams(self):
        """
        Return the good part of speech ngrams.
        They are read from the pickle cache good_pos_ngrams.p if that file exists. Otherwise they
        are regenerated from essaycorpus.txt and written to the cache.
        Both paths are relative to the current working directory.
        """
        if os.path.isfile("good_pos_ngrams.p"):
            with open('good_pos_ngrams.p', 'rb') as cache_file:
                return pickle.load(cache_file)

        with open("essaycorpus.txt") as corpus_file:
            essay_corpus = corpus_file.read()
        essay_corpus = util_functions.sub_chars(essay_corpus)
        good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
        with open('good_pos_ngrams.p', 'wb') as cache_file:
            pickle.dump(good_pos_ngrams, cache_file)
        return good_pos_ngrams

    def gen_length_feats(self, e_set):
        """
        Generate length based features for each essay in e_set.
        Returns an array with one row per essay and the columns: character count, token count,
        comma count, apostrophe count, count of . ? and !, characters per token,
        number of good part of speech ngrams, good part of speech ngrams per token.
        The two ratio columns are 0.0 for an essay without tokens.
        """
        text = e_set._text
        lengths = [len(e) for e in text]
        word_counts = [len(t) for t in e_set._tokens]
        comma_count = [e.count(",") for e in text]
        ap_count = [e.count("'") for e in text]
        punc_count = [e.count(".") + e.count("?") + e.count("!") for e in text]
        chars_per_word = [self._ratio(lengths[m], word_counts[m]) for m in range(len(text))]

        good_pos_tags = []
        for essay_idx in range(len(text)):
            #Each entry of _pos is indexed as (token, tag); only the tag is used here
            pos_seq = [tag[1] for tag in e_set._pos[essay_idx]]
            pos_ngrams = util_functions.ngrams(pos_seq, 2, 4)
            overlap_ngrams = [gram for gram in pos_ngrams if gram in self._good_pos_ngrams]
            good_pos_tags.append(len(overlap_ngrams))
        good_pos_tag_prop = [self._ratio(good_pos_tags[m], word_counts[m]) for m in range(len(text))]

        length_arr = numpy.array((
            lengths, word_counts, comma_count, ap_count, punc_count, chars_per_word, good_pos_tags,
            good_pos_tag_prop)).transpose()
        return length_arr.copy()

    def gen_bag_feats(self, e_set):
        """
        Generate bag of words features for each essay in e_set.
        Returns the stemmed text counts followed by the normal text counts, one row per essay.
        Raises util_functions.InputError if initialize_dictionaries has not been run.
        """
        if not hasattr(self, '_stem_dict'):
            raise util_functions.InputError(self, "Dictionaries must be initialized prior to generating bag features.")

        sfeats = self._stem_dict.transform(e_set._clean_stem_text)
        nfeats = self._normal_dict.transform(e_set._text)
        bag_feats = numpy.concatenate((sfeats.toarray(), nfeats.toarray()), axis=1)
        return bag_feats.copy()

    def gen_feats(self, e_set):
        """
        Generate all features for each essay in e_set.
        Returns one array with the length, prompt and bag features side by side, in that order.
        """
        bag_feats = self.gen_bag_feats(e_set)
        length_feats = self.gen_length_feats(e_set)
        prompt_feats = self.gen_prompt_feats(e_set)
        overall_feats = numpy.concatenate((length_feats, prompt_feats, bag_feats), axis=1)
        return overall_feats.copy()

    def gen_prompt_feats(self, e_set):
        """
        Generate prompt overlap features for each essay in e_set.
        Returns an array with one row per essay and the columns: number of essay tokens found in
        the prompt, that number per token, number of essay tokens found among the wordnet synonyms
        of the prompt tokens, that number per token.
        The two ratio columns are 0.0 for an essay without tokens.
        """
        prompt_toks = nltk.word_tokenize(e_set._prompt)
        expand_syns = list(chain.from_iterable(
            util_functions.get_wordnet_syns(word) for word in prompt_toks))

        prompt_overlap, prompt_overlap_prop = [], []
        expand_overlap, expand_overlap_prop = [], []
        for essay_toks in e_set._tokens:
            in_prompt = len([tok for tok in essay_toks if tok in prompt_toks])
            in_syns = len([tok for tok in essay_toks if tok in expand_syns])
            prompt_overlap.append(in_prompt)
            prompt_overlap_prop.append(self._ratio(in_prompt, len(essay_toks)))
            expand_overlap.append(in_syns)
            expand_overlap_prop.append(self._ratio(in_syns, len(essay_toks)))

        prompt_arr = numpy.array((prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)).transpose()
        return prompt_arr.copy()
\ No newline at end of file
...@@ -10,7 +10,7 @@ import os ...@@ -10,7 +10,7 @@ import os
import sklearn.ensemble import sklearn.ensemble
from itertools import chain from itertools import chain
base_path = os.path.dirname( __file__ ) base_path = os.path.dirname(__file__)
sys.path.append(base_path) sys.path.append(base_path)
from essay_set import essay_set from essay_set import essay_set
...@@ -18,30 +18,31 @@ import util_functions ...@@ -18,30 +18,31 @@ import util_functions
import feature_extractor import feature_extractor
def read_in_test_data(filename):
    """
    Read scored essays from a tab separated file.
    The first line is treated as a header and skipped. Blank lines are skipped as well.
    Every other line must hold five tab separated fields: id, essay set, score, second score, text.
    Tabs after the fourth one are kept as part of the text.
    Returns (score, text): the list of int scores and the list of essay text strings.
    Raises ValueError if a line has too few fields or a numeric field is not an int.
    """
    score, text = [], []
    with open(filename) as data_file:
        raw_lines = data_file.read().splitlines()

    for raw_line in raw_lines[1:]:
        line = raw_line.strip()
        if not line:
            continue
        id1, set1, score1, score12, text1 = line.split("\t", 4)
        #Only the first score and the text are returned, but the other numeric
        #columns are still parsed so that a malformed row raises ValueError
        for field in (id1, set1, score12):
            int(field)
        score.append(int(score1))
        text.append(text1)

    return score, text
def read_in_test_prompt(filename):
    """
    Read the prompt from a file.
    Returns the complete content of filename as one string.
    """
    #The with block closes the file even if reading fails
    with open(filename) as prompt_file:
        return prompt_file.read()
#Create an essay set. text and score should be lists of strings and ints, respectively. #Create an essay set. text and score should be lists of strings and ints, respectively.
def create_essay_set(text,score,prompt_string,generate_additional=True): def create_essay_set(text, score, prompt_string, generate_additional=True):
x=essay_set() x = essay_set()
for i in xrange(0,len(text)): for i in xrange(0, len(text)):
x.add_essay(text[i],score[i]) x.add_essay(text[i], score[i])
if score[i]==min(score) and generate_additional==True: if score[i] == min(score) and generate_additional == True:
x.generate_additional_essays(x._clean_text[len(x._clean_text)-1],score[i]) x.generate_additional_essays(x._clean_text[len(x._clean_text) - 1], score[i])
x.update_prompt(prompt_string) x.update_prompt(prompt_string)
...@@ -49,22 +50,22 @@ def create_essay_set(text,score,prompt_string,generate_additional=True): ...@@ -49,22 +50,22 @@ def create_essay_set(text,score,prompt_string,generate_additional=True):
#Feed in an essay set to get feature vector and classifier
def extract_features_and_generate_model(essays):
    """
    Build a feature extractor and a classifier from an essay set.
    essays must be an essay set of the train type, because initialize_dictionaries rejects anything else.
    Returns (feature extractor, classifier).
    """
    f = feature_extractor.feature_extractor()
    f.initialize_dictionaries(essays)

    train_feats = f.gen_feats(essays)

    clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                      max_depth=4, random_state=1,
                                                      min_samples_leaf=3)

    #NOTE(review): the return value of gen_model was never used, so clf is presumably
    #fitted in place by this call - confirm against util_functions.gen_model
    util_functions.gen_model(clf, train_feats, essays._score)

    return f, clf
#Writes out model to pickle file
def dump_model_to_file(prompt_string, feature_ext, classifier, model_path):
    """
    Pickle the prompt, feature extractor and classifier to model_path.
    The file holds one dictionary with the keys 'prompt', 'extractor' and 'model'.
    An existing file at model_path is overwritten.
    """
    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier}
    #Binary mode keeps the pickle bytes untouched on every platform, and the
    #with block closes the file so the data is flushed to disk before returning
    with open(model_path, "wb") as model_handle:
        pickle.dump(model_file, model_handle)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment