Commit 88b1585b by Vik Paruchuri

changed class names, added documentation

parent 4f1c10ca
Project to integrate machine-learning-based essay scoring with xserver. Aspell must be installed and added to the path to run. numpy, scipy, sklearn, and nltk also need to be installed.
NLTK also requires the treebank maxent tagger and wordnet data to be installed. These can be installed through the nltk downloader (nltk.download()), or programmatically via python -m nltk.downloader maxent_treebank_pos_tagger wordnet.
Runnable files:
1. create_test_models.py
......
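A minimal setup sketch, not part of this commit, assuming the two NLTK package ids named above:

import nltk

# Fetch the tagger model and the wordnet corpus the scorer depends on.
for pkg in ("maxent_treebank_pos_tagger", "wordnet"):
    nltk.download(pkg)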
Project to integrate machine-learning-based essay scoring with xserver. Aspell must be installed and added to the path to run. numpy, scipy and sklearn also need to be installed.
Project to integrate machine-learning-based essay scoring with xserver. Aspell must be installed and added to the path to run. numpy, scipy, sklearn, and nltk also need to be installed.
Runnable files:
......
......@@ -12,7 +12,7 @@ sys.path.append(base_path)
import util_functions
class essay_set:
class EssaySet:
def __init__(self, type="train"):
"""
Initialize variables and check essay set type
......
......@@ -11,11 +11,11 @@ from itertools import chain
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import essay_set
from essay_set import EssaySet
import util_functions
class feature_extractor:
class FeatureExtractor:
def __init__(self):
self._good_pos_ngrams = self.get_good_pos_ngrams()
self.dict_initialized = False
......
......@@ -13,11 +13,16 @@ from itertools import chain
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import essay_set
from essay_set import EssaySet
import util_functions
import feature_extractor
def read_in_test_data(filename):
"""
Reads in the test data file found at filename.
filename must be a tab-delimited file with columns: id, dummy number column, score, dummy score, text.
Returns the scores and the essay texts.
"""
id, e_set, score, score2, text = [], [], [], [], []
combined_raw = open(filename).read()
raw_lines = combined_raw.splitlines()
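A hypothetical one-line illustration of the tab-delimited layout described in the docstring (all values invented):

# id <tab> set <tab> score <tab> dummy score <tab> essay text
line = "1\t1\t3\t3\tThe essay text goes here."
id_val, set_val, score_val, score2_val, text_val = line.split("\t")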
......@@ -33,12 +38,23 @@ def read_in_test_data(filename):
def read_in_test_prompt(filename):
"""
Reads in the prompt from a text file.
Returns a string.
"""
prompt_string = open(filename).read()
return prompt_string
#Create an essay set. text and score should be lists of strings and ints, respectively.
def create_essay_set(text, score, prompt_string, generate_additional=True):
x = essay_set()
"""
Creates an essay set from the given data.
text should be a list of strings corresponding to essay text.
score should be a list of scores where score[n] corresponds to text[n].
prompt_string is a string containing the essay prompt.
generate_additional indicates whether to generate additional essays at the minimum score point.
"""
x = EssaySet()
for i in xrange(0, len(text)):
x.add_essay(text[i], score[i])
if score[i] == min(score) and generate_additional:
......@@ -48,9 +64,13 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return x
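A toy usage sketch, not part of this commit; the essay strings, scores, and prompt are invented, and real use needs many scored essays:

sample_text = ["First sample essay.", "Second sample essay."]
sample_scores = [1, 2]
essay_set_obj = create_essay_set(sample_text, sample_scores, "Invented prompt.")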
#Feed in an essay set to get feature vector and classifier
def extract_features_and_generate_model(essays):
f = feature_extractor.feature_extractor()
"""
Feed in an essay set to get a feature vector and a classifier.
essays must be an EssaySet object.
Returns a trained FeatureExtractor object and a trained classifier.
"""
f = feature_extractor.FeatureExtractor()
f.initialize_dictionaries(essays)
train_feats = f.gen_feats(essays)
......@@ -63,8 +83,14 @@ def extract_features_and_generate_model(essays):
return f, clf
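Continuing the sketch above, training on the toy essay set:

feature_ext, classifier = extract_features_and_generate_model(essay_set_obj)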
#Writes out model to pickle file
def dump_model_to_file(prompt_string, feature_ext, classifier, model_path):
"""
Writes out a model to a file.
prompt_string is a string containing the prompt.
feature_ext is a trained FeatureExtractor object.
classifier is a trained classifier.
model_path is the path to write the model file to.
"""
model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier}
pickle.dump(model_file, file=open(model_path, "w"))
......
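A hypothetical round trip for the pickled model file, matching the dict layout in dump_model_to_file; the path is invented, and the "r" read mode assumes the Python 2 text-mode pickle written above:

dump_model_to_file("Invented prompt.", feature_ext, classifier, "model.p")
model_file = pickle.load(open("model.p", "r"))
prompt = model_file['prompt']
extractor = model_file['extractor']
model = model_file['model']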
......@@ -301,6 +301,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
rater_b is a list of rater b scores
min_rating is an optional argument describing the minimum rating possible on the data set
max_rating is an optional argument describing the maximum rating possible on the data set
Returns a float corresponding to the kappa correlation
"""
assert(len(rater_a) == len(rater_b))
if min_rating is None:
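A quick sanity check, assuming the standard quadratic weighted kappa definition (identical rating lists give perfect agreement):

kappa = quadratic_weighted_kappa([1, 2, 3], [1, 2, 3])  # expected: 1.0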
......@@ -333,6 +334,11 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
"""
Generates a confusion matrix between rater_a and rater_b
A confusion matrix shows how often two raters agree and disagree.
See quadratic_weighted_kappa for argument descriptions
"""
assert(len(rater_a) == len(rater_b))
if min_rating is None:
min_rating = min(rater_a)
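A hypothetical call on a 1-2 scale, assuming rows index rater_a scores and columns rater_b scores:

conf_mat = confusion_matrix([1, 1, 2], [1, 2, 2])  # expected: [[1, 1], [0, 1]]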
......@@ -347,6 +353,11 @@ def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
def histogram(ratings, min_rating=None, max_rating=None):
"""
Generates a frequency count of each rating on the scale
ratings is a list of scores
Returns a list of frequencies
"""
if min_rating is None:
min_rating = min(ratings)
if max_rating is None:
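A hypothetical call on an inferred 1-3 scale:

counts = histogram([1, 1, 2, 3])  # expected: [2, 1, 1]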
......@@ -359,6 +370,11 @@ def histogram(ratings, min_rating=None, max_rating=None):
def get_wordnet_syns(word):
"""
Utilizes wordnet (installed with nltk) to get synonyms for a word.
word is the input word.
Returns a list of unique synonyms.
"""
synonyms = []
regex = r"_"
pat = re.compile(regex)
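A hypothetical call, assuming the wordnet data is installed as described in the README:

syns = get_wordnet_syns("quick")  # may include e.g. "speedy" and "fast"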
......@@ -371,6 +387,12 @@ def get_wordnet_syns(word):
def get_separator_words(toks1):
"""
Finds the words that separate a list of tokens from a background corpus
In effect, this generates a list of informative/interesting words in a set.
toks1 is a list of words
Returns a list of separator words
"""
tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
if(os.path.isfile("essay_cor_tokens.p")):
toks2 = pickle.load(open('essay_cor_tokens.p', 'rb'))
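A hypothetical call with an invented token list, assuming the background-corpus pickle (essay_cor_tokens.p) is available:

toks = ["photosynthesis", "chlorophyll", "the", "and", "plant"]
separators = get_separator_words(toks)  # informative words relative to the corpus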
......@@ -395,12 +417,21 @@ def get_separator_words(toks1):
def encode_plus(s):
"""
URL-encodes the plus sign ("+" becomes "%2B").
s is the input string.
Returns the string with plus signs encoded.
"""
regex = r"\+"
pat = re.compile(regex)
return pat.sub("%2B", s)
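For example:

encoded = encode_plus("2+2=4")  # returns "2%2B2=4"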
def getMedian(numericValues):
"""
Gets the median of a list of values.
Returns a float or an int.
"""
theValues = sorted(numericValues)
if len(theValues) % 2 == 1:
......
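Hypothetical calls, assuming the truncated even-length case averages the two middle values, as is conventional:

getMedian([3, 1, 2])     # odd count: middle value, 2
getMedian([4, 1, 3, 2])  # even count: mean of the two middle values, 2.5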