Commit 88b1585b by Vik Paruchuri

changed class names, added documentation

parent 4f1c10ca
Project to integrate machine-learning-based essay scoring with xserver. Aspell must be installed and added to the path to run. numpy, scipy, sklearn, and nltk also need to be installed.
NLTK also requires the treebank maxent tagger and wordnet data to be installed. These can be installed through the nltk downloader (nltk.download()), or programmatically via python -m nltk.downloader maxent_treebank_pos_tagger wordnet.
Runnable files:
1. create_test_models.py
......
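A minimal setup sketch, not part of this commit, assuming the two NLTK package ids named above:

import nltk

# Fetch the tagger model and the wordnet corpus the scorer depends on.
for pkg in ("maxent_treebank_pos_tagger", "wordnet"):
    nltk.download(pkg)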
Project to integrate machine-learning-based essay scoring with xserver. Aspell must be installed and added to the path to run. numpy, scipy and sklearn also need to be installed.
Project to integrate machine-learning-based essay scoring with xserver. Aspell must be installed and added to the path to run. numpy, scipy, sklearn, and nltk also need to be installed.
Runnable files:
......
......@@ -12,7 +12,7 @@ sys.path.append(base_path)
import util_functions
class essay_set:
class EssaySet:
def __init__(self, type="train"):
"""
Initialize variables and check essay set type
......
......@@ -11,11 +11,11 @@ from itertools import chain
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import essay_set
from essay_set import EssaySet
import util_functions
class feature_extractor:
class FeatureExtractor:
def __init__(self):
self._good_pos_ngrams = self.get_good_pos_ngrams()
self.dict_initialized = False
......
......@@ -13,11 +13,16 @@ from itertools import chain
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import essay_set
from essay_set import EssaySet
import util_functions
import feature_extractor
def read_in_test_data(filename):
"""
Reads in the test data file found at filename.
filename must be a tab-delimited file with columns: id, dummy number column, score, dummy score, text.
Returns the scores and the essay texts.
"""
id, e_set, score, score2, text = [], [], [], [], []
combined_raw = open(filename).read()
raw_lines = combined_raw.splitlines()
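A hypothetical one-line illustration of the tab-delimited layout described in the docstring (all values invented):

# id <tab> set <tab> score <tab> dummy score <tab> essay text
line = "1\t1\t3\t3\tThe essay text goes here."
id_val, set_val, score_val, score2_val, text_val = line.split("\t")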
......@@ -33,12 +38,23 @@ def read_in_test_data(filename):
def read_in_test_prompt(filename):
"""
Reads in the prompt from a text file.
Returns a string.
"""
prompt_string = open(filename).read()
return prompt_string
#Create an essay set. text and score should be lists of strings and ints, respectively.
def create_essay_set(text, score, prompt_string, generate_additional=True):
x = essay_set()
"""
Creates an essay set from the given data.
text should be a list of strings corresponding to essay text.
score should be a list of scores where score[n] corresponds to text[n].
prompt_string is a string containing the essay prompt.
generate_additional indicates whether to generate additional essays at the minimum score point.
"""
x = EssaySet()
for i in xrange(0, len(text)):
x.add_essay(text[i], score[i])
if score[i] == min(score) and generate_additional:
......@@ -48,9 +64,13 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return x
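A toy usage sketch, not part of this commit; the essay strings, scores, and prompt are invented, and real use needs many scored essays:

sample_text = ["First sample essay.", "Second sample essay."]
sample_scores = [1, 2]
essay_set_obj = create_essay_set(sample_text, sample_scores, "Invented prompt.")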
#Feed in an essay set to get feature vector and classifier
def extract_features_and_generate_model(essays):
f = feature_extractor.feature_extractor()
"""
Feed in an essay set to get a feature vector and a classifier.
essays must be an EssaySet object.
Returns a trained FeatureExtractor object and a trained classifier.
"""
f = feature_extractor.FeatureExtractor()
f.initialize_dictionaries(essays)
train_feats = f.gen_feats(essays)
......@@ -63,8 +83,14 @@ def extract_features_and_generate_model(essays):
return f, clf
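Continuing the sketch above, training on the toy essay set:

feature_ext, classifier = extract_features_and_generate_model(essay_set_obj)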
#Writes out model to pickle file
def dump_model_to_file(prompt_string, feature_ext, classifier, model_path):
"""
Writes out a model to a file.
prompt_string is a string containing the prompt.
feature_ext is a trained FeatureExtractor object.
classifier is a trained classifier.
model_path is the path to write the model file to.
"""
model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier}
pickle.dump(model_file, file=open(model_path, "w"))
......
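A hypothetical round trip for the pickled model file, matching the dict layout in dump_model_to_file; the path is invented, and the "r" read mode assumes the Python 2 text-mode pickle written above:

dump_model_to_file("Invented prompt.", feature_ext, classifier, "model.p")
model_file = pickle.load(open("model.p", "r"))
prompt = model_file['prompt']
extractor = model_file['extractor']
model = model_file['model']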
......@@ -301,6 +301,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
rater_b is a list of rater b scores
min_rating is an optional argument describing the minimum rating possible on the data set
max_rating is an optional argument describing the maximum rating possible on the data set
Returns a float corresponding to the kappa correlation
"""
assert(len(rater_a) == len(rater_b))
if min_rating is None:
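A quick sanity check, assuming the standard quadratic weighted kappa definition (identical rating lists give perfect agreement):

kappa = quadratic_weighted_kappa([1, 2, 3], [1, 2, 3])  # expected: 1.0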
......@@ -333,6 +334,11 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
"""
Generates a confusion matrix between rater_a and rater_b
A confusion matrix shows how often two raters agree and disagree.
See quadratic_weighted_kappa for argument descriptions
"""
assert(len(rater_a) == len(rater_b))
if min_rating is None:
min_rating = min(rater_a)
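A hypothetical call on a 1-2 scale, assuming rows index rater_a scores and columns rater_b scores:

conf_mat = confusion_matrix([1, 1, 2], [1, 2, 2])  # expected: [[1, 1], [0, 1]]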
......@@ -347,6 +353,11 @@ def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
def histogram(ratings, min_rating=None, max_rating=None):
"""
Generates a frequency count of each rating on the scale
ratings is a list of scores
Returns a list of frequencies
"""
if min_rating is None:
min_rating = min(ratings)
if max_rating is None:
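A hypothetical call on an inferred 1-3 scale:

counts = histogram([1, 1, 2, 3])  # expected: [2, 1, 1]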
......@@ -359,6 +370,11 @@ def histogram(ratings, min_rating=None, max_rating=None):
def get_wordnet_syns(word):
"""
Utilizes wordnet (installed with nltk) to get synonyms for a word.
word is the input word.
Returns a list of unique synonyms.
"""
synonyms = []
regex = r"_"
pat = re.compile(regex)
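A hypothetical call, assuming the wordnet data is installed as described in the README:

syns = get_wordnet_syns("quick")  # may include e.g. "speedy" and "fast"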
......@@ -371,6 +387,12 @@ def get_wordnet_syns(word):
def get_separator_words(toks1):
"""
Finds the words that separate a list of tokens from a background corpus
In effect, this generates a list of informative/interesting words in a set.
toks1 is a list of words
Returns a list of separator words
"""
tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
if(os.path.isfile("essay_cor_tokens.p")):
toks2 = pickle.load(open('essay_cor_tokens.p', 'rb'))
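A hypothetical call with an invented token list, assuming the background-corpus pickle (essay_cor_tokens.p) is available:

toks = ["photosynthesis", "chlorophyll", "the", "and", "plant"]
separators = get_separator_words(toks)  # informative words relative to the corpus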
......@@ -395,12 +417,21 @@ def get_separator_words(toks1):
def encode_plus(s):
"""
URL-encodes the plus sign ("+" becomes "%2B").
s is the input string.
Returns the string with plus signs encoded.
"""
regex = r"\+"
pat = re.compile(regex)
return pat.sub("%2B", s)
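For example:

encoded = encode_plus("2+2=4")  # returns "2%2B2=4"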
def getMedian(numericValues):
"""
Gets the median of a list of values.
Returns a float or an int.
"""
theValues = sorted(numericValues)
if len(theValues) % 2 == 1:
......
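Hypothetical calls, assuming the truncated even-length case averages the two middle values, as is conventional:

getMedian([3, 1, 2])     # odd count: middle value, 2
getMedian([4, 1, 3, 2])  # even count: mean of the two middle values, 2.5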