Commit 6f7ae1c2 by Vik Paruchuri

added documentation strings and reformatted code

parent 88b1585b
@@ -21,6 +21,12 @@ class FeatureExtractor:
        self.dict_initialized = False
    def initialize_dictionaries(self, e_set):
"""
Initializes dictionaries from an essay set object
Dictionaries must be initialized prior to using this to extract features
e_set is an input essay set
returns a confirmation of initialization
"""
        if(hasattr(e_set, '_type')):
            if(e_set._type == "train"):
                nvocab = util_functions.get_vocab(e_set._text, e_set._score)
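As a rough usage sketch (names like train_set and test_set are placeholders for EssaySet objects built elsewhere, not part of this commit), the dictionaries have to be initialized from a training set before any features are extracted:

extractor = FeatureExtractor()
extractor.initialize_dictionaries(train_set)  # expects train_set._type == "train"
train_feats = extractor.gen_feats(train_set)
test_feats = extractor.gen_feats(test_set)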
@@ -36,6 +42,10 @@ class FeatureExtractor:
        return ret
    def get_good_pos_ngrams(self):
"""
Gets a list of gramatically correct part of speech sequences from an input file called essaycorpus.txt
Returns the list and caches the file
"""
        if(os.path.isfile("good_pos_ngrams.p")):
            good_pos_ngrams = pickle.load(open('good_pos_ngrams.p', 'rb'))
        else:
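The branch above follows a compute-once, cache-to-disk pattern: load the pickled ngram list if it exists, otherwise rebuild it and write it out. A minimal standalone sketch of the same pattern (the file name and builder function below are illustrative, not this module's API):

import os
import pickle

def load_or_build(cache_path, build_fn):
    # Reuse the cached result if the pickle already exists.
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    # Otherwise build it once and cache it for the next run.
    result = build_fn()
    with open(cache_path, 'wb') as f:
        pickle.dump(result, f)
    return result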
@@ -46,6 +56,11 @@ class FeatureExtractor:
        return good_pos_ngrams
    def gen_length_feats(self, e_set):
"""
Generates length based features from an essay set
Generally an internal function called by gen_feats
Returns an array of length features
"""
        text = e_set._text
        lengths = [len(e) for e in text]
        word_counts = [len(t) for t in e_set._tokens]
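A minimal sketch of the length-feature idea, assuming the essay set exposes raw text and per-essay token lists as above (the two columns here are illustrative; the actual method adds further length-derived columns):

import numpy

def length_features(texts, tokens):
    # One row per essay: character count and word count.
    lengths = [len(t) for t in texts]
    word_counts = [len(tok) for tok in tokens]
    return numpy.array([lengths, word_counts]).transpose()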
@@ -68,6 +83,11 @@ class FeatureExtractor:
        return length_arr.copy()
    def gen_bag_feats(self, e_set):
"""
Generates bag of words features from an input essay set and trained FeatureExtractor
Generally called by gen_feats
Returns an array of features
"""
        if(hasattr(self, '_stem_dict')):
            sfeats = self._stem_dict.transform(e_set._clean_stem_text)
            nfeats = self._normal_dict.transform(e_set._text)
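Here the stem and normal dictionaries act as fitted vectorizers, one over stemmed text and one over raw text, and their outputs are joined column-wise. A standalone sketch of that step, assuming sklearn-style CountVectorizer objects stand in for the dictionaries built by initialize_dictionaries:

from sklearn.feature_extraction.text import CountVectorizer
import numpy

# One vectorizer fit on stemmed training text, one on raw training text.
stem_dict = CountVectorizer().fit(["studi hard pass exam", "write clear essay"])
normal_dict = CountVectorizer().fit(["studied hard to pass the exam", "writes a clear essay"])

# Transform new essays with both, then join the blocks column-wise.
sfeats = stem_dict.transform(["write a clear exam essay"]).toarray()
nfeats = normal_dict.transform(["writes a clear exam essay"]).toarray()
bag_feats = numpy.concatenate((sfeats, nfeats), axis=1)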
@@ -77,6 +97,10 @@ class FeatureExtractor:
        return bag_feats.copy()
    def gen_feats(self, e_set):
"""
Generates bag of words, length, and prompt features from an essay set object
returns an array of features
"""
        bag_feats = self.gen_bag_feats(e_set)
        length_feats = self.gen_length_feats(e_set)
        prompt_feats = self.gen_prompt_feats(e_set)
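gen_feats combines the three blocks into one matrix with a row per essay. A small sketch of that combination using placeholder arrays (shapes and column order are illustrative):

import numpy

n_essays = 3
length_feats = numpy.zeros((n_essays, 5))
prompt_feats = numpy.zeros((n_essays, 1))
bag_feats = numpy.zeros((n_essays, 200))

# Concatenate column-wise so each row holds all features for one essay.
overall_feats = numpy.concatenate((length_feats, prompt_feats, bag_feats), axis=1)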
@@ -86,6 +110,11 @@ class FeatureExtractor:
        return overall_feats
    def gen_prompt_feats(self, e_set):
"""
Generates prompt based features from an essay set object and internal prompt variable.
Generally called internally by gen_feats
Returns an array of prompt features
"""
        prompt_toks = nltk.word_tokenize(e_set._prompt)
        expand_syns = []
        for word in prompt_toks:
...
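The prompt features start from the tokenized prompt and a per-token synonym expansion. A rough standalone sketch of that expansion step using NLTK's WordNet interface (the module's own synonym helper lives in util_functions, so the calls below are an approximation, and they assume the punkt and wordnet corpora are installed):

import nltk
from nltk.corpus import wordnet

prompt = "Explain how the water cycle works."
prompt_toks = nltk.word_tokenize(prompt)

expand_syns = []
for word in prompt_toks:
    # Collect WordNet lemma names across every synset of the token.
    synonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            synonyms.add(lemma.name())
    expand_syns.append(list(synonyms))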