Commit ef68f076 by Vik Paruchuri

updates to base path and removed deprecated sklearn code

parent e90b1d90
...@@ -13,6 +13,9 @@ base_path = os.path.dirname(__file__) ...@@ -13,6 +13,9 @@ base_path = os.path.dirname(__file__)
sys.path.append(base_path) sys.path.append(base_path)
import util_functions import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
MAXIMUM_ESSAY_LENGTH=20000 MAXIMUM_ESSAY_LENGTH=20000
class EssaySet(object): class EssaySet(object):
......
...@@ -16,6 +16,9 @@ sys.path.append(base_path) ...@@ -16,6 +16,9 @@ sys.path.append(base_path)
from essay_set import EssaySet from essay_set import EssaySet
import util_functions import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
class FeatureExtractor(object): class FeatureExtractor(object):
def __init__(self): def __init__(self):
...@@ -33,8 +36,8 @@ class FeatureExtractor(object): ...@@ -33,8 +36,8 @@ class FeatureExtractor(object):
if(e_set._type == "train"): if(e_set._type == "train"):
nvocab = util_functions.get_vocab(e_set._text, e_set._score) nvocab = util_functions.get_vocab(e_set._text, e_set._score)
svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score) svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score)
self._normal_dict = CountVectorizer(min_n=1, max_n=2, vocabulary=nvocab) self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
self._stem_dict = CountVectorizer(min_n=1, max_n=2, vocabulary=svocab) self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
self.dict_initialized = True self.dict_initialized = True
ret = "ok" ret = "ok"
else: else:
...@@ -48,13 +51,13 @@ class FeatureExtractor(object): ...@@ -48,13 +51,13 @@ class FeatureExtractor(object):
Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
Returns the list and caches the file Returns the list and caches the file
""" """
if(os.path.isfile("good_pos_ngrams.p")): if(os.path.isfile(base_path + "good_pos_ngrams.p")):
good_pos_ngrams = pickle.load(open('good_pos_ngrams.p', 'rb')) good_pos_ngrams = pickle.load(open(base_path + 'good_pos_ngrams.p', 'rb'))
else: else:
essay_corpus = open("essaycorpus.txt").read() essay_corpus = open(base_path + "essaycorpus.txt").read()
essay_corpus = util_functions.sub_chars(essay_corpus) essay_corpus = util_functions.sub_chars(essay_corpus)
good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus) good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
pickle.dump(good_pos_ngrams, open('good_pos_ngrams.p', 'wb')) pickle.dump(good_pos_ngrams, open(base_path + 'good_pos_ngrams.p', 'wb'))
return good_pos_ngrams return good_pos_ngrams
def gen_length_feats(self, e_set): def gen_length_feats(self, e_set):
...@@ -65,7 +68,7 @@ class FeatureExtractor(object): ...@@ -65,7 +68,7 @@ class FeatureExtractor(object):
""" """
text = e_set._text text = e_set._text
lengths = [len(e) for e in text] lengths = [len(e) for e in text]
word_counts = [len(t) for t in e_set._tokens] word_counts = [max(len(t),1) for t in e_set._tokens]
comma_count = [e.count(",") for e in text] comma_count = [e.count(",") for e in text]
ap_count = [e.count("'") for e in text] ap_count = [e.count("'") for e in text]
punc_count = [e.count(".") + e.count("?") + e.count("!") for e in text] punc_count = [e.count(".") + e.count("?") + e.count("!") for e in text]
......
...@@ -150,7 +150,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200): ...@@ -150,7 +150,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
max_feats2 is the maximum number of features to consider in the second (final) pass max_feats2 is the maximum number of features to consider in the second (final) pass
Returns a list of words that constitute the significant vocabulary Returns a list of words that constitute the significant vocabulary
""" """
dict = CountVectorizer(min_n=1, max_n=2, max_features=max_feats) dict = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
dict_mat = dict.fit_transform(text) dict_mat = dict.fit_transform(text)
set_score = numpy.asarray(score, dtype=numpy.int) set_score = numpy.asarray(score, dtype=numpy.int)
med_score = numpy.median(set_score) med_score = numpy.median(set_score)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment