Commit e2c155af by Vik Paruchuri

move around data files

parent 6b14ee83
@@ -24,6 +24,8 @@ if not base_path.endswith("/"):
 log = logging.getLogger(__name__)
+NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
+ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
 class FeatureExtractor(object):
     def __init__(self):
@@ -65,13 +67,13 @@ class FeatureExtractor(object):
         Gets a list of grammatically correct part-of-speech sequences from an input file called essaycorpus.txt
         Returns the list and caches the file
         """
-        if(os.path.isfile(base_path + "good_pos_ngrams.p")):
-            good_pos_ngrams = pickle.load(open(base_path + 'good_pos_ngrams.p', 'rb'))
-        elif os.path.isfile(base_path + "essaycorpus.txt"):
-            essay_corpus = open(base_path + "essaycorpus.txt").read()
+        if(os.path.isfile(NGRAM_PATH)):
+            good_pos_ngrams = pickle.load(open(NGRAM_PATH, 'rb'))
+        elif os.path.isfile(ESSAY_CORPUS_PATH):
+            essay_corpus = open(ESSAY_CORPUS_PATH).read()
             essay_corpus = util_functions.sub_chars(essay_corpus)
             good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
-            pickle.dump(good_pos_ngrams, open(base_path + 'good_pos_ngrams.p', 'wb'))
+            pickle.dump(good_pos_ngrams, open(NGRAM_PATH, 'wb'))
         else:
             # Hard-coded list in case the needed files cannot be found
             good_pos_ngrams = ['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
...
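The pattern both hunks above preserve is: load the pickle if present, otherwise regenerate it from the corpus and cache it, otherwise fall back to a hard-coded list. The change is only that the paths now come from module-level constants pointing into data/ instead of being rebuilt inline. A minimal standalone sketch of that pattern, with regenerate and fallback as hypothetical stand-ins for the util_functions helpers this diff does not show:

    import os
    import pickle

    # Paths resolved relative to this module, as in the diff above.
    base_path = os.path.dirname(__file__)
    if not base_path.endswith("/"):
        base_path = base_path + "/"
    NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
    ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"

    def load_good_pos_ngrams(regenerate, fallback):
        """Load cached n-grams, rebuild them from the corpus, or fall back.

        regenerate: callable turning corpus text into an n-gram list
        fallback: list returned when neither cache nor corpus exists
        """
        if os.path.isfile(NGRAM_PATH):
            with open(NGRAM_PATH, 'rb') as f:
                return pickle.load(f)
        if os.path.isfile(ESSAY_CORPUS_PATH):
            with open(ESSAY_CORPUS_PATH) as f:
                corpus = f.read()
            ngrams = regenerate(corpus)
            with open(NGRAM_PATH, 'wb') as f:
                pickle.dump(ngrams, f)  # cache for the next call
            return ngrams
        return fallback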
@@ -15,6 +15,14 @@ import logging
 log = logging.getLogger(__name__)
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
+if not base_path.endswith("/"):
+    base_path = base_path + "/"
+ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
+ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
 class AlgorithmTypes(object):
     regression = "regression"
     classification = "classification"
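This hunk makes the utility module the single owner of the data paths; the first file in this commit then reads the corpus location back as util_functions.ESSAY_CORPUS_PATH instead of rebuilding it. The diff omits file names, so the module names below are assumptions; the dependency looks roughly like this:

    # util_functions.py (name assumed): data paths defined once,
    # relative to the module rather than the working directory.
    import os
    base_path = os.path.dirname(__file__)
    if not base_path.endswith("/"):
        base_path = base_path + "/"
    ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"

    # feature_extractor.py (name assumed): reuse the constant instead of
    # recomputing the path, so both modules agree on the location.
    import util_functions
    ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH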
@@ -417,13 +425,13 @@ def get_separator_words(toks1):
     Returns a list of separator words
     """
     tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
-    if(os.path.isfile("essay_cor_tokens.p")):
-        toks2 = pickle.load(open('essay_cor_tokens.p', 'rb'))
+    if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
+        toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
     else:
-        essay_corpus = open("essaycorpus.txt").read()
+        essay_corpus = open(ESSAY_CORPUS_PATH).read()
         essay_corpus = sub_chars(essay_corpus)
         toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
-        pickle.dump(toks2, open('essay_cor_tokens.p', 'wb'))
+        pickle.dump(toks2, open(ESSAY_COR_TOKENS_PATH, 'wb'))
     sep_words = []
     for word in tab_toks1.keys():
         tok1_present = tab_toks1[word]
...
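This second hunk applies the same cache-or-rebuild idea to a token frequency table. A minimal sketch under the assumption that NLTK (with the punkt tokenizer data) is installed; the sub_chars cleanup step is omitted because its definition is not shown in this diff:

    import os
    import pickle
    import nltk

    def load_corpus_freqdist(corpus_path, cache_path):
        """Return a FreqDist over the reference corpus, cached as a pickle."""
        if os.path.isfile(cache_path):
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        with open(corpus_path) as f:
            corpus = f.read()
        # nltk.word_tokenize requires the punkt tokenizer data.
        toks = nltk.FreqDist(w.lower() for w in nltk.word_tokenize(corpus))
        with open(cache_path, 'wb') as f:
            pickle.dump(toks, f)
        return toks

    # Usage mirroring the diff:
    # toks2 = load_corpus_freqdist(ESSAY_CORPUS_PATH, ESSAY_COR_TOKENS_PATH)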