Commit e2c155af by Vik Paruchuri

move around data files

parent 6b14ee83
@@ -24,6 +24,8 @@ if not base_path.endswith("/"):
 
 log = logging.getLogger(__name__)
 
+NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
+ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
 
 class FeatureExtractor(object):
     def __init__(self):
@@ -65,13 +67,13 @@ class FeatureExtractor(object):
         Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
         Returns the list and caches the file
         """
-        if(os.path.isfile(base_path + "good_pos_ngrams.p")):
-            good_pos_ngrams = pickle.load(open(base_path + 'good_pos_ngrams.p', 'rb'))
-        elif os.path.isfile(base_path + "essaycorpus.txt"):
-            essay_corpus = open(base_path + "essaycorpus.txt").read()
+        if(os.path.isfile(NGRAM_PATH)):
+            good_pos_ngrams = pickle.load(open(NGRAM_PATH, 'rb'))
+        elif os.path.isfile(ESSAY_CORPUS_PATH):
+            essay_corpus = open(ESSAY_CORPUS_PATH).read()
             essay_corpus = util_functions.sub_chars(essay_corpus)
             good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
-            pickle.dump(good_pos_ngrams, open(base_path + 'good_pos_ngrams.p', 'wb'))
+            pickle.dump(good_pos_ngrams, open(NGRAM_PATH, 'wb'))
         else:
             #Hard coded list in case the needed files cannot be found
             good_pos_ngrams=['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
...
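Both hunks above lean on the same pickle-backed caching pattern: load the cached object if its file exists, otherwise rebuild it from the corpus and write the cache back. A minimal standalone sketch of that pattern, assuming an illustrative cache path and builder callable (load_or_build, CACHE_PATH, and build_fn are not part of this commit):

    import os
    import pickle

    # Illustrative cache location; the real code uses NGRAM_PATH under data/.
    CACHE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "cache.p")

    def load_or_build(build_fn, cache_path=CACHE_PATH):
        """Return the pickled object at cache_path, rebuilding via build_fn if absent."""
        if os.path.isfile(cache_path):
            with open(cache_path, "rb") as f:
                return pickle.load(f)
        result = build_fn()
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, "wb") as f:
            pickle.dump(result, f)
        return result

Called as, say, load_or_build(lambda: regenerate_good_tokens(corpus)), this mirrors the get_good_pos_ngrams flow without repeating the path logic at each call site.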
@@ -15,6 +15,14 @@ import logging
 
 log=logging.getLogger(__name__)
 
+base_path = os.path.dirname(__file__)
+sys.path.append(base_path)
+if not base_path.endswith("/"):
+    base_path=base_path+"/"
+
+ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
+ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
+
 class AlgorithmTypes(object):
     regression = "regression"
     classification = "classifiction"
@@ -417,13 +425,13 @@ def get_separator_words(toks1):
     Returns a list of separator words
     """
     tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
-    if(os.path.isfile("essay_cor_tokens.p")):
-        toks2 = pickle.load(open('essay_cor_tokens.p', 'rb'))
+    if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
+        toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
     else:
-        essay_corpus = open("essaycorpus.txt").read()
+        essay_corpus = open(ESSAY_CORPUS_PATH).read()
         essay_corpus = sub_chars(essay_corpus)
         toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
-        pickle.dump(toks2, open('essay_cor_tokens.p', 'wb'))
+        pickle.dump(toks2, open(ESSAY_COR_TOKENS_PATH, 'wb'))
     sep_words = []
     for word in tab_toks1.keys():
         tok1_present = tab_toks1[word]
...
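The underlying fix in this second file is that "essaycorpus.txt" and "essay_cor_tokens.p" were previously opened relative to the process working directory, so get_separator_words failed whenever the library was imported from anywhere but the package root. Anchoring the constants to the module's own location avoids that. A sketch of the same idiom using os.path.join, which also removes the need for the endswith("/") check (the data/ layout matches the commit; everything else here is illustrative):

    import os

    # Resolve data files relative to this module, not the caller's working directory.
    BASE_PATH = os.path.dirname(os.path.abspath(__file__))

    ESSAY_CORPUS_PATH = os.path.join(BASE_PATH, "data", "essaycorpus.txt")
    ESSAY_COR_TOKENS_PATH = os.path.join(BASE_PATH, "data", "essay_cor_tokens.p")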