Commit e2c155af by Vik Paruchuri

move around data files

parent 6b14ee83
@@ -24,6 +24,8 @@ if not base_path.endswith("/"):
 log = logging.getLogger(__name__)
+NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
+ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
 class FeatureExtractor(object):
     def __init__(self):
@@ -65,13 +67,13 @@ class FeatureExtractor(object):
         Gets a list of grammatically correct part-of-speech sequences from an input file called essaycorpus.txt
         Returns the list and caches the file
         """
-        if(os.path.isfile(base_path + "good_pos_ngrams.p")):
-            good_pos_ngrams = pickle.load(open(base_path + 'good_pos_ngrams.p', 'rb'))
-        elif os.path.isfile(base_path + "essaycorpus.txt"):
-            essay_corpus = open(base_path + "essaycorpus.txt").read()
+        if(os.path.isfile(NGRAM_PATH)):
+            good_pos_ngrams = pickle.load(open(NGRAM_PATH, 'rb'))
+        elif os.path.isfile(ESSAY_CORPUS_PATH):
+            essay_corpus = open(ESSAY_CORPUS_PATH).read()
             essay_corpus = util_functions.sub_chars(essay_corpus)
             good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
-            pickle.dump(good_pos_ngrams, open(base_path + 'good_pos_ngrams.p', 'wb'))
+            pickle.dump(good_pos_ngrams, open(NGRAM_PATH, 'wb'))
         else:
             # Hard-coded list in case the needed files cannot be found
             good_pos_ngrams = ['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
...
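The pattern both hunks above preserve is: load the pickle if present, otherwise regenerate it from the corpus and cache it, otherwise fall back to a hard-coded list. The change is only that the paths now come from module-level constants pointing into data/ instead of being rebuilt inline. A minimal standalone sketch of that pattern, with regenerate and fallback as hypothetical stand-ins for the util_functions helpers this diff does not show:

    import os
    import pickle

    # Paths resolved relative to this module, as in the diff above.
    base_path = os.path.dirname(__file__)
    if not base_path.endswith("/"):
        base_path = base_path + "/"
    NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
    ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"

    def load_good_pos_ngrams(regenerate, fallback):
        """Load cached n-grams, rebuild them from the corpus, or fall back.

        regenerate: callable turning corpus text into an n-gram list
        fallback: list returned when neither cache nor corpus exists
        """
        if os.path.isfile(NGRAM_PATH):
            with open(NGRAM_PATH, 'rb') as f:
                return pickle.load(f)
        if os.path.isfile(ESSAY_CORPUS_PATH):
            with open(ESSAY_CORPUS_PATH) as f:
                corpus = f.read()
            ngrams = regenerate(corpus)
            with open(NGRAM_PATH, 'wb') as f:
                pickle.dump(ngrams, f)  # cache for the next call
            return ngrams
        return fallback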
@@ -15,6 +15,14 @@ import logging
 log = logging.getLogger(__name__)
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
+if not base_path.endswith("/"):
+    base_path = base_path + "/"
+ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
+ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
 class AlgorithmTypes(object):
     regression = "regression"
     classification = "classification"
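This hunk makes the utility module the single owner of the data paths; the first file in this commit then reads the corpus location back as util_functions.ESSAY_CORPUS_PATH instead of rebuilding it. The diff omits file names, so the module names below are assumptions; the dependency looks roughly like this:

    # util_functions.py (name assumed): data paths defined once,
    # relative to the module rather than the working directory.
    import os
    base_path = os.path.dirname(__file__)
    if not base_path.endswith("/"):
        base_path = base_path + "/"
    ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"

    # feature_extractor.py (name assumed): reuse the constant instead of
    # recomputing the path, so both modules agree on the location.
    import util_functions
    ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH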
@@ -417,13 +425,13 @@ def get_separator_words(toks1):
     Returns a list of separator words
     """
     tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
-    if(os.path.isfile("essay_cor_tokens.p")):
-        toks2 = pickle.load(open('essay_cor_tokens.p', 'rb'))
+    if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
+        toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
     else:
-        essay_corpus = open("essaycorpus.txt").read()
+        essay_corpus = open(ESSAY_CORPUS_PATH).read()
         essay_corpus = sub_chars(essay_corpus)
         toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
-        pickle.dump(toks2, open('essay_cor_tokens.p', 'wb'))
+        pickle.dump(toks2, open(ESSAY_COR_TOKENS_PATH, 'wb'))
     sep_words = []
     for word in tab_toks1.keys():
         tok1_present = tab_toks1[word]
...
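This second hunk applies the same cache-or-rebuild idea to a token frequency table. A minimal sketch under the assumption that NLTK (with the punkt tokenizer data) is installed; the sub_chars cleanup step is omitted because its definition is not shown in this diff:

    import os
    import pickle
    import nltk

    def load_corpus_freqdist(corpus_path, cache_path):
        """Return a FreqDist over the reference corpus, cached as a pickle."""
        if os.path.isfile(cache_path):
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        with open(corpus_path) as f:
            corpus = f.read()
        # nltk.word_tokenize requires the punkt tokenizer data.
        toks = nltk.FreqDist(w.lower() for w in nltk.word_tokenize(corpus))
        with open(cache_path, 'wb') as f:
            pickle.dump(toks, f)
        return toks

    # Usage mirroring the diff:
    # toks2 = load_corpus_freqdist(ESSAY_CORPUS_PATH, ESSAY_COR_TOKENS_PATH)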