Commit 9a92afda by gradyward

Gutted the util_functions.py file

parent 90bde0cd
...@@ -14,7 +14,6 @@ import numpy
from itertools import chain
import math
import nltk
import pickle
import logging
import sys
import tempfile
...@@ -39,21 +38,6 @@ class AlgorithmTypes(object):
    classification = "classifiction"
def create_model_path(model_path):
    """
    Creates a path to model files
    model_path - string
    """
    if not model_path.startswith("/") and not model_path.startswith("models/"):
        model_path = "/" + model_path
    if not model_path.startswith("models"):
        model_path = "models" + model_path
    if not model_path.endswith(".p"):
        model_path += ".p"
    return model_path
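For illustration, the normalization this performed, using a hypothetical model name:

create_model_path("grader")         # -> "models/grader.p"
create_model_path("models/grader")  # -> "models/grader.p"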
def sub_chars(string):
    """
    Strips illegal characters from a string. Used to sanitize input essays.
...@@ -176,32 +160,6 @@ def make_unique(sequence):
    return list(set(sequence))
def count_list(the_list):
    """
    Generates a count of the number of times each unique item appears in a list
    """
    count = the_list.count
    result = [(item, count(item)) for item in set(the_list)]
    result.sort()
    return result
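For reference, the standard library's collections.Counter yields the same pairs for hashable, orderable items:

from collections import Counter
sorted(Counter(["a", "b", "a"]).items())  # [('a', 2), ('b', 1)]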
def regenerate_good_tokens(string):
    """
    Given an input string, part of speech tags the string, then generates a list of
    ngrams that appear in the string.
    Used to define grammatically correct part of speech tag sequences.
    Returns a list of part of speech tag sequences.
    """
    toks = nltk.word_tokenize(string)
    pos_string = nltk.pos_tag(toks)
    pos_seq = [tag[1] for tag in pos_string]
    pos_ngrams = ngrams(pos_seq, 2, 4)
    # TODO POTENTIAL ISSUE WITH NON STABLE ALGORITHM F7!?!
    sel_pos_ngrams = make_unique(pos_ngrams)
    return sel_pos_ngrams
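The ngrams helper here is project-local (defined outside this diff) and appears to produce all n-grams for n from 2 to 4; with stock NLTK, a rough equivalent sketch:

import nltk
toks = nltk.word_tokenize("The quick brown fox jumps over the lazy dog.")
pos_seq = [tag for _, tag in nltk.pos_tag(toks)]
pos_ngrams = list(nltk.everygrams(pos_seq, 2, 4))  # all POS n-grams, 2 <= n <= 4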
def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
    """
    Uses a fisher test to find words that are significant in that they separate
...@@ -255,41 +213,7 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
    return vocab
def edit_distance(s1, s2):
    """
    Calculates string edit distance between string 1 and string 2.
    Deletion, insertion, substitution, and transposition all increase edit distance.
    """
    d = {}
    lenstr1 = len(s1)
    lenstr2 = len(s2)
    for i in xrange(-1, lenstr1 + 1):
        d[(i, -1)] = i + 1
    for j in xrange(-1, lenstr2 + 1):
        d[(-1, j)] = j + 1
    for i in xrange(lenstr1):
        for j in xrange(lenstr2):
            if s1[i] == s2[j]:
                cost = 0
            else:
                cost = 1
            d[(i, j)] = min(
                d[(i - 1, j)] + 1,  # deletion
                d[(i, j - 1)] + 1,  # insertion
                d[(i - 1, j - 1)] + cost,  # substitution
            )
            if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
                d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost)  # transposition
    return d[lenstr1 - 1, lenstr2 - 1]
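This is the familiar dictionary-based Damerau-Levenshtein recipe (Python 2, per xrange). A quick sanity check of its behavior:

assert edit_distance("ab", "ba") == 1           # one transposition
assert edit_distance("kitten", "sitting") == 3  # two substitutions + one insertion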
class Error(Exception):
    pass
class InputError(Error):
class InputError(Exception):
    def __init__(self, expr, msg):
        self.expr = expr
        self.msg = msg
...@@ -324,45 +248,6 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
    return (all_preds)
def gen_model(clf, arr, sel_score):
    """
    Fits a classifier to data and a target score
    clf is an input classifier that implements the fit method.
    arr is a data array (X)
    sel_score is the target list (y) where y[n] corresponds to X[n,:]
    sim_fit is not a useful return value; the fit clf is the useful output.
    """
    set_score = numpy.asarray(sel_score, dtype=numpy.int)
    sim_fit = clf.fit(arr, set_score)
    return (sim_fit)
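A minimal usage sketch, assuming a scikit-learn style classifier (only a fit method is required):

import numpy
from sklearn.linear_model import LogisticRegression
X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
y = [0, 0, 1, 1]
clf = gen_model(LogisticRegression(), X, y)  # fitting mutates clf; the return value is the same object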
def gen_preds(clf, arr):
    """
    Generates predictions on a novel data array using a fit classifier
    clf is a classifier that has already been fit
    arr is a data array identical in dimension to the array clf was trained on
    Returns the array of predictions.
    """
    if (hasattr(clf, "predict_proba")):
        # NOTE: both branches currently call predict(); the probability
        # path hinted at below was never wired up.
        ret = clf.predict(arr)
        # pred_score=preds.argmax(1)+min(x._score)
    else:
        ret = clf.predict(arr)
    return ret
def calc_list_average(l):
    """
    Calculates the average value of a list of numbers
    Returns a float
    """
    total = 0.0
    for value in l:
        total += value
    return total / len(l)
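For reference, the built-ins give the same result:

l = [1.0, 2.0, 4.0]
sum(l) / float(len(l))  # 2.333..., identical to calc_list_average(l)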
stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
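The surviving stdev lambda above is the sample standard deviation (n - 1 denominator); statistics.stdev agrees:

import statistics
statistics.stdev([2, 4, 4, 4, 5, 5, 7, 9])  # ~2.138, matching the lambda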
...@@ -464,59 +349,3 @@ def get_wordnet_syns(word):
        synonyms.append(pat.sub(" ", swords.lower()))
    synonyms = make_unique(synonyms)
    return synonyms
def get_separator_words(toks1):
    """
    Finds the words that separate a list of tokens from a background corpus
    Basically this generates a list of informative/interesting words in a set
    toks1 is a list of words
    Returns a list of separator words
    """
    tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
    if (os.path.isfile(ESSAY_COR_TOKENS_PATH)):
        toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
    else:
        essay_corpus = open(ESSAY_CORPUS_PATH).read()
        essay_corpus = sub_chars(essay_corpus)
        toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
        pickle.dump(toks2, open(ESSAY_COR_TOKENS_PATH, 'wb'))
    sep_words = []
    for word in tab_toks1.keys():
        tok1_present = tab_toks1[word]
        if (tok1_present > 2):
            tok1_total = tab_toks1._N
            tok2_present = toks2[word]
            tok2_total = toks2._N
            fish_val = pvalue(tok1_present, tok2_present, tok1_total, tok2_total).two_tail
            if (fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
                sep_words.append(word)
    sep_words = [w for w in sep_words if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]
    return sep_words
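The pvalue call comes from the bundled fisher module; as a hedged sketch of the same two-tailed test with SciPy (the argument mapping here is an assumption, not the bundled module's signature):

from scipy.stats import fisher_exact
# e.g. a word seen 5 times in 1,000 set tokens vs 10 times in 100,000 corpus tokens
_, p = fisher_exact([[5, 1000 - 5], [10, 100000 - 10]])
keep = p < .001 and 5 / 1000.0 > 2 * (10 / 100000.0)  # the selection rule above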
def encode_plus(s):
    """
    Literally encodes the plus sign
    input is a string
    returns the string with plus signs encoded
    """
    regex = r"\+"
    pat = re.compile(regex)
    return pat.sub("%2B", s)
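For reference, the Python 3 standard library makes the same substitution:

from urllib.parse import quote
quote("a+b")  # 'a%2Bb', exactly what encode_plus produced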
def getMedian(numericValues):
    """
    Gets the median of a list of values
    Returns a float/int
    """
    theValues = sorted(numericValues)
    if len(theValues) % 2 == 1:
        return theValues[(len(theValues) + 1) / 2 - 1]
    else:
        lower = theValues[len(theValues) / 2 - 1]
        upper = theValues[len(theValues) / 2]
        return (float(lower + upper)) / 2
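Note the Python 2 integer division in the indexing; on Python 3, statistics.median covers both branches directly:

import statistics
statistics.median([1, 3, 2, 4])  # 2.5
statistics.median([1, 3, 2])     # 2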