Commit 9ae1a8a3 by Will Daly

More regex patterns, add a stemmer

parent 4b77c5c0
"""
"""
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import FeatureUnion
......@@ -23,6 +24,8 @@ WORD_PATTERNS = [
(r'.*ic$', 'JJ'),
(r'.*est$', 'JJ'),
(r'^a$', 'PREP'),
(r'.*s$', 'NNS'),
(r'.*', 'NN')
]
......@@ -34,6 +37,11 @@ def tokenizer(text):
]
def stemmer(text):
    """
    Tokenize `text` into words and reduce each token to its Porter stem.

    Returns a list with one stemmed string per token, in the order
    `nltk.word_tokenize` produced them.
    """
    # A fresh Porter stemmer is built for every call, as before.
    porter = nltk.PorterStemmer()
    words = nltk.word_tokenize(text)
    return list(map(porter.stem, words))
class ClassyAlgorithm(AIAlgorithm):
"""
A super-classy text classification algorithm :)
......@@ -54,11 +62,15 @@ class ClassyAlgorithm(AIAlgorithm):
"""
pipeline = FeatureUnion([
('tfid', TfidfVectorizer(min_df=1, ngram_range=(1, 2), stop_words='english')),
('tfid', TfidfVectorizer(
tokenizer=stemmer,
min_df=1,
ngram_range=(1, 2),
stop_words='english'
)),
('pos', CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 2)))
])
transformed = pipeline.fit_transform([example.text for example in examples])
scores = [example.score for example in examples]
classifier = SVC()
classifier.fit(transformed, scores)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment