Commit e7ba3080 by Steven Bird

stylistic fixes, cleaned imports, fixed cyclic imports

parent 92e5fca0
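
The common thread in the hunks below is deferring imports from module load time to the point of use (inside a method, a doctest, or a try/except guard), which is the usual way to break a cycle where two modules import each other at the top level. A minimal sketch of the pattern, using hypothetical module and function names rather than the NLTK modules touched in this commit:

    # pkg/a.py  (hypothetical)
    def analyze(text):
        # Importing at call time instead of at module load time breaks the
        # cycle: pkg.b is only loaded once analyze() actually runs.
        from pkg.b import tokenize
        return tokenize(text)

    # pkg/b.py  (hypothetical)
    from pkg.a import analyze   # safe now: pkg/a.py no longer imports pkg.b at the top

    def tokenize(text):
        return text.split()
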
@@ -15,7 +15,6 @@
 from __future__ import division
 from collections import defaultdict
 from nltk.align import AlignedSent
-from nltk.corpus import comtrans

 class IBMModel1(object):
     """
@@ -28,6 +27,7 @@ class IBMModel1(object):
     Step 2 - Estimate the probability of translation according to the
     evidence from Step 1.

+    >>> from nltk.corpus import comtrans
     >>> bitexts = comtrans.aligned_sents()[:100]
     >>> ibm = IBMModel1(bitexts, 20)
...
@@ -9,7 +9,6 @@
 from __future__ import division
 from collections import defaultdict
 from nltk.align import AlignedSent
-from nltk.corpus import comtrans
 from nltk.align.ibm1 import IBMModel1

 class IBMModel2(object):
@@ -26,6 +25,7 @@ class IBMModel2(object):
     Step 3 - Estimate the probability of translation and alignment according
     to the evidence from Step 2.

+    >>> from nltk.corpus import comtrans
     >>> bitexts = comtrans.aligned_sents()[:100]
     >>> ibm = IBMModel2(bitexts, 5)
     >>> aligned_sent = ibm.align(bitexts[0])
...
@@ -14,7 +14,7 @@ from nltk.internals import find_binary
 try:
     import numpy
 except ImportError:
-    numpy = None
+    pass

 _tadm_bin = None
 def config_tadm(bin=None):
...
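
In the `config_tadm` hunk above, the `except ImportError` branch now does nothing instead of binding `numpy = None`. Either way the module imports cleanly when numpy is absent; the difference is only how downstream code notices the missing dependency. A sketch of the two idioms (illustrative only, not the module's actual checks):

    # Variant 1: bind a sentinel, so callers can test for it explicitly.
    try:
        import numpy
    except ImportError:
        numpy = None        # later: if numpy is None -> raise/skip with a clear message

    # Variant 2 (as in this commit): leave the name unbound, so the first
    # attempt to use numpy raises NameError at the point of use.
    try:
        import numpy
    except ImportError:
        pass
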
@@ -31,14 +31,10 @@ http://borel.slu.edu/crubadan/index.html
 # Ensure that literal strings default to unicode rather than str.
 from __future__ import print_function, unicode_literals

-import nltk
-import nltk.compat
-from nltk.corpus import CrubadanCorpusReader
+from nltk.compat import PY3
 from nltk.util import trigrams
-from nltk.tokenize import word_tokenize
-from nltk.probability import FreqDist

-if nltk.compat.PY3:
+if PY3:
     from sys import maxsize
 else:
     from sys import maxint
@@ -71,7 +67,8 @@ class TextCat(object):
                              "see https://pypi.python.org/pypi/regex for "
                              "further details.")

-        self._corpus = CrubadanCorpusReader(nltk.data.find('corpora/crubadan'), '.*\.txt')
+        from nltk.corpus import crubadan
+        self._corpus = crubadan
         # Load all language ngrams into cache
         for lang in self._corpus.langs():
             self._corpus.lang_freq(lang)
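
The hunk above swaps a hand-constructed `CrubadanCorpusReader` for the reader exposed by `nltk.corpus`, which is wrapped in NLTK's lazy corpus loader, and imports it inside `__init__` so `nltk.classify` no longer pulls in `nltk.corpus` at module load time. Roughly how that reads at the call site (a sketch; it assumes the crubadan corpus data is installed under `nltk_data`, as the old `'corpora/crubadan'` path suggests):

    from nltk.corpus import crubadan       # cheap: a lazy-loader proxy, nothing is read yet

    langs = crubadan.langs()               # first real access loads the corpus from disk
    counts = crubadan.lang_freq('eng')     # trigram counts for one language ('eng' assumed to be a valid code)
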
@@ -82,6 +79,8 @@ class TextCat(object):
     def profile(self, text):
         ''' Create FreqDist of trigrams within text '''
+        from nltk import word_tokenize, FreqDist
+
         clean_text = self.remove_punctuation(text)
         tokens = word_tokenize(clean_text)
@@ -115,7 +114,7 @@ class TextCat(object):
         # Arbitrary but should be larger than
         # any possible trigram file length
         # in terms of total lines
-        if nltk.compat.PY3:
+        if PY3:
             dist = maxsize
         else:
             dist = maxint
@@ -148,41 +147,47 @@ class TextCat(object):
         return min(self.last_distances, key=self.last_distances.get)
         #################################################')

-    def demo(self):
+def demo():
     from nltk.corpus import udhr

     langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
              'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
              'Serbian_Srpski-UTF8','Esperanto-UTF8']

     friendly = {'kmr':'Northern Kurdish',
                 'abk':'Abkhazian',
                 'pes':'Iranian Persian',
                 'hin':'Hindi',
                 'haw':'Hawaiian',
                 'rus':'Russian',
                 'vie':'Vietnamese',
                 'srp':'Serbian',
                 'epo':'Esperanto'}

+    tc = TextCat()
+
     for cur_lang in langs:
         # Get raw data from UDHR corpus
         raw_sentences = udhr.sents(cur_lang)
         rows = len(raw_sentences) - 1
         cols = list(map(len, raw_sentences))

         sample = ''

         # Generate a sample text of the language
         for i in range(0, rows):
             cur_sent = ''
             for j in range(0, cols[i]):
                 cur_sent += ' ' + raw_sentences[i][j]
             sample += cur_sent

         # Try to detect what it is
         print('Language snippet: ' + sample[0:140] + '...')
-        guess = self.guess_language(sample)
+        guess = tc.guess_language(sample)
         print('Language detection: %s (%s)' % (guess, friendly[guess]))
         print('#' * 140)
+
+if __name__ == '__main__':
+    demo()
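
With this change `demo()` is a module-level function that builds its own `TextCat` instance and can be run directly via the new `__main__` guard. A rough usage sketch after the change (assuming the module lives at `nltk.classify.textcat`, with the crubadan and udhr corpora installed; per the error text above, the constructor may also require the third-party `regex` package):

    from nltk.classify.textcat import TextCat, demo

    tc = TextCat()
    guess = tc.guess_language('Dies ist ein kurzer deutscher Beispieltext.')
    print(guess)   # a Crubadan language code, keyed like the 'friendly' mapping above

    demo()         # runs the UDHR-based demonstration shown in the diff
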
@@ -22,7 +22,7 @@ http://borel.slu.edu/crubadan/index.html
 from __future__ import print_function, unicode_literals

 import re
-import nltk.compat
+from nltk.compat import PY3
 from os import path
 from nltk.corpus.reader import CorpusReader
 from nltk.probability import FreqDist
@@ -75,7 +75,7 @@ class CrubadanCorpusReader(CorpusReader):
         if self._LANG_MAPPER_FILE not in self.fileids():
             raise RuntimeError("Could not find language mapper file: " + mapper_file)

-        if nltk.compat.PY3:
+        if PY3:
             raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
         else:
             raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
@@ -93,13 +93,13 @@ class CrubadanCorpusReader(CorpusReader):
             raise Runtime("Could not find language n-gram file for " + lang)

         counts = FreqDist()
-        if nltk.compat.PY3:
+        if PY3:
             f = open(ngram_file, 'r', encoding='utf-8')
         else:
             f = open(ngram_file, 'rU')

         for line in f:
-            if nltk.compat.PY3:
+            if PY3:
                 data = line.split(' ')
             else:
                 data = line.decode('utf8').split(' ')
...
@@ -15,7 +15,6 @@ from functools import reduce
 import subprocess

 from nltk.data import ZipFilePathPointer
-from nltk.tag import RegexpTagger
 from nltk.tokenize import word_tokenize
 from nltk.internals import find_binary
@@ -43,6 +42,7 @@ class MaltParser(ParserI):
         if tagger is not None:
             self.tagger = tagger
         else:
+            from nltk.tag import RegexpTagger
             self.tagger = RegexpTagger(
                 [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
                  (r'(The|the|A|a|An|an)$', 'AT'),   # articles
...
@@ -14,7 +14,6 @@ import logging
 from nltk.compat import xrange
 from nltk.parse.dependencygraph import DependencyGraph
-from nltk.classify import NaiveBayesClassifier

 logger = logging.getLogger(__name__)
@@ -111,6 +110,8 @@ class NaiveBayesDependencyScorer(DependencyScorerI):
         :param graphs: A list of dependency graphs to train the scorer.
         """
+        from nltk.classify import NaiveBayesClassifier
+
         # Create training labeled training examples
         labeled_examples = []
         for graph in graphs:
...
@@ -12,10 +12,14 @@ import pickle
 from os import remove
 from copy import deepcopy
 from operator import itemgetter
-from scipy import sparse
-from numpy import array
-from sklearn.datasets import load_svmlight_file
-from sklearn import svm
+try:
+    from numpy import array
+    from scipy import sparse
+    from sklearn.datasets import load_svmlight_file
+    from sklearn import svm
+except ImportError:
+    pass

 from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator
...
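
The last hunk wraps the numpy/scipy/scikit-learn imports in a `try/except ImportError`, so importing the parser module no longer fails outright when those optional libraries are missing; code paths that need them fail only when exercised. A common way to make that failure explicit, sketched with hypothetical names (this is not the module's actual API):

    try:
        from sklearn import svm
    except ImportError:
        svm = None

    def train_svm(features, labels):
        # Raise a clear error only when the optional dependency is actually needed.
        if svm is None:
            raise ImportError('scikit-learn (plus numpy and scipy) is required for training')
        return svm.SVC(kernel='linear').fit(features, labels)
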