Commit af059ef3 by Steven Bird

fix merge conflict

parents e7e3112c e7ba3080
@@ -15,7 +15,6 @@
 from __future__ import division
 from collections import defaultdict
 from nltk.align import AlignedSent
-from nltk.corpus import comtrans
 class IBMModel1(object):
     """
@@ -28,6 +27,7 @@ class IBMModel1(object):
     Step 2 - Estimate the probability of translation according to the
     evidence from Step 1.
+    >>> from nltk.corpus import comtrans
     >>> bitexts = comtrans.aligned_sents()[:100]
     >>> ibm = IBMModel1(bitexts, 20)
...
@@ -9,7 +9,6 @@
 from __future__ import division
 from collections import defaultdict
 from nltk.align import AlignedSent
-from nltk.corpus import comtrans
 from nltk.align.ibm1 import IBMModel1
 class IBMModel2(object):
@@ -26,6 +25,7 @@ class IBMModel2(object):
     Step 3 - Estimate the probability of translation and alignment according
     to the evidence from Step 2.
+    >>> from nltk.corpus import comtrans
     >>> bitexts = comtrans.aligned_sents()[:100]
     >>> ibm = IBMModel2(bitexts, 5)
     >>> aligned_sent = ibm.align(bitexts[0])
...
@@ -95,3 +95,4 @@ from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding,
     TypedMaxentFeatureEncoding,
     ConditionalExponentialClassifier)
 from nltk.classify.senna import Senna
+from nltk.classify.textcat import TextCat
@@ -14,7 +14,7 @@ from nltk.internals import find_binary
 try:
     import numpy
 except ImportError:
-    numpy = None
+    pass
 _tadm_bin = None
 def config_tadm(bin=None):
...
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for language identification using the TextCat algorithm.
It is an implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".
The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and the text to
be identified, then compares them using a distance measure.
Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.
For details regarding the algorithm, see:
http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
For details about An Crubadan, see:
http://borel.slu.edu/crubadan/index.html
"""
# Ensure that literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals
from nltk.compat import PY3
from nltk.util import trigrams
if PY3:
from sys import maxsize
else:
from sys import maxint
# Note: this is NOT the "re" module you're likely used to. The regex
# module is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex".
try:
import regex as re
except ImportError:
re = None
######################################################################
## Language identification using TextCat
######################################################################
class TextCat(object):
_corpus = None
fingerprints = {}
_START_CHAR = "<"
_END_CHAR = ">"
last_distances = {}
def __init__(self):
if not re:
raise EnvironmentError("classify.textcat requires the regex module that "
"supports unicode. Try '$ pip install regex' and "
"see https://pypi.python.org/pypi/regex for "
"further details.")
from nltk.corpus import crubadan
self._corpus = crubadan
# Load all language ngrams into cache
for lang in self._corpus.langs():
self._corpus.lang_freq(lang)
def remove_punctuation(self, text):
''' Get rid of punctuation except apostrophes '''
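# \P{P} matches any character that is NOT punctuation, so the negated
# class [^\P{P}\'] matches punctuation other than the apostrophe;
# runs of such characters are stripped from the text.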
return re.sub(r"[^\P{P}\']+", "", text)
def profile(self, text):
''' Create FreqDist of trigrams within text '''
from nltk import word_tokenize, FreqDist
clean_text = self.remove_punctuation(text)
tokens = word_tokenize(clean_text)
fingerprint = FreqDist()
for t in tokens:
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
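# e.g. the token "the" is padded to "<the>" and yields the
# trigrams '<th', 'the', 'he>'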
for cur_trigram in token_trigrams:
if cur_trigram in fingerprint:
fingerprint[cur_trigram] += 1
else:
fingerprint[cur_trigram] = 1
return fingerprint
def calc_dist(self, lang, trigram, text_profile):
''' Calculate the "out-of-place" measure between the
text and language profile for a single trigram '''
lang_fd = self._corpus.lang_freq(lang)
dist = 0
if trigram in lang_fd:
idx_lang_profile = list(lang_fd.keys()).index(trigram)
idx_text = list(text_profile.keys()).index(trigram)
#print(idx_lang_profile, ", ", idx_text)
dist = abs(idx_lang_profile - idx_text)
else:
# Arbitrary but should be larger than
# any possible trigram file length
# in terms of total lines
if PY3:
dist = maxsize
else:
dist = maxint
return dist
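# Illustrative example of the out-of-place measure (hypothetical ranks):
# a trigram ranked 2nd in the language profile and 5th in the text profile
# contributes abs(2 - 5) = 3; trigrams missing from the language profile
# contribute the maximum possible distance instead.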
def lang_dists(self, text):
''' Calculate the "out-of-place" measure between
the text and all languages '''
distances = {}
profile = self.profile(text)
# For all the languages
for lang in self._corpus._all_lang_freq.keys():
# Calculate distance metric for every trigram in
# input text to be identified
lang_dist = 0
for trigram in profile:
lang_dist += self.calc_dist(lang, trigram, profile)
distances[lang] = lang_dist
return distances
def guess_language(self, text):
''' Find the language with the min distance
to the text and return its ISO 639-3 code '''
self.last_distances = self.lang_dists(text)
return min(self.last_distances, key=self.last_distances.get)
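# Illustrative usage (assumes the 'crubadan' corpus has been installed via
# nltk.download('crubadan'); the exact code returned depends on the
# installed language profiles):
#
#     tc = TextCat()
#     tc.guess_language('Guten Morgen, wie geht es dir?')  # e.g. 'deu'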
#################################################
def demo():
from nltk.corpus import udhr
langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
'Serbian_Srpski-UTF8','Esperanto-UTF8']
friendly = {'kmr':'Northern Kurdish',
'abk':'Abkhazian',
'pes':'Iranian Persian',
'hin':'Hindi',
'haw':'Hawaiian',
'rus':'Russian',
'vie':'Vietnamese',
'srp':'Serbian',
'epo':'Esperanto'}
tc = TextCat()
for cur_lang in langs:
# Get raw data from UDHR corpus
raw_sentences = udhr.sents(cur_lang)
rows = len(raw_sentences) - 1
cols = list(map(len, raw_sentences))
sample = ''
# Generate a sample text of the language
for i in range(0, rows):
cur_sent = ''
for j in range(0, cols[i]):
cur_sent += ' ' + raw_sentences[i][j]
sample += cur_sent
# Try to detect what it is
print('Language snippet: ' + sample[0:140] + '...')
guess = tc.guess_language(sample)
print('Language detection: %s (%s)' % (guess, friendly[guess]))
print('#' * 140)
if __name__ == '__main__':
demo()
@@ -95,6 +95,8 @@ conll2007 = LazyCorpusLoader(
     'conll2007', DependencyCorpusReader, '.*\.(test|train).*', encoding=[
         ('eus', 'ISO-8859-2'),
         ('esp', 'utf8')])
+crubadan = LazyCorpusLoader(
+    'crubadan', CrubadanCorpusReader, '.*\.txt')
 dependency_treebank = LazyCorpusLoader(
     'dependency_treebank', DependencyCorpusReader, '.*\.dp',
     encoding='ascii')
...
@@ -94,6 +94,7 @@ from nltk.corpus.reader.udhr import *
 from nltk.corpus.reader.bnc import *
 from nltk.corpus.reader.sentiwordnet import *
 from nltk.corpus.reader.nkjp import *
+from nltk.corpus.reader.crubadan import *
 # Make sure that nltk.corpus.reader.bracket_parse gives the module, not
 # the function bracket_parse() defined in nltk.tree:
@@ -129,5 +130,5 @@ __all__ = [
     'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
     'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
     'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
-    'NKJPCorpusReader'
+    'NKJPCorpusReader', 'CrubadanCorpusReader'
 ]
# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Crubadan N-grams Reader
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for the n-gram statistics gathered from
the per-language corpora of the An Crubadan project.
There are multiple potential applications for the data, but
this reader was created with language identification in mind.
For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html
"""
from __future__ import print_function, unicode_literals
import re
from nltk.compat import PY3
from os import path
from nltk.corpus.reader import CorpusReader
from nltk.probability import FreqDist
from nltk.data import ZipFilePathPointer
class CrubadanCorpusReader(CorpusReader):
"""
A corpus reader used to access the An Crubadan per-language n-gram files.
"""
_LANG_MAPPER_FILE = 'table.txt'
_all_lang_freq = {}
def __init__(self, root, fileids, encoding='utf8', tagset=None):
super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8')
self._lang_mapping_data = []
self._load_lang_mapping_data()
def lang_freq(self, lang):
''' Return n-gram FreqDist for a specific language
given ISO 639-3 language code '''
if lang not in self._all_lang_freq:
self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
return self._all_lang_freq[lang]
def langs(self):
''' Return a list of supported languages as ISO 639-3 codes '''
return [row[1] for row in self._lang_mapping_data]
def iso_to_crubadan(self, lang):
''' Return internal Crubadan code based on ISO 639-3 code '''
for i in self._lang_mapping_data:
if i[1].lower() == lang.lower():
return i[0]
def crubadan_to_iso(self, lang):
''' Return ISO 639-3 code given internal Crubadan code '''
for i in self._lang_mapping_data:
if i[0].lower() == lang.lower():
return i[1]
def _load_lang_mapping_data(self):
''' Load language mappings between codes and description from table.txt '''
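# table.txt is expected to be tab-separated, with the internal Crubadan
# code in the first column and the ISO 639-3 code in the second
# (e.g. a row mapping 'en' to 'eng').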
if isinstance(self.root, ZipFilePathPointer):
raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")
mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
if self._LANG_MAPPER_FILE not in self.fileids():
raise RuntimeError("Could not find language mapper file: " + mapper_file)
if PY3:
raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
else:
raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
def _load_lang_ngrams(self, lang):
''' Load single n-gram language file given the ISO 639-3 language code
and return its FreqDist '''
crubadan_code = self.iso_to_crubadan(lang)
ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')
if not path.isfile(ngram_file):
raise RuntimeError("Could not find language n-gram file for " + lang)
counts = FreqDist()
if PY3:
f = open(ngram_file, 'r', encoding='utf-8')
else:
f = open(ngram_file, 'rU')
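# Each line of a Crubadan 3-gram file is expected to hold a count and the
# n-gram itself, separated by a space (e.g. "728135 the" for English).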
for line in f:
if PY3:
data = line.split(' ')
else:
data = line.decode('utf8').split(' ')
ngram = data[1].strip('\n')
freq = int(data[0])
counts[ngram] = freq
return counts
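# Illustrative usage (values taken from the accompanying doctests; assumes
# the 'crubadan' corpus has been installed via nltk.download('crubadan')):
#
#     from nltk.corpus import crubadan
#     crubadan.iso_to_crubadan('eng')    # -> 'en'
#     crubadan.lang_freq('eng')['the']   # -> 728135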
@@ -15,7 +15,6 @@ from functools import reduce
 import subprocess
 from nltk.data import ZipFilePathPointer
-from nltk.tag import RegexpTagger
 from nltk.tokenize import word_tokenize
 from nltk.internals import find_binary
@@ -43,6 +42,7 @@ class MaltParser(ParserI):
         if tagger is not None:
             self.tagger = tagger
         else:
+            from nltk.tag import RegexpTagger
             self.tagger = RegexpTagger(
                 [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
                  (r'(The|the|A|a|An|an)$', 'AT'),  # articles
...
@@ -14,7 +14,6 @@ import logging
 from nltk.compat import xrange
 from nltk.parse.dependencygraph import DependencyGraph
-from nltk.classify import NaiveBayesClassifier
 logger = logging.getLogger(__name__)
@@ -111,6 +110,8 @@ class NaiveBayesDependencyScorer(DependencyScorerI):
         :param graphs: A list of dependency graphs to train the scorer.
         """
+        from nltk.classify import NaiveBayesClassifier
         # Create labeled training examples
         labeled_examples = []
         for graph in graphs:
...
@@ -16,8 +16,8 @@ from os import remove
 from copy import deepcopy
 from operator import itemgetter
 try:
-    from scipy import sparse
     from numpy import array
+    from scipy import sparse
     from sklearn.datasets import load_svmlight_file
     from sklearn import svm
 except ImportError:
...
.. Copyright (C) 2001-2015 NLTK Project
.. For license information, see LICENSE.TXT
Crubadan Corpus Reader
======================
Crubadan is an NLTK corpus reader for ngram files provided
by the Crubadan project. It supports several languages.
>>> from nltk.corpus import crubadan
>>> crubadan.langs()
----------------------------------------
Language code mapping and helper methods
----------------------------------------
The web crawler that generated the 3-gram frequencies works at the
level of "writing systems" rather than languages. These are assigned
internal 2-3 letter "writing system codes" that require mapping to the
standard ISO 639-3 codes.
For details, please refer to the README in the nltk_data/crubadan
folder after installing it.
To translate ISO 639-3 codes to "Crubadan Code":
>>> crubadan.iso_to_crubadan('eng')
'en'
>>> crubadan.iso_to_crubadan('fra')
'fr'
>>> crubadan.iso_to_crubadan('aaa')
None
In reverse, print ISO 639-3 code if we have the Crubadan Code:
>>> crubadan.crubadan_to_iso('en')
'eng'
>>> crubadan.crubadan_to_iso('fr')
'fra'
>>> crubadan.crubadan_to_iso('aa')
None
---------------------------
Accessing ngram frequencies
---------------------------
On initialization, the reader will create a dictionary of every
language supported by the Crubadan project, mapping the ISO 639-3
language code to its corresponding ngram frequency.
The end result is a dictionary of FreqDists representing all the
languages that can be accessed via "all_lang_freq":
>>> crubadan.all_lang_freq # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
{'roh': FreqDist({'<da': 33783, ...})}
You can access an individual language's FreqDist and the ngrams within it as follows.
Say you're interested in the frequency of the ngram 'the' in English:
>>> english_fd = crubadan.all_lang_freq['eng']
>>> english_fd['the']
728135
Alternatively, you may use the "lang_freq" method to retrieve the FreqDist
directly; simply pass the ISO 639-3 code of the language you're interested in:
>>> crubadan.lang_freq('eng') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
FreqDist({'<th': 954187, 'the': 728135, 'he>': 655273, 'ing': 360758, 'nd>': 359481, 'ng>':
351373, '<an': 345633, '<to': 333091, 'ed>': 329517, '<of': 316431, ...})
A companion convenience method to the above is "ngram_freq", which can be
used to retrieve a specific ngram frequency more explicitly. As usual,
pass in the language of interest by its ISO 639-3 code, along with the ngram
of interest. Using the example above to get the frequency of 'the' in English:
>>> crubadan.ngram_freq('eng', 'the')
728135
An ngram that isn't found within the language will return 0:
>>> crubadan.ngram_freq('eng', 'test')
0
A language that isn't supported will raise an exception:
>>> crubadan.ngram_freq('elvish', 'test')
Traceback (most recent call last):
...
CrubadanError: Unsupported language [elvish]