Commit af059ef3 by Steven Bird

fix merge conflict

parents e7e3112c e7ba3080
@@ -15,7 +15,6 @@
 from __future__ import division
 from collections import defaultdict
 from nltk.align import AlignedSent
-from nltk.corpus import comtrans
 class IBMModel1(object):
     """
@@ -28,6 +27,7 @@ class IBMModel1(object):
     Step 2 - Estimate the probability of translation according to the
     evidence from Step 1.
+    >>> from nltk.corpus import comtrans
     >>> bitexts = comtrans.aligned_sents()[:100]
     >>> ibm = IBMModel1(bitexts, 20)
...
@@ -9,7 +9,6 @@
 from __future__ import division
 from collections import defaultdict
 from nltk.align import AlignedSent
-from nltk.corpus import comtrans
 from nltk.align.ibm1 import IBMModel1
 class IBMModel2(object):
@@ -26,6 +25,7 @@ class IBMModel2(object):
     Step 3 - Estimate the probability of translation and alignment according
     to the evidence from Step 2.
+    >>> from nltk.corpus import comtrans
     >>> bitexts = comtrans.aligned_sents()[:100]
     >>> ibm = IBMModel2(bitexts, 5)
     >>> aligned_sent = ibm.align(bitexts[0])
...
@@ -95,3 +95,4 @@ from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding,
     TypedMaxentFeatureEncoding,
     ConditionalExponentialClassifier)
 from nltk.classify.senna import Senna
+from nltk.classify.textcat import TextCat
@@ -14,7 +14,7 @@ from nltk.internals import find_binary
 try:
     import numpy
 except ImportError:
-    numpy = None
+    pass
 _tadm_bin = None
 def config_tadm(bin=None):
...
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for language identification using the TextCat algorithm.
It is an implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".
The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and the text to
be identified, then compares them using a distance measure.
Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.
For details regarding the algorithm, see:
http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
For details about An Crubadan, see:
http://borel.slu.edu/crubadan/index.html
"""
# Ensure that literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals
from nltk.compat import PY3
from nltk.util import trigrams
if PY3:
from sys import maxsize
else:
from sys import maxint
# Note: this is NOT the "re" module you're likely used to. The regex
# module is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex".
try:
import regex as re
except ImportError:
re = None
######################################################################
## Language identification using TextCat
######################################################################
class TextCat(object):
_corpus = None
fingerprints = {}
_START_CHAR = "<"
_END_CHAR = ">"
last_distances = {}
def __init__(self):
if not re:
raise EnvironmentError("classify.textcat requires the regex module that "
"supports unicode. Try '$ pip install regex' and "
"see https://pypi.python.org/pypi/regex for "
"further details.")
from nltk.corpus import crubadan
self._corpus = crubadan
# Load all language ngrams into cache
for lang in self._corpus.langs():
self._corpus.lang_freq(lang)
def remove_punctuation(self, text):
''' Get rid of punctuation except apostrophes '''
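# \P{P} matches any character that is NOT punctuation, so the negated
# class [^\P{P}\'] matches punctuation other than the apostrophe;
# runs of such characters are stripped from the text.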
return re.sub(r"[^\P{P}\']+", "", text)
def profile(self, text):
''' Create FreqDist of trigrams within text '''
from nltk import word_tokenize, FreqDist
clean_text = self.remove_punctuation(text)
tokens = word_tokenize(clean_text)
fingerprint = FreqDist()
for t in tokens:
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
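# e.g. the token "the" is padded to "<the>" and yields the
# trigrams '<th', 'the', 'he>'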
for cur_trigram in token_trigrams:
if cur_trigram in fingerprint:
fingerprint[cur_trigram] += 1
else:
fingerprint[cur_trigram] = 1
return fingerprint
def calc_dist(self, lang, trigram, text_profile):
''' Calculate the "out-of-place" measure between the
text and language profile for a single trigram '''
lang_fd = self._corpus.lang_freq(lang)
dist = 0
if trigram in lang_fd:
idx_lang_profile = list(lang_fd.keys()).index(trigram)
idx_text = list(text_profile.keys()).index(trigram)
#print(idx_lang_profile, ", ", idx_text)
dist = abs(idx_lang_profile - idx_text)
else:
# Arbitrary but should be larger than
# any possible trigram file length
# in terms of total lines
if PY3:
dist = maxsize
else:
dist = maxint
return dist
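# Illustrative example of the out-of-place measure (hypothetical ranks):
# a trigram ranked 2nd in the language profile and 5th in the text profile
# contributes abs(2 - 5) = 3; trigrams missing from the language profile
# contribute the maximum possible distance instead.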
def lang_dists(self, text):
''' Calculate the "out-of-place" measure between
the text and all languages '''
distances = {}
profile = self.profile(text)
# For all the languages
for lang in self._corpus._all_lang_freq.keys():
# Calculate distance metric for every trigram in
# input text to be identified
lang_dist = 0
for trigram in profile:
lang_dist += self.calc_dist(lang, trigram, profile)
distances[lang] = lang_dist
return distances
def guess_language(self, text):
''' Find the language with the min distance
to the text and return its ISO 639-3 code '''
self.last_distances = self.lang_dists(text)
return min(self.last_distances, key=self.last_distances.get)
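# Illustrative usage (assumes the 'crubadan' corpus has been installed via
# nltk.download('crubadan'); the exact code returned depends on the
# installed language profiles):
#
#     tc = TextCat()
#     tc.guess_language('Guten Morgen, wie geht es dir?')  # e.g. 'deu'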
#################################################
def demo():
from nltk.corpus import udhr
langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
'Serbian_Srpski-UTF8','Esperanto-UTF8']
friendly = {'kmr':'Northern Kurdish',
'abk':'Abkhazian',
'pes':'Iranian Persian',
'hin':'Hindi',
'haw':'Hawaiian',
'rus':'Russian',
'vie':'Vietnamese',
'srp':'Serbian',
'epo':'Esperanto'}
tc = TextCat()
for cur_lang in langs:
# Get raw data from UDHR corpus
raw_sentences = udhr.sents(cur_lang)
rows = len(raw_sentences) - 1
cols = list(map(len, raw_sentences))
sample = ''
# Generate a sample text of the language
for i in range(0, rows):
cur_sent = ''
for j in range(0, cols[i]):
cur_sent += ' ' + raw_sentences[i][j]
sample += cur_sent
# Try to detect what it is
print('Language snippet: ' + sample[0:140] + '...')
guess = tc.guess_language(sample)
print('Language detection: %s (%s)' % (guess, friendly[guess]))
print('#' * 140)
if __name__ == '__main__':
demo()
@@ -95,6 +95,8 @@ conll2007 = LazyCorpusLoader(
     'conll2007', DependencyCorpusReader, '.*\.(test|train).*', encoding=[
         ('eus', 'ISO-8859-2'),
         ('esp', 'utf8')])
+crubadan = LazyCorpusLoader(
+    'crubadan', CrubadanCorpusReader, '.*\.txt')
 dependency_treebank = LazyCorpusLoader(
     'dependency_treebank', DependencyCorpusReader, '.*\.dp',
     encoding='ascii')
...
@@ -94,6 +94,7 @@ from nltk.corpus.reader.udhr import *
 from nltk.corpus.reader.bnc import *
 from nltk.corpus.reader.sentiwordnet import *
 from nltk.corpus.reader.nkjp import *
+from nltk.corpus.reader.crubadan import *
 # Make sure that nltk.corpus.reader.bracket_parse gives the module, not
 # the function bracket_parse() defined in nltk.tree:
@@ -129,5 +130,5 @@ __all__ = [
     'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
     'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
     'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
-    'NKJPCorpusReader'
+    'NKJPCorpusReader', 'CrubadanCorpusReader'
 ]
# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Crubadan N-grams Reader
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for the n-gram statistics gathered from
the per-language corpora of the An Crubadan project.
There are multiple potential applications for the data, but
this reader was created with language identification in mind.
For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html
"""
from __future__ import print_function, unicode_literals
import re
from nltk.compat import PY3
from os import path
from nltk.corpus.reader import CorpusReader
from nltk.probability import FreqDist
from nltk.data import ZipFilePathPointer
class CrubadanCorpusReader(CorpusReader):
"""
A corpus reader used to access the An Crubadan per-language n-gram files.
"""
_LANG_MAPPER_FILE = 'table.txt'
_all_lang_freq = {}
def __init__(self, root, fileids, encoding='utf8', tagset=None):
super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8')
self._lang_mapping_data = []
self._load_lang_mapping_data()
def lang_freq(self, lang):
''' Return n-gram FreqDist for a specific language
given ISO 639-3 language code '''
if lang not in self._all_lang_freq:
self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
return self._all_lang_freq[lang]
def langs(self):
''' Return a list of supported languages as ISO 639-3 codes '''
return [row[1] for row in self._lang_mapping_data]
def iso_to_crubadan(self, lang):
''' Return internal Crubadan code based on ISO 639-3 code '''
for i in self._lang_mapping_data:
if i[1].lower() == lang.lower():
return i[0]
def crubadan_to_iso(self, lang):
''' Return ISO 639-3 code given internal Crubadan code '''
for i in self._lang_mapping_data:
if i[0].lower() == lang.lower():
return i[1]
def _load_lang_mapping_data(self):
''' Load language mappings between codes and description from table.txt '''
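# table.txt is expected to be tab-separated, with the internal Crubadan
# code in the first column and the ISO 639-3 code in the second
# (e.g. a row mapping 'en' to 'eng').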
if isinstance(self.root, ZipFilePathPointer):
raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")
mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
if self._LANG_MAPPER_FILE not in self.fileids():
raise RuntimeError("Could not find language mapper file: " + mapper_file)
if PY3:
raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
else:
raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
def _load_lang_ngrams(self, lang):
''' Load single n-gram language file given the ISO 639-3 language code
and return its FreqDist '''
crubadan_code = self.iso_to_crubadan(lang)
ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')
if not path.isfile(ngram_file):
raise RuntimeError("Could not find language n-gram file for " + lang)
counts = FreqDist()
if PY3:
f = open(ngram_file, 'r', encoding='utf-8')
else:
f = open(ngram_file, 'rU')
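# Each line of a Crubadan 3-gram file is expected to hold a count and the
# n-gram itself, separated by a space (e.g. "728135 the" for English).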
for line in f:
if PY3:
data = line.split(' ')
else:
data = line.decode('utf8').split(' ')
ngram = data[1].strip('\n')
freq = int(data[0])
counts[ngram] = freq
return counts
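# Illustrative usage (values taken from the accompanying doctests; assumes
# the 'crubadan' corpus has been installed via nltk.download('crubadan')):
#
#     from nltk.corpus import crubadan
#     crubadan.iso_to_crubadan('eng')    # -> 'en'
#     crubadan.lang_freq('eng')['the']   # -> 728135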
@@ -15,7 +15,6 @@ from functools import reduce
 import subprocess
 from nltk.data import ZipFilePathPointer
-from nltk.tag import RegexpTagger
 from nltk.tokenize import word_tokenize
 from nltk.internals import find_binary
@@ -43,6 +42,7 @@ class MaltParser(ParserI):
         if tagger is not None:
             self.tagger = tagger
         else:
+            from nltk.tag import RegexpTagger
             self.tagger = RegexpTagger(
                 [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
                  (r'(The|the|A|a|An|an)$', 'AT'),  # articles
...
@@ -14,7 +14,6 @@ import logging
 from nltk.compat import xrange
 from nltk.parse.dependencygraph import DependencyGraph
-from nltk.classify import NaiveBayesClassifier
 logger = logging.getLogger(__name__)
@@ -111,6 +110,8 @@ class NaiveBayesDependencyScorer(DependencyScorerI):
         :param graphs: A list of dependency graphs to train the scorer.
         """
+        from nltk.classify import NaiveBayesClassifier
         # Create labeled training examples
         labeled_examples = []
         for graph in graphs:
...
@@ -16,8 +16,8 @@ from os import remove
 from copy import deepcopy
 from operator import itemgetter
 try:
-    from scipy import sparse
     from numpy import array
+    from scipy import sparse
     from sklearn.datasets import load_svmlight_file
     from sklearn import svm
 except ImportError:
...
.. Copyright (C) 2001-2015 NLTK Project
.. For license information, see LICENSE.TXT
Crubadan Corpus Reader
======================
Crubadan is an NLTK corpus reader for ngram files provided
by the Crubadan project. It supports several languages.
>>> from nltk.corpus import crubadan
>>> crubadan.langs()
----------------------------------------
Language code mapping and helper methods
----------------------------------------
The web crawler that generated the 3-gram frequencies works at the
level of "writing systems" rather than languages. These are assigned
internal 2-3 letter "writing system codes" that require mapping to the
standard ISO 639-3 codes.
For details, please refer to the README in the nltk_data/crubadan
folder after installing it.
To translate ISO 639-3 codes to "Crubadan Code":
>>> crubadan.iso_to_crubadan('eng')
'en'
>>> crubadan.iso_to_crubadan('fra')
'fr'
>>> crubadan.iso_to_crubadan('aaa')
None
In reverse, print ISO 639-3 code if we have the Crubadan Code:
>>> crubadan.crubadan_to_iso('en')
'eng'
>>> crubadan.crubadan_to_iso('fr')
'fra'
>>> crubadan.crubadan_to_iso('aa')
None
---------------------------
Accessing ngram frequencies
---------------------------
On initialization, the reader will create a dictionary of every
language supported by the Crubadan project, mapping the ISO 639-3
language code to its corresponding ngram frequency.
The end result is a dictionary of FreqDists representing all the
languages that can be accessed via "all_lang_freq":
>>> crubadan.all_lang_freq # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
{'roh': FreqDist({'<da': 33783, ...})}
You can access an individual language's FreqDist and the ngrams within it as follows.
Say you're interested in the frequency of the ngram 'the' in English:
>>> english_fd = crubadan.all_lang_freq['eng']
>>> english_fd['the']
728135
Alternatively, you may use the "lang_freq" method to retrieve the FreqDist
directly; simply pass the ISO 639-3 code of the language you're interested in:
>>> crubadan.lang_freq('eng') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
FreqDist({'<th': 954187, 'the': 728135, 'he>': 655273, 'ing': 360758, 'nd>': 359481, 'ng>':
351373, '<an': 345633, '<to': 333091, 'ed>': 329517, '<of': 316431, ...})
A companion convenience method to the above is "ngram_freq", which can be
used to retrieve a specific ngram frequency more explicitly. As usual,
pass in the language of interest by its ISO 639-3 code, along with the ngram
of interest. Using the example above to get the frequency of 'the' in English:
>>> crubadan.ngram_freq('eng', 'the')
728135
An ngram that isn't found within the language will return 0:
>>> crubadan.ngram_freq('eng', 'test')
0
A language that isn't supported will raise an exception:
>>> crubadan.ngram_freq('elvish', 'test')
Traceback (most recent call last):
...
CrubadanError: Unsupported language [elvish]