Commit 03c7a9fa by Steven Bird

Merge pull request #890 from avitalp/crubadan

Added corpus reader for n-gram frequencies #845 #884
parents 383173aa 82cc85bd
@@ -95,3 +95,4 @@ from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding,
TypedMaxentFeatureEncoding,
ConditionalExponentialClassifier)
from nltk.classify.senna import Senna
from nltk.classify.textcat import TextCat
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".
The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and the text to
be identified, then compares them using a distance measure.
Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.
For details regarding the algorithm, see:
http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
For details about An Crubadan, see:
http://borel.slu.edu/crubadan/index.html
"""
# Ensure that your own literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals
import nltk
from nltk.corpus import CrubadanCorpusReader
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sys import maxint
# Note: this is NOT "re" you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex"
import regex as re
######################################################################
## Language identification using TextCat
######################################################################
class TextCat(object):
_corpus = None
fingerprints = {}
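    # '<' and '>' pad each token below so that word-boundary trigrams
    # (e.g. '<th', 'he>') stay distinct from word-internal ones.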
_START_CHAR = "<".encode('utf8')
_END_CHAR = ">".encode('utf8')
last_distances = {}
def __init__(self):
self._corpus = CrubadanCorpusReader(nltk.data.find('corpora/crubadan'), '.*\.txt')
def trigrams(self, text):
padded_text = self._START_CHAR + text + self._END_CHAR
trigrams = []
# Generate 3-grams for given text
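        # e.g. 'to' is padded to '<to>' and yields the trigrams '<to' and 'to>'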
for i in range(0, len(padded_text) - 2):
cur_trigram = padded_text[i:(i + 3)]
if len(cur_trigram) == 2:
cur_trigram = cur_trigram + self._END_CHAR
trigrams.append(cur_trigram)
return trigrams
def _print_trigrams(self, trigrams):
for t in trigrams:
print(t)
def remove_punctuation(self, text):
''' Get rid of punctuation except apostrophes '''
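        # \P{P} matches any character that is NOT punctuation, so the
        # negated class [^\P{P}\'] matches punctuation except apostrophes.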
return re.sub(ur"[^\P{P}\']+", "", text.decode('utf8'))
def profile(self, text):
''' Create FreqDist of trigrams within text '''
clean_text = self.remove_punctuation(text)
tokens = word_tokenize(clean_text)
fingerprint = FreqDist()
for t in tokens:
token_trigrams = self.trigrams(t)
for cur_trigram in token_trigrams:
if cur_trigram in fingerprint:
fingerprint[cur_trigram] += 1
else:
fingerprint[cur_trigram] = 1
return fingerprint
def calc_dist(self, lang, trigram, text_profile):
''' Calculate the "out-of-place" measure between the
text and language profile for a single trigram '''
lang_fd = self._corpus.all_lang_freq[lang]
dist = 0
if trigram in lang_fd:
idx_lang_profile = lang_fd.keys().index(trigram)
idx_text = text_profile.keys().index(trigram)
dist = abs(idx_lang_profile - idx_text)
else:
# Arbitrary but should be larger than
# any possible trigram file length
# in terms of total lines
dist = maxint
return dist
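    # Worked example (illustrative numbers, not corpus values): if 'the' is
    # at index 2 of the language profile's keys but at index 6 of the text
    # profile's keys, its out-of-place distance is abs(2 - 6) = 4; a trigram
    # missing from the language profile contributes maxint (from sys) instead.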
def lang_dists(self, text):
''' Calculate the "out-of-place" measure between
the text and all languages '''
distances = {}
profile = self.profile(text)
# For all the languages
for lang in self._corpus.all_lang_freq.keys():
# Calculate distance metric for every trigram in
# input text to be identified
lang_dist = 0
for trigram in profile:
lang_dist += self.calc_dist(lang, trigram, profile)
distances[lang] = lang_dist
return distances
def guess_language(self, text):
''' Find the language with the min distance
to the text and return its ISO 639-3 code '''
self.last_distances = self.lang_dists(text)
        return min(self.last_distances, key=self.last_distances.get)
def demo(self):
''' Demo of language guessing using a bunch of UTF-8 encoded
text files with snippets of text copied from news websites
around the web in different languages '''
from os import listdir
from os.path import isfile
# Current dir
path = '.'
lang_samples = []
for f in listdir(path):
if isfile(f):
m = re.match('sample_\w+\.txt', f)
if m: lang_samples.append(f)
print(lang_samples)
for f in lang_samples:
cur_sample = open(f, 'rU')
cur_data = cur_sample.read()
print('Language sample file: ' + f)
print('Contents snippet: ' + cur_data.decode('utf8')[0:140])
print('#################################################')
print('Language detection: ' + self.guess_language(cur_data))
print('#################################################')
@@ -44,103 +44,34 @@ class CrubadanCorpusReader(CorpusReader):
super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8')
self._lang_mapping_data = []
self._load_lang_mapping_data()
        self._load_all_ngrams()

    def lang_freq(self, lang):
        ''' Return n-gram FreqDist for a specific language
            given ISO 639-3 language code '''
        return self.all_lang_freq[lang]
def ngram_freq(self, lang, ngram):
''' Return n-gram frequency as integer given
an ISO 639-3 language code and n-gram '''
if lang not in self.all_lang_freq:
raise CrubadanError("Unsupproted language [" + lang + "].")
raise CrubadanError("Unsupported language [" + lang + "].")
lf = self.all_lang_freq[lang]
return lf[ngram]
    def langs(self):
        ''' Return a list of supported languages as ISO 639-3 codes '''
        supported = []
        for i in self._lang_mapping_data:
            supported.append(i[1])
        return supported
def lang_supported(self, lang):
''' Check if a language is supported (language passed in as ISO 639-3 code) '''
for i in self._lang_mapping_data:
if i[1].lower() == lang.lower():
return True
return False
def iso_to_friendly(self, lang):
        ''' Return human-friendly name for a language based on ISO 639-3 code '''
for i in self._lang_mapping_data:
if i[1].lower() == lang.lower():
return unicode(i[2])
return None
def friendly_to_iso(self, lang):
        ''' Return ISO 639-3 code from human-friendly language name (e.g. "English" -> "eng") '''
for i in self._lang_mapping_data:
if i[2].lower() == lang.lower():
return unicode(i[1])
def iso_to_crubadan(self, lang):
''' Return internal Crubadan code based on ISO 639-3 code '''
for i in self._lang_mapping_data:
@@ -171,6 +102,55 @@ class CrubadanCorpusReader(CorpusReader):
if self._lang_mapping_data[ len(self._lang_mapping_data) - 1 ] == [u'']:
self._lang_mapping_data.pop()
def _load_lang_ngrams(self, lang):
''' Load single n-gram language file given the ISO 639-3 language code
and return its FreqDist '''
crubadan_code = self.iso_to_crubadan(lang)
ngram_file = self.root + '/' + unicode(crubadan_code) + '-3grams.txt'
import os.path
if not os.path.isfile(ngram_file):
raise CrubadanError("Could not find language n-gram file for [" + lang + "].")
counts = FreqDist()
f = open(ngram_file, 'rU')
for line in f:
data = line.decode('utf-8').split(u' ')
ngram = data[1].strip('\n')
freq = int(data[0])
counts[ngram] = freq
return counts
def _load_all_ngrams(self):
''' Create a dictionary of every supported language mapping
the ISO 639-3 language code to its corresponding n-gram
FreqDist. The result can be accessed via "all_lang_freq" var '''
# Filter out non n-gram files from the corpus dir
valid_files = []
for f in self.fileids():
m = re.search('(\w+)' + re.escape("-3grams.txt"), f)
if m:
valid_files.append( m.group() )
for f in valid_files:
ngram_file = self.root + '/' + f
import os.path
if os.path.isfile(ngram_file):
crubadan_code = f.split('-')[0]
iso_code = self.crubadan_to_iso(crubadan_code)
fd = self._load_lang_ngrams(iso_code)
self.all_lang_freq[iso_code] = fd
def _is_utf8(self, str):
''' Check if a string is utf8 encoded '''
try:
......
@@ -4,8 +4,8 @@
Crubadan Corpus Reader
======================
Crubadan is an NLTK corpus reader for n-gram files provided
by the Crubadan project, and can be imported like this:
>>> import nltk
>>> from nltk.corpus import crubadan
@@ -15,56 +15,28 @@ and easy to remember code, try:
>>> from nltk.corpus import crubadan as cb
------------------------
Language support methods
------------------------
To get a list of supported languages as ISO 639-3 codes:

    >>> crubadan.langs() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    ['abk', 'abn', 'ace', ..., 'zne', 'zpa', 'zul']
To check if a single specific language is supported, check
membership in this list. For example, to check for English:

    >>> 'Yes' if 'eng' in crubadan.langs() else 'No'
    'Yes'

And for an unsupported language, say, Ghotuo ('aaa' in ISO 639-3):

    >>> 'Yes' if 'aaa' in crubadan.langs() else 'No'
    'No'
----------------------------------------
Language code mapping and helper methods
----------------------------------------
Print the human-friendly name for a language based on its ISO 639-3 code.
For reference, a combined view of the language code tables of
ISO 639 parts 1, 2, and 3 may be viewed at:
http://www-01.sil.org/iso639-3/codes.asp
>>> crubadan.iso_to_friendly('eng')
u'English'
>>> crubadan.iso_to_friendly('fra')
u'French'
An unsupported language returns None; for Ghotuo:
>>> name_found = crubadan.iso_to_friendly('aaa')
>>> name_found is None
True
Now backwards: print the ISO 639-3 code given the human-friendly name.
>>> crubadan.friendly_to_iso('English')
u'eng'
>>> crubadan.friendly_to_iso('French')
u'fra'
Unsupported language returns None, using Ghotuo again:
>>> code_found = crubadan.friendly_to_iso('Ghotuo')
>>> code_found is None
True
The web crawler that generated the 3-gram frequencies
actually works at the level of "writing systems" rather
@@ -75,26 +47,31 @@ Language code mapping and helper methods
For more info, please refer to the README in nltk/nltk_data/crubadan
folder after installing it.
To translate ISO 639-3 codes to "Crubadan Code", for example, English:

    >>> crubadan.iso_to_crubadan('eng')
    'en'

And, French:

    >>> crubadan.iso_to_crubadan('fra')
    'fr'

Unsupported language returns None, using the Ghotuo example from above:

    >>> code_found = crubadan.iso_to_crubadan('aaa')
    >>> code_found is None
    True

In reverse; print the ISO 639-3 code if we have the Crubadan Code for English:

    >>> crubadan.crubadan_to_iso('en')
    'eng'

Another example, French:

    >>> crubadan.crubadan_to_iso('fr')
    'fra'

Unsupported language returns None. Crubadan does not
support Ghotuo, so let's continue using it as an example.
Say the theoretical Crubadan Code for the Ghotuo language
would have been 'aa', then:

    >>> code_found = crubadan.crubadan_to_iso('aa')
    >>> code_found is None
    True
@@ -103,57 +80,37 @@ Language code mapping and helper methods
N-gram frequency methods
------------------------
On initialization the reader will create a dictionary of every
language supported by the Crubadan project, mapping the ISO 639-3
language code to its corresponding n-gram frequency.

The end result is a dictionary of FreqDists representing all the
languages, which can be accessed via "all_lang_freq":

    >>> crubadan.all_lang_freq # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    {'roh': FreqDist({'<da': 33783, ...})}

You can access an individual language's FreqDist and the n-grams within
it as follows. Say you're interested in knowing the frequency of the
n-gram 'the' in English:

    >>> english_fd = crubadan.all_lang_freq['eng']
    >>> english_fd['the']
    728135
Alternatively, you may use the "lang_freq" method to retrieve the FreqDist
directly: simply pass the ISO 639-3 code for the language you're interested in:

    >>> crubadan.lang_freq('eng') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    FreqDist({'<th': 954187, 'the': 728135, 'he>': 655273, 'ing': 360758, 'nd>': 359481, 'ng>':
    351373, '<an': 345633, '<to': 333091, 'ed>': 329517, '<of': 316431, ...})
A companion convenience method to the above is "ngram_freq", which can be
used to retrieve a specific n-gram frequency more explicitly. As usual,
pass in the language of interest by its ISO 639-3 code and the n-gram
of interest. Using the example above to get the frequency of 'the' in English:

    >>> crubadan.ngram_freq('eng', 'the')
    728135

An n-gram that isn't found within the language will return 0:

    >>> crubadan.ngram_freq('eng', 'test')
    0
@@ -162,5 +119,4 @@ N-gram frequency methods
    >>> crubadan.ngram_freq('elvish', 'test')
    Traceback (most recent call last):
    ...
    CrubadanError: Unsupported language [elvish]