Commit 92e5fca0 by Steven Bird

Merge pull request #927 from avitalp/crubadan

Changes to reflect all modifications discussed in nltk#924
parents da73484f 226624b8
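
The diff below reworks two pieces of NLTK: the TextCat language guesser and the Crubadan corpus reader it reads its trigram tables from. For orientation, TextCat follows the n-gram text categorization scheme of Cavnar & Trenkle (1994): the input text and every candidate language are each reduced to a frequency-ranked trigram profile, and the guessed language is the one whose profile minimizes the summed "out-of-place" rank distance. A minimal sketch of that measure, using hypothetical rank dictionaries rather than the FreqDist-based structures in the diff:

    # Sketch only: text_ranks and lang_ranks map trigram -> rank position,
    # and penalty plays the role of sys.maxsize below. None of these names
    # are NLTK API.
    def out_of_place(text_ranks, lang_ranks, penalty):
        dist = 0
        for tri, rank in text_ranks.items():
            if tri in lang_ranks:
                # Known trigram: cost is how far its rank has moved
                dist += abs(lang_ranks[tri] - rank)
            else:
                # Trigram absent from the language profile: maximal cost
                dist += penalty
        return dist

The calc_dist and lang_dists methods changed below implement this loop one trigram at a time.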
@@ -28,33 +28,39 @@ For details about An Crubadan, see:
 http://borel.slu.edu/crubadan/index.html
 """
-# Ensure that your own literal strings default to unicode rather than str.
+# Ensure that literal strings default to unicode rather than str.
 from __future__ import print_function, unicode_literals

 import nltk
+from nltk.corpus import crubadan
 import nltk.compat
-from nltk.corpus import CrubadanCorpusReader
+from nltk.util import trigrams
 from nltk.tokenize import word_tokenize
 from nltk.probability import FreqDist
-from sys import maxint
+if nltk.compat.PY3:
+    from sys import maxsize
+else:
+    from sys import maxint

 # Note: this is NOT "re" you're likely used to. The regex module
 # is an alternative to the standard re module that supports
 # Unicode codepoint properties with the \p{} syntax.
 # You may have to "pip install regex"
 try:
-    import regex
+    import regex as re
 except ImportError:
-    pass
+    re = None

 ######################################################################
 ## Language identification using TextCat
 ######################################################################

 class TextCat(object):

     _corpus = None
     fingerprints = {}
-    _START_CHAR = "<".encode('utf8')
-    _END_CHAR = ">".encode('utf8')
+    _START_CHAR = "<"
+    _END_CHAR = ">"

     last_distances = {}
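
One note on the hunk above: the third-party regex module is needed because it supports Unicode property classes such as \p{P} (punctuation), which remove_punctuation below relies on; binding it to the name re keeps the call sites short, while re = None lets __init__ fail with a helpful message instead of a NameError. A quick illustration of the property-class syntax, assuming regex is installed:

    import regex
    # [^\P{P}\']+ matches runs of punctuation, apostrophes excepted,
    # so substituting '' strips them while keeping contractions intact.
    print(regex.sub(r"[^\P{P}\']+", "", "don't, stop!"))  # -> "don't stop"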
@@ -65,29 +71,14 @@ class TextCat(object):
                                "see https://pypi.python.org/pypi/regex for "
                                "further details.")
+        self._corpus = crubadan

-    def trigrams(self, text):
-        padded_text = self._START_CHAR + text + self._END_CHAR
-        trigrams = []
-
-        # Generate 3-grams for given text
-        for i in range(0, len(padded_text) - 2):
-            cur_trigram = padded_text[i:(i + 3)]
-            if len(cur_trigram) == 2:
-                cur_trigram = cur_trigram + self._END_CHAR
-            trigrams.append(cur_trigram)
-
-        return trigrams
-
-    def _print_trigrams(self, trigrams):
-        for t in trigrams:
-            print(t)
-
-        self._corpus = CrubadanCorpusReader(nltk.data.find('corpora/crubadan'), '.*\.txt')
+
+        # Load all language ngrams into cache
+        for lang in self._corpus.langs():
+            self._corpus.lang_freq(lang)

     def remove_punctuation(self, text):
         ''' Get rid of punctuation except apostrophes '''
-        return regex.sub(r"[^\P{P}\']+", "", text.decode('utf8'))
+        return re.sub(r"[^\P{P}\']+", "", text)

     def profile(self, text):
         ''' Create FreqDist of trigrams within text '''
@@ -96,7 +87,9 @@ class TextCat(object):
         fingerprint = FreqDist()

         for t in tokens:
-            token_trigrams = self.trigrams(t)
+            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
+            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
+
             for cur_trigram in token_trigrams:
                 if cur_trigram in fingerprint:
                     fingerprint[cur_trigram] += 1
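
The hunk above swaps the class's hand-rolled trigram generator for nltk.util.trigrams, which yields 3-tuples of characters rather than 3-character strings; the ''.join(tri) step restores the string form the fingerprint counts. For example:

    from nltk.util import trigrams
    tups = list(trigrams('<cat>'))
    # [('<', 'c', 'a'), ('c', 'a', 't'), ('a', 't', '>')]
    print([''.join(t) for t in tups])  # ['<ca', 'cat', 'at>']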
@@ -108,32 +101,41 @@ class TextCat(object):
     def calc_dist(self, lang, trigram, text_profile):
         ''' Calculate the "out-of-place" measure between the
             text and language profile for a single trigram '''
-        lang_fd = self._corpus.all_lang_freq[lang]
+        lang_fd = self._corpus.lang_freq(lang)
         dist = 0

         if trigram in lang_fd:
-            idx_lang_profile = lang_fd.keys().index(trigram)
-            idx_text = text_profile.keys().index(trigram)
+            idx_lang_profile = list(lang_fd.keys()).index(trigram)
+            idx_text = list(text_profile.keys()).index(trigram)
+
+            #print(idx_lang_profile, ", ", idx_text)
             dist = abs(idx_lang_profile - idx_text)
         else:
             # Arbitrary but should be larger than
             # any possible trigram file length
             # in terms of total lines
-            dist = maxint
+            if nltk.compat.PY3:
+                dist = maxsize
+            else:
+                dist = maxint

         return dist

     def lang_dists(self, text):
         ''' Calculate the "out-of-place" measure between
             the text and all languages '''
         distances = {}
         profile = self.profile(text)
-        for lang in self._corpus.all_lang_freq.keys():
+        # For all the languages
+        for lang in self._corpus._all_lang_freq.keys():
+            # Calculate distance metric for every trigram in
+            # input text to be identified
             lang_dist = 0
             for trigram in profile:
                 lang_dist += self.calc_dist(lang, trigram, profile)

             distances[lang] = lang_dist

         return distances
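
The list(...) wrappers above are the Python 3 fix this hunk is really about: on Python 3, dict.keys() returns a view object that is iterable but has no .index method, so rank lookup must go through an explicit list. In miniature:

    d = {'abc': 3, 'xyz': 1}
    # d.keys().index('xyz') raises AttributeError on Python 3;
    # materializing the view first restores positional lookup.
    rank = list(d.keys()).index('xyz')  # 1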
@@ -144,37 +146,43 @@ class TextCat(object):
         self.last_distances = self.lang_dists(text)

         return min(self.last_distances, key=self.last_distances.get)
+        #################################################')

+    def demo(self):
+        from nltk.corpus import udhr
+
+        langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
+                 'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
+                 'Serbian_Srpski-UTF8', 'Esperanto-UTF8']
+
+        friendly = {'kmr': 'Northern Kurdish',
+                    'abk': 'Abkhazian',
+                    'pes': 'Iranian Persian',
+                    'hin': 'Hindi',
+                    'haw': 'Hawaiian',
+                    'rus': 'Russian',
+                    'vie': 'Vietnamese',
+                    'srp': 'Serbian',
+                    'epo': 'Esperanto'}
+
-def demo():
-    ''' Demo of language guessing using a bunch of UTF-8 encoded
-    text files with snippets of text copied from news websites
-    around the web in different languages '''
-    from os import listdir
-    from os.path import isfile
-
-    path = '.'
-    lang_samples = []
-    tc = TextCat()
-
-    for f in listdir(path):
-        if isfile(f):
-            m = regex.match('sample_\w+\.txt', f)
-            if m: lang_samples.append(f)
-
-    print(lang_samples)
-
-    for f in lang_samples:
-        cur_sample = open(f, 'rU')
-        cur_data = cur_sample.read()
-
-        print('Language sample file: ' + f)
-        print('Contents snippet: ' + cur_data.decode('utf8')[0:140])
-        print('#################################################')
-        print('Language detection: ' + tc.guess_language(cur_data))
-        print('#################################################')
-
-if __name__ == '__main__':
-    import doctest
-    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE |
-                    doctest.ELLIPSIS)
+        for cur_lang in langs:
+            # Get raw data from UDHR corpus
+            raw_sentences = udhr.sents(cur_lang)
+            rows = len(raw_sentences) - 1
+            cols = list(map(len, raw_sentences))
+
+            sample = ''
+
+            # Generate a sample text of the language
+            for i in range(0, rows):
+                cur_sent = ''
+                for j in range(0, cols[i]):
+                    cur_sent += ' ' + raw_sentences[i][j]
+
+                sample += cur_sent
+
+            # Try to detect what it is
+            print('Language snippet: ' + sample[0:140] + '...')
+            guess = self.guess_language(sample)
+            print('Language detection: %s (%s)' % (guess, friendly[guess]))
+            print('#' * 140)
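
With the crubadan corpus downloaded and the regex module installed, the reworked class is exercised like this (the module path and the result code are assumptions based on NLTK's usual layout, not shown in this diff):

    import nltk
    nltk.download('crubadan')  # one-time fetch of the trigram tables
    nltk.download('udhr')      # only needed for demo()

    from nltk.classify.textcat import TextCat  # assumed module path
    tc = TextCat()
    # Returns an ISO 639-3 code, e.g. 'hun' for Hungarian input
    print(tc.guess_language('Minden emberi lény szabadon születik'))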
@@ -22,8 +22,8 @@ http://borel.slu.edu/crubadan/index.html
 from __future__ import print_function, unicode_literals

 import re
+import nltk.compat
 from os import path

 from nltk.corpus.reader import CorpusReader
 from nltk.probability import FreqDist
 from nltk.data import ZipFilePathPointer
@@ -58,13 +58,13 @@ class CrubadanCorpusReader(CorpusReader):
         ''' Return internal Crubadan code based on ISO 639-3 code '''
         for i in self._lang_mapping_data:
             if i[1].lower() == lang.lower():
-                return unicode(i[0])
+                return i[0]

     def crubadan_to_iso(self, lang):
         ''' Return ISO 639-3 code given internal Crubadan code '''
         for i in self._lang_mapping_data:
             if i[0].lower() == lang.lower():
-                return unicode(i[1])
+                return i[1]

     def _load_lang_mapping_data(self):
         ''' Load language mappings between codes and description from table.txt '''
@@ -74,8 +74,12 @@ class CrubadanCorpusReader(CorpusReader):
         mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
         if self._LANG_MAPPER_FILE not in self.fileids():
             raise RuntimeError("Could not find language mapper file: " + mapper_file)

-        raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
+        if nltk.compat.PY3:
+            raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
+        else:
+            raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
+
         self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]

     def _load_lang_ngrams(self, lang):
@@ -83,22 +87,27 @@ class CrubadanCorpusReader(CorpusReader):
             and return its FreqDist '''
         crubadan_code = self.iso_to_crubadan(lang)
-        ngram_file = path.join(self.root, unicode(crubadan_code) + '-3grams.txt')
+        ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

         if not path.isfile(ngram_file):
             raise RuntimeError("Could not find language n-gram file for " + lang)

         counts = FreqDist()
-        f = open(ngram_file, 'rU')
+        if nltk.compat.PY3:
+            f = open(ngram_file, 'r', encoding='utf-8')
+        else:
+            f = open(ngram_file, 'rU')
+
         for line in f:
-            data = line.decode('utf-8').split(' ')
+            if nltk.compat.PY3:
+                data = line.split(' ')
+            else:
+                data = line.decode('utf8').split(' ')
+
             ngram = data[1].strip('\n')
             freq = int(data[0])
             counts[ngram] = freq

         return counts
\ No newline at end of file
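
A quick smoke test of the reader changes, using only accessors that appear in the diff (iso_to_crubadan, lang_freq); the printed values are illustrative, not taken from the corpus:

    from nltk.corpus import crubadan
    print(crubadan.iso_to_crubadan('eng'))  # internal Crubadan code, e.g. 'en'
    fd = crubadan.lang_freq('eng')          # FreqDist of trigram counts
    print(fd.most_common(3))                # top trigrams with their frequencies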