Commit 9c5fd33d by Steven Bird

clean ups and python 3 fixes

parent 41a1cae5
......@@ -42,13 +42,12 @@ from sys import maxint
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex"
try:
import regex as re
import regex
except ImportError:
re = None
######################################################################
## Language identification using TextCat
######################################################################
pass
class TextCat(object):
......@@ -71,6 +70,7 @@ class TextCat(object):
def trigrams(self, text):
padded_text = self._START_CHAR + text + self._END_CHAR
trigrams = []
# Generate 3-grams for given text
for i in range(0, len(padded_text) - 2):
cur_trigram = padded_text[i:(i + 3)]
......@@ -87,7 +87,7 @@ class TextCat(object):
def remove_punctuation(self, text):
    ''' Get rid of punctuation except apostrophes.

    :param text: input text as ``str`` or UTF-8 encoded ``bytes``;
        bytes are decoded first, since ``str`` has no ``.decode``
        in Python 3 and the unconditional decode would raise
        ``AttributeError`` there.
    :return: the text with every punctuation character (Unicode
        category P) removed, apostrophes preserved.
    '''
    if isinstance(text, bytes):
        # 'utf-8' and 'utf8' name the same codec, so bytes input
        # behaves exactly as before.
        text = text.decode('utf-8')
    # \p{}/\P{} Unicode-property classes require the third-party
    # `regex` module; stdlib `re` does not support them.
    return regex.sub(r"[^\P{P}\']+", "", text)
def profile(self, text):
''' Create FreqDist of trigrams within text '''
......@@ -129,14 +129,11 @@ class TextCat(object):
the text and all languages '''
distances = {}
profile = self.profile(text)
# For all the languages
for lang in self._corpus.all_lang_freq.keys():
# Calculate distance metric for every trigram in
# input text to be identified
lang_dist = 0
for trigram in profile:
lang_dist += self.calc_dist(lang, trigram, profile)
distances[lang] = lang_dist
return distances
......@@ -160,7 +157,7 @@ class TextCat(object):
for f in listdir(path):
if isfile(f):
m = re.match('sample_\w+\.txt', f)
m = regex.match('sample_\w+\.txt', f)
if m: lang_samples.append(f)
print(lang_samples)
......
......@@ -19,7 +19,6 @@ For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html
"""
# Ensure that your own literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals
from nltk.corpus.reader import CorpusReader
......@@ -28,9 +27,6 @@ from nltk.data import ZipFilePathPointer
import re
from re import escape, search
######################################################################
## An Crubadan N-gram Corpus Reader
######################################################################
class CrubadanCorpusReader(CorpusReader):
"""
......@@ -99,7 +95,7 @@ class CrubadanCorpusReader(CorpusReader):
self._lang_mapping_data.append( row.split('\t') )
# Get rid of empty entry if last line in file is blank
if self._lang_mapping_data[ len(self._lang_mapping_data) - 1 ] == [u'']:
if self._lang_mapping_data[ len(self._lang_mapping_data) - 1 ] == ['']:
self._lang_mapping_data.pop()
def _load_lang_ngrams(self, lang):
......@@ -118,7 +114,7 @@ class CrubadanCorpusReader(CorpusReader):
f = open(ngram_file, 'rU')
for line in f:
data = line.decode('utf-8').split(u' ')
data = line.decode('utf-8').split(' ')
ngram = data[1].strip('\n')
freq = int(data[0])
......
......@@ -4,118 +4,85 @@
Crubadan Corpus Reader
======================
Crubadan is an NLTK corpus reader for n-gram files provided
by the Crubadan project, and can be imported like this:
Crubadan is an NLTK corpus reader for ngram files provided
by the Crubadan project. It supports several languages.
>>> import nltk
>>> from nltk.corpus import crubadan
The name can be hard to remember so for more compact
and easy to remember code, try:
>>> from nltk.corpus import crubadan as cb
------------------------
Language support methods
------------------------
To get a list of supported languages as ISO 639-3 codes:
>>> crubadan.langs() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
['abk', 'abn', 'ace', ..., 'zne', 'zpa', 'zul']
To check if a single specific language is supported, check
membership in this list. For example, to check for English:
>>> 'Yes' if 'eng' in crubadan.langs() else 'No'
'Yes'
And for an unsupported language, say, Ghotuo ('aaa' in ISO 639-3):
>>> 'Yes' if 'aaa' in crubadan.langs() else 'No'
'No'
>>> crubadan.langs()
----------------------------------------
Language code mapping and helper methods
----------------------------------------
The web crawler that generated the 3-gram frequencies
actually works at the level of "writing systems" rather
than "languages". It assigned internal 2-3 letter
"writing system codes" that require mapping to the standard
ISO 639-3 codes.
For more info, please refer to the README in nltk/nltk_data/crubadan
folder after installing it.
To translate ISO 639-3 codes to "Crubadan Code", for example, English:
The web crawler that generated the 3-gram frequencies works at the
level of "writing systems" rather than languages. These are assigned
internal 2-3 letter "writing system codes" that require mapping to the
standard ISO 639-3 codes.
For details, please refer to the README in nltk_data/crubadan
folder after installing it.
To translate ISO 639-3 codes to "Crubadan Code":
>>> crubadan.iso_to_crubadan('eng')
'en'
And, French:
>>> crubadan.iso_to_crubadan('fra')
'fr'
Unsupported language returns None, using Ghotuo example from above:
>>> code_found = crubadan.iso_to_crubadan('aaa')
>>> code_found is None
True
>>> crubadan.iso_to_crubadan('aaa') is None
True
In reverse, print ISO 639-3 code if we have the Crubadan Code:
In reverse, print the ISO 639-3 code if we have the Crubadan Code for English:
>>> crubadan.crubadan_to_iso('en')
'eng'
Another example, French:
>>> crubadan.crubadan_to_iso('fr')
'fra'
>>> crubadan.crubadan_to_iso('aa') is None
True
---------------------------
Accessing ngram frequencies
---------------------------
On initialization, the reader will create a dictionary of every
language supported by the Crubadan project, mapping the ISO 639-3
language code to its corresponding ngram frequency.
The end result is a dictionary of FreqDists representing all the
languages that can be accessed via "all_lang_freq":
Unsupported language returns None. Crubadan does not
support Ghotuo, so let's continue using it as an example.
Say the theoretical Crubadan Code for the Ghotuo language
would have been 'aa', then:
>>> code_found = crubadan.crubadan_to_iso('aa')
>>> code_found is None
True
------------------------------------
N-gram and their frequencies methods
------------------------------------
On initialization the reader will create a dictionary of every
language supported by the Crubadan project, mapping the ISO 639-3
language code to its corresponding n-gram frequency.
The end result is a dictionary of FreqDists representing all the
languages that can be accessed via "all_lang_freq":
>>> crubadan.all_lang_freq # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
{'roh': FreqDist({'<da': 33783, ...})}
You can access individual language FreqDist and the n-grams within them as follows:
say you're interested in knowing the frequency of the n-gram 'the' in English:
You can access individual language FreqDist and the ngrams within them as follows:
say you're interested in knowing the frequency of the ngram 'the' in English:
>>> english_fd = crubadan.all_lang_freq['eng']
>>> english_fd['the']
728135
Alternatively, you may use the "lang_freq" method to retrieve the FreqDist
directly, simply pass the ISO 639-3 code for the language you're interested in:
Alternatively, you may use the "lang_freq" method to retrieve the FreqDist
directly, simply pass the ISO 639-3 code for the language you're interested in:
>>> crubadan.lang_freq('eng') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
FreqDist({'<th': 954187, 'the': 728135, 'he>': 655273, 'ing': 360758, 'nd>': 359481, 'ng>':
351373, '<an': 345633, '<to': 333091, 'ed>': 329517, '<of': 316431, ...})
A companion convenience method to above is "ngram_freq", which can be
used to retrieve a specific n-gram frequecy more explicitly. As usual,
pass in the language of interest by its ISO 639-3 code and the n-gram
of interest. Using the example above to get the frequency of 'the' in English:
A companion convenience method to above is "ngram_freq", which can be
used to retrieve a specific ngram frequency more explicitly. As usual,
pass in the language of interest by its ISO 639-3 code and the ngram
of interest. Using the example above to get the frequency of 'the' in English:
>>> crubadan.ngram_freq('eng', 'the')
728135
A n-gram that isn't found within the language will return 0:
An ngram that isn't found within the language will return 0:
>>> crubadan.ngram_freq('eng', 'test')
0
A language that isn't supported will throw an exception:
A language that isn't supported will raise an exception:
>>> crubadan.ngram_freq('elvish', 'test')
Traceback (most recent call last):
...
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment