Commit 9c5fd33d by Steven Bird

clean ups and python 3 fixes

parent 41a1cae5
...@@ -42,13 +42,12 @@ from sys import maxint ...@@ -42,13 +42,12 @@ from sys import maxint
# is an alternative to the standard re module that supports # is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax. # Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex" # You may have to "pip install regex"
# The third-party `regex` module supports Unicode codepoint properties
# (\p{...}) which the stdlib `re` module does not; it is optional here.
try:
    import regex
except ImportError:
    # Bind a sentinel instead of silently passing: later references to
    # `regex` (e.g. regex.sub in remove_punctuation) would otherwise fail
    # with a confusing NameError rather than a clear "module missing" check.
    regex = None
######################################################################
## Language identification using TextCat
######################################################################
class TextCat(object): class TextCat(object):
...@@ -71,6 +70,7 @@ class TextCat(object): ...@@ -71,6 +70,7 @@ class TextCat(object):
def trigrams(self, text): def trigrams(self, text):
padded_text = self._START_CHAR + text + self._END_CHAR padded_text = self._START_CHAR + text + self._END_CHAR
trigrams = [] trigrams = []
# Generate 3-grams for given text # Generate 3-grams for given text
for i in range(0, len(padded_text) - 2): for i in range(0, len(padded_text) - 2):
cur_trigram = padded_text[i:(i + 3)] cur_trigram = padded_text[i:(i + 3)]
...@@ -87,7 +87,7 @@ class TextCat(object): ...@@ -87,7 +87,7 @@ class TextCat(object):
def remove_punctuation(self, text):
    ''' Get rid of punctuation except apostrophes.

    :param text: input text; may be ``str`` or UTF-8 encoded ``bytes``.
    :return: ``text`` with all Unicode punctuation (\\p{P}) removed,
        except the apostrophe character.
    '''
    # Python 3 fix: str objects have no .decode(); only decode raw bytes.
    # (The previous unconditional text.decode('utf8') raised
    # AttributeError for str input on Python 3.)
    if isinstance(text, bytes):
        text = text.decode('utf8')
    # [^\P{P}'] matches any punctuation codepoint that is not an
    # apostrophe; requires the third-party `regex` module for \p{} syntax.
    return regex.sub(r"[^\P{P}\']+", "", text)
def profile(self, text): def profile(self, text):
''' Create FreqDist of trigrams within text ''' ''' Create FreqDist of trigrams within text '''
...@@ -129,14 +129,11 @@ class TextCat(object): ...@@ -129,14 +129,11 @@ class TextCat(object):
the text and all languages ''' the text and all languages '''
distances = {} distances = {}
profile = self.profile(text) profile = self.profile(text)
# For all the languages
for lang in self._corpus.all_lang_freq.keys(): for lang in self._corpus.all_lang_freq.keys():
# Calculate distance metric for every trigram in
# input text to be identified
lang_dist = 0 lang_dist = 0
for trigram in profile: for trigram in profile:
lang_dist += self.calc_dist(lang, trigram, profile) lang_dist += self.calc_dist(lang, trigram, profile)
distances[lang] = lang_dist distances[lang] = lang_dist
return distances return distances
...@@ -160,7 +157,7 @@ class TextCat(object): ...@@ -160,7 +157,7 @@ class TextCat(object):
for f in listdir(path): for f in listdir(path):
if isfile(f): if isfile(f):
m = re.match('sample_\w+\.txt', f) m = regex.match('sample_\w+\.txt', f)
if m: lang_samples.append(f) if m: lang_samples.append(f)
print(lang_samples) print(lang_samples)
......
...@@ -19,7 +19,6 @@ For details about An Crubadan, this data, and its potential uses, see: ...@@ -19,7 +19,6 @@ For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html http://borel.slu.edu/crubadan/index.html
""" """
# Ensure that your own literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals from __future__ import print_function, unicode_literals
from nltk.corpus.reader import CorpusReader from nltk.corpus.reader import CorpusReader
...@@ -28,9 +27,6 @@ from nltk.data import ZipFilePathPointer ...@@ -28,9 +27,6 @@ from nltk.data import ZipFilePathPointer
import re import re
from re import escape, search from re import escape, search
######################################################################
## An Crubadan N-gram Corpus Reader
######################################################################
class CrubadanCorpusReader(CorpusReader): class CrubadanCorpusReader(CorpusReader):
""" """
...@@ -99,7 +95,7 @@ class CrubadanCorpusReader(CorpusReader): ...@@ -99,7 +95,7 @@ class CrubadanCorpusReader(CorpusReader):
self._lang_mapping_data.append( row.split('\t') ) self._lang_mapping_data.append( row.split('\t') )
# Get rid of empty entry if last line in file is blank # Get rid of empty entry if last line in file is blank
if self._lang_mapping_data[ len(self._lang_mapping_data) - 1 ] == [u'']: if self._lang_mapping_data[ len(self._lang_mapping_data) - 1 ] == ['']:
self._lang_mapping_data.pop() self._lang_mapping_data.pop()
def _load_lang_ngrams(self, lang): def _load_lang_ngrams(self, lang):
...@@ -118,7 +114,7 @@ class CrubadanCorpusReader(CorpusReader): ...@@ -118,7 +114,7 @@ class CrubadanCorpusReader(CorpusReader):
f = open(ngram_file, 'rU') f = open(ngram_file, 'rU')
for line in f: for line in f:
data = line.decode('utf-8').split(u' ') data = line.decode('utf-8').split(' ')
ngram = data[1].strip('\n') ngram = data[1].strip('\n')
freq = int(data[0]) freq = int(data[0])
......
...@@ -4,118 +4,85 @@ ...@@ -4,118 +4,85 @@
Crubadan Corpus Reader Crubadan Corpus Reader
====================== ======================
Crubadan is an NLTK corpus reader for n-gram files provided Crubadan is an NLTK corpus reader for ngram files provided
by the Crubadan project, and can be imported like this: by the Crubadan project. It supports several languages.
>>> import nltk
>>> from nltk.corpus import crubadan >>> from nltk.corpus import crubadan
>>> crubadan.langs()
The name can be hard to remember so for more compact
and easy to remember code, try:
>>> from nltk.corpus import crubadan as cb
------------------------
Language support methods
------------------------
To get a list of supported languages as ISO 639-3 codes:
>>> crubadan.langs() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
['abk', 'abn', 'ace', ..., 'zne', 'zpa', 'zul']
To check if a single specific language is supported, check
membership in this list. For example, to check for English:
>>> 'Yes' if 'eng' in crubadan.langs() else 'No'
'Yes'
And for an unsupported language, say, Ghotuo ('aaa' in ISO 639-3):
>>> 'Yes' if 'aaa' in crubadan.langs() else 'No'
'No'
---------------------------------------- ----------------------------------------
Language code mapping and helper methods Language code mapping and helper methods
---------------------------------------- ----------------------------------------
The web crawler that generated the 3-gram frequencies The web crawler that generated the 3-gram frequencies works at the
actually works at the level of "writing systems" rather level of "writing systems" rather than languages. These are assigned
than "languages". It assigned internal 2-3 letter internal 2-3 letter "writing system codes" that require mapping to the
"writing system codes" that require mapping to the standard standard ISO 639-3 codes.
ISO 639-3 codes.
For details, please refer to the README in nltk_data/crubadan
For more info, please refer to the README in nltk/nltk_data/crubadan folder after installing it.
folder after installing it.
To translate ISO 639-3 codes to "Crubadan Code":
To translate ISO 639-3 codes to "Crubadan Code", for example, English:
>>> crubadan.iso_to_crubadan('eng') >>> crubadan.iso_to_crubadan('eng')
'en' 'en'
And, French:
>>> crubadan.iso_to_crubadan('fra') >>> crubadan.iso_to_crubadan('fra')
'fr' 'fr'
>>> crubadan.iso_to_crubadan('aaa')
Unsupported language returns None, using Ghotuo example from above: None
>>> code_found = crubadan.iso_to_crubadan('aaa')
>>> code_found is None In reverse, print ISO 639-3 code if we have the Crubadan Code:
True
In reverse; print ISO 639-3 code if we have Crubadan Code for English:
>>> crubadan.crubadan_to_iso('en') >>> crubadan.crubadan_to_iso('en')
'eng' 'eng'
Another example, French:
>>> crubadan.crubadan_to_iso('fr') >>> crubadan.crubadan_to_iso('fr')
'fra' 'fra'
>>> crubadan.crubadan_to_iso('aa')
None
---------------------------
Accessing ngram frequencies
---------------------------
On initialization, the reader will create a dictionary of every
language supported by the Crubadan project, mapping the ISO 639-3
language code to its corresponding ngram frequency.
The end result is a dictionary of FreqDists representing all the
languages that can be accessed via "all_lang_freq":
Unsupported language returns None. Crubadan does not
support Ghotuo, so let's continue using it as an example.
Say the theoretical Crubadan Code for the Ghotuo language
would have been 'aa', then:
>>> code_found = crubadan.crubadan_to_iso('aa')
>>> code_found is None
True
------------------------------------
N-gram and their frequencies methods
------------------------------------
On initialization the reader will create a dictionary of every
language supported by the Crubadan project, mapping the ISO 639-3
language code to its corresponding n-gram frequency.
The end result is a dictionary of FreqDists representing all the
languages that can be accessed via "all_lang_freq":
>>> crubadan.all_lang_freq # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE >>> crubadan.all_lang_freq # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
{'roh': FreqDist({'<da': 33783, ...})} {'roh': FreqDist({'<da': 33783, ...})}
You can access individual language FreqDist and the n-grams within them as follows: You can access individual language FreqDist and the ngrams within them as follows:
say you're interested in knowing the frequency of the n-gram 'the' in English: say you're interested in knowing the frequency of the ngram 'the' in English:
>>> english_fd = crubadan.all_lang_freq['eng'] >>> english_fd = crubadan.all_lang_freq['eng']
>>> english_fd['the'] >>> english_fd['the']
728135 728135
Alternatively, you may use the "lang_freq" method to retrieve the FreqDist Alternatively, you may use the "lang_freq" method to retrieve the FreqDist
directly, simply pass the ISO 639-3 code for the language you're interested in: directly, simply pass the ISO 639-3 code for the language you're interested in:
>>> crubadan.lang_freq('eng') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE >>> crubadan.lang_freq('eng') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
FreqDist({'<th': 954187, 'the': 728135, 'he>': 655273, 'ing': 360758, 'nd>': 359481, 'ng>': FreqDist({'<th': 954187, 'the': 728135, 'he>': 655273, 'ing': 360758, 'nd>': 359481, 'ng>':
351373, '<an': 345633, '<to': 333091, 'ed>': 329517, '<of': 316431, ...}) 351373, '<an': 345633, '<to': 333091, 'ed>': 329517, '<of': 316431, ...})
A companion convenience method to above is "ngram_freq", which can be A companion convenience method to above is "ngram_freq", which can be
used to retrieve a specific n-gram frequency more explicitly. As usual, used to retrieve a specific ngram frequency more explicitly. As usual,
pass in the language of interest by its ISO 639-3 code and the n-gram pass in the language of interest by its ISO 639-3 code and the ngram
of interest. Using the example above to get the frequency of 'the' in English: of interest. Using the example above to get the frequency of 'the' in English:
>>> crubadan.ngram_freq('eng', 'the') >>> crubadan.ngram_freq('eng', 'the')
728135 728135
An n-gram that isn't found within the language will return 0: An ngram that isn't found within the language will return 0:
>>> crubadan.ngram_freq('eng', 'test') >>> crubadan.ngram_freq('eng', 'test')
0 0
A language that isn't supported will throw an exception: A language that isn't supported will raise an exception:
>>> crubadan.ngram_freq('elvish', 'test') >>> crubadan.ngram_freq('elvish', 'test')
Traceback (most recent call last): Traceback (most recent call last):
... ...
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment