Commit 9c5fd33d by Steven Bird

clean ups and python 3 fixes

parent 41a1cae5
......@@ -42,13 +42,12 @@ from sys import maxint
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex"
try:
import regex as re
import regex
except ImportError:
re = None
######################################################################
## Language identification using TextCat
######################################################################
pass
class TextCat(object):
......@@ -71,6 +70,7 @@ class TextCat(object):
def trigrams(self, text):
padded_text = self._START_CHAR + text + self._END_CHAR
trigrams = []
# Generate 3-grams for given text
for i in range(0, len(padded_text) - 2):
cur_trigram = padded_text[i:(i + 3)]
......@@ -87,7 +87,7 @@ class TextCat(object):
def remove_punctuation(self, text):
    ''' Get rid of punctuation except apostrophes.

    :param text: input text as ``str`` or UTF-8 encoded ``bytes``;
        bytes are decoded first, since ``str`` has no ``.decode``
        in Python 3 and the unconditional decode would raise
        ``AttributeError`` there.
    :return: the text with every punctuation character (Unicode
        category P) removed, apostrophes preserved.
    '''
    if isinstance(text, bytes):
        # 'utf-8' and 'utf8' name the same codec, so bytes input
        # behaves exactly as before.
        text = text.decode('utf-8')
    # \p{}/\P{} Unicode-property classes require the third-party
    # `regex` module; stdlib `re` does not support them.
    return regex.sub(r"[^\P{P}\']+", "", text)
def profile(self, text):
''' Create FreqDist of trigrams within text '''
......@@ -129,14 +129,11 @@ class TextCat(object):
the text and all languages '''
distances = {}
profile = self.profile(text)
# For all the languages
for lang in self._corpus.all_lang_freq.keys():
# Calculate distance metric for every trigram in
# input text to be identified
lang_dist = 0
for trigram in profile:
lang_dist += self.calc_dist(lang, trigram, profile)
distances[lang] = lang_dist
return distances
......@@ -160,7 +157,7 @@ class TextCat(object):
for f in listdir(path):
if isfile(f):
m = re.match('sample_\w+\.txt', f)
m = regex.match('sample_\w+\.txt', f)
if m: lang_samples.append(f)
print(lang_samples)
......
......@@ -19,7 +19,6 @@ For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html
"""
# Ensure that your own literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals
from nltk.corpus.reader import CorpusReader
......@@ -28,9 +27,6 @@ from nltk.data import ZipFilePathPointer
import re
from re import escape, search
######################################################################
## An Crubadan N-gram Corpus Reader
######################################################################
class CrubadanCorpusReader(CorpusReader):
"""
......@@ -99,7 +95,7 @@ class CrubadanCorpusReader(CorpusReader):
self._lang_mapping_data.append( row.split('\t') )
# Get rid of empty entry if last line in file is blank
if self._lang_mapping_data[ len(self._lang_mapping_data) - 1 ] == [u'']:
if self._lang_mapping_data[ len(self._lang_mapping_data) - 1 ] == ['']:
self._lang_mapping_data.pop()
def _load_lang_ngrams(self, lang):
......@@ -118,7 +114,7 @@ class CrubadanCorpusReader(CorpusReader):
f = open(ngram_file, 'rU')
for line in f:
data = line.decode('utf-8').split(u' ')
data = line.decode('utf-8').split(' ')
ngram = data[1].strip('\n')
freq = int(data[0])
......
......@@ -4,118 +4,85 @@
Crubadan Corpus Reader
======================
Crubadan is an NLTK corpus reader for n-gram files provided
by the Crubadan project, and can be imported like this:
Crubadan is an NLTK corpus reader for ngram files provided
by the Crubadan project. It supports several languages.
>>> import nltk
>>> from nltk.corpus import crubadan
The name can be hard to remember so for more compact
and easy to remember code, try:
>>> from nltk.corpus import crubadan as cb
------------------------
Language support methods
------------------------
To get a list of supported languages as ISO 639-3 codes:
>>> crubadan.langs() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
['abk', 'abn', 'ace', ..., 'zne', 'zpa', 'zul']
To check if a single specific language is supported, check
membership in this list. For example, to check for English:
>>> 'Yes' if 'eng' in crubadan.langs() else 'No'
'Yes'
And for an unsupported language, say, Ghotuo ('aaa' in ISO 639-3):
>>> 'Yes' if 'aaa' in crubadan.langs() else 'No'
'No'
>>> crubadan.langs()
----------------------------------------
Language code mapping and helper methods
----------------------------------------
The web crawler that generated the 3-gram frequencies
actually works at the level of "writing systems" rather
than "languages". It assigned internal 2-3 letter
"writing system codes" that require mapping to the standard
ISO 639-3 codes.
For more info, please refer to the README in nltk/nltk_data/crubadan
folder after installing it.
To translate ISO 639-3 codes to "Crubadan Code", for example, English:
The web crawler that generated the 3-gram frequencies works at the
level of "writing systems" rather than languages. These are assigned
internal 2-3 letter "writing system codes" that require mapping to the
standard ISO 639-3 codes.
For details, please refer to the README in nltk_data/crubadan
folder after installing it.
To translate ISO 639-3 codes to "Crubadan Code":
>>> crubadan.iso_to_crubadan('eng')
'en'
And, French:
>>> crubadan.iso_to_crubadan('fra')
'fr'
Unsupported language returns None, using Ghotuo example from above:
>>> code_found = crubadan.iso_to_crubadan('aaa')
>>> code_found is None
True
>>> crubadan.iso_to_crubadan('aaa') is None
True
In reverse, print ISO 639-3 code if we have the Crubadan Code:
In reverse, print the ISO 639-3 code if we have the Crubadan Code for English:
>>> crubadan.crubadan_to_iso('en')
'eng'
Another example, French:
>>> crubadan.crubadan_to_iso('fr')
'fra'
>>> crubadan.crubadan_to_iso('aa') is None
True
---------------------------
Accessing ngram frequencies
---------------------------
On initialization, the reader will create a dictionary of every
language supported by the Crubadan project, mapping the ISO 639-3
language code to its corresponding ngram frequency.
The end result is a dictionary of FreqDists representing all the
languages that can be accessed via "all_lang_freq":
Unsupported language returns None. Crubadan does not
support Ghotuo, so let's continue using it as an example.
Say the theoretical Crubadan Code for the Ghotuo language
would have been 'aa', then:
>>> code_found = crubadan.crubadan_to_iso('aa')
>>> code_found is None
True
------------------------------------
N-gram and their frequencies methods
------------------------------------
On initialization the reader will create a dictionary of every
language supported by the Crubadan project, mapping the ISO 639-3
language code to its corresponding n-gram frequency.
The end result is a dictionary of FreqDists representing all the
languages that can be accessed via "all_lang_freq":
>>> crubadan.all_lang_freq # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
{'roh': FreqDist({'<da': 33783, ...})}
You can access individual language FreqDist and the n-grams within them as follows:
say you're interested in knowing the frequency of the n-gram 'the' in English:
You can access individual language FreqDist and the ngrams within them as follows:
say you're interested in knowing the frequency of the ngram 'the' in English:
>>> english_fd = crubadan.all_lang_freq['eng']
>>> english_fd['the']
728135
Alternatively, you may use the "lang_freq" method to retrieve the FreqDist
directly, simply pass the ISO 639-3 code for the language you're interested in:
Alternatively, you may use the "lang_freq" method to retrieve the FreqDist
directly, simply pass the ISO 639-3 code for the language you're interested in:
>>> crubadan.lang_freq('eng') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
FreqDist({'<th': 954187, 'the': 728135, 'he>': 655273, 'ing': 360758, 'nd>': 359481, 'ng>':
351373, '<an': 345633, '<to': 333091, 'ed>': 329517, '<of': 316431, ...})
A companion convenience method to above is "ngram_freq", which can be
used to retrieve a specific n-gram frequecy more explicitly. As usual,
pass in the language of interest by its ISO 639-3 code and the n-gram
of interest. Using the example above to get the frequency of 'the' in English:
A companion convenience method to above is "ngram_freq", which can be
used to retrieve a specific ngram frequency more explicitly. As usual,
pass in the language of interest by its ISO 639-3 code and the ngram
of interest. Using the example above to get the frequency of 'the' in English:
>>> crubadan.ngram_freq('eng', 'the')
728135
A n-gram that isn't found within the language will return 0:
An ngram that isn't found within the language will return 0:
>>> crubadan.ngram_freq('eng', 'test')
0
A language that isn't supported will throw an exception:
A language that isn't supported will raise an exception:
>>> crubadan.ngram_freq('elvish', 'test')
Traceback (most recent call last):
...
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment