Commit 03c7a9fa by Steven Bird

Merge pull request #890 from avitalp/crubadan

Added corpus reader for n-gram frequencies #845 #884
parents 383173aa 82cc85bd
@@ -95,3 +95,4 @@ from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding,
TypedMaxentFeatureEncoding,
ConditionalExponentialClassifier)
from nltk.classify.senna import Senna
from nltk.classify.textcat import TextCat
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".
The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and the text to
be identified, then compares them using a distance measure.
Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.
For details regarding the algorithm, see:
http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
For details about An Crubadan, see:
http://borel.slu.edu/crubadan/index.html
"""
# Ensure that your own literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals
import nltk
from nltk.corpus import CrubadanCorpusReader
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sys import maxint
# Note: this is NOT "re" you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex"
import regex as re
######################################################################
## Language identification using TextCat
######################################################################
class TextCat(object):
_corpus = None
fingerprints = {}
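    # '<' and '>' pad each token below so that word-boundary trigrams
    # (e.g. '<th', 'he>') stay distinct from word-internal ones.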
_START_CHAR = "<".encode('utf8')
_END_CHAR = ">".encode('utf8')
last_distances = {}
def __init__(self):
self._corpus = CrubadanCorpusReader(nltk.data.find('corpora/crubadan'), '.*\.txt')
def trigrams(self, text):
padded_text = self._START_CHAR + text + self._END_CHAR
trigrams = []
# Generate 3-grams for given text
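        # e.g. 'to' is padded to '<to>' and yields the trigrams '<to' and 'to>'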
for i in range(0, len(padded_text) - 2):
cur_trigram = padded_text[i:(i + 3)]
if len(cur_trigram) == 2:
cur_trigram = cur_trigram + self._END_CHAR
trigrams.append(cur_trigram)
return trigrams
def _print_trigrams(self, trigrams):
for t in trigrams:
print(t)
def remove_punctuation(self, text):
''' Get rid of punctuation except apostrophes '''
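        # \P{P} matches any character that is NOT punctuation, so the
        # negated class [^\P{P}\'] matches punctuation except apostrophes.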
return re.sub(ur"[^\P{P}\']+", "", text.decode('utf8'))
def profile(self, text):
''' Create FreqDist of trigrams within text '''
clean_text = self.remove_punctuation(text)
tokens = word_tokenize(clean_text)
fingerprint = FreqDist()
for t in tokens:
token_trigrams = self.trigrams(t)
for cur_trigram in token_trigrams:
if cur_trigram in fingerprint:
fingerprint[cur_trigram] += 1
else:
fingerprint[cur_trigram] = 1
return fingerprint
def calc_dist(self, lang, trigram, text_profile):
''' Calculate the "out-of-place" measure between the
text and language profile for a single trigram '''
lang_fd = self._corpus.all_lang_freq[lang]
dist = 0
if trigram in lang_fd:
idx_lang_profile = lang_fd.keys().index(trigram)
idx_text = text_profile.keys().index(trigram)
dist = abs(idx_lang_profile - idx_text)
else:
# Arbitrary but should be larger than
# any possible trigram file length
# in terms of total lines
dist = maxint
return dist
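    # Worked example (illustrative numbers, not corpus values): if 'the' is
    # at index 2 of the language profile's keys but at index 6 of the text
    # profile's keys, its out-of-place distance is abs(2 - 6) = 4; a trigram
    # missing from the language profile contributes maxint (from sys) instead.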
def lang_dists(self, text):
''' Calculate the "out-of-place" measure between
the text and all languages '''
distances = {}
profile = self.profile(text)
# For all the languages
for lang in self._corpus.all_lang_freq.keys():
# Calculate distance metric for every trigram in
# input text to be identified
lang_dist = 0
for trigram in profile:
lang_dist += self.calc_dist(lang, trigram, profile)
distances[lang] = lang_dist
return distances
def guess_language(self, text):
''' Find the language with the min distance
to the text and return its ISO 639-3 code '''
self.last_distances = self.lang_dists(text)
        return min(self.last_distances, key=self.last_distances.get)
def demo(self):
''' Demo of language guessing using a bunch of UTF-8 encoded
text files with snippets of text copied from news websites
around the web in different languages '''
from os import listdir
from os.path import isfile
# Current dir
path = '.'
lang_samples = []
for f in listdir(path):
if isfile(f):
m = re.match('sample_\w+\.txt', f)
if m: lang_samples.append(f)
print(lang_samples)
for f in lang_samples:
cur_sample = open(f, 'rU')
cur_data = cur_sample.read()
print('Language sample file: ' + f)
print('Contents snippet: ' + cur_data.decode('utf8')[0:140])
print('#################################################')
print('Language detection: ' + self.guess_language(cur_data))
print('#################################################')
@@ -44,103 +44,34 @@ class CrubadanCorpusReader(CorpusReader):
super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8')
self._lang_mapping_data = []
self._load_lang_mapping_data()
        self._load_all_ngrams()

    def lang_freq(self, lang):
        ''' Return n-gram FreqDist for a specific language
            given ISO 639-3 language code '''
        return self.all_lang_freq[lang]
def ngram_freq(self, lang, ngram):
''' Return n-gram frequency as integer given
an ISO 639-3 language code and n-gram '''
if lang not in self.all_lang_freq:
raise CrubadanError("Unsupproted language [" + lang + "].")
raise CrubadanError("Unsupported language [" + lang + "].")
lf = self.all_lang_freq[lang]
return lf[ngram]
    def langs(self):
        ''' Return a list of supported languages as ISO 639-3 codes '''
        supported = []
        for i in self._lang_mapping_data:
            supported.append(i[1])
        return supported
def lang_supported(self, lang):
''' Check if a language is supported (language passed in as ISO 639-3 code) '''
for i in self._lang_mapping_data:
if i[1].lower() == lang.lower():
return True
return False
def iso_to_friendly(self, lang):
        ''' Return human-friendly name for a language based on ISO 639-3 code '''
for i in self._lang_mapping_data:
if i[1].lower() == lang.lower():
return unicode(i[2])
return None
def friendly_to_iso(self, lang):
        ''' Return ISO 639-3 code from human-friendly language name (e.g. "English" -> "eng") '''
for i in self._lang_mapping_data:
if i[2].lower() == lang.lower():
return unicode(i[1])
def iso_to_crubadan(self, lang):
''' Return internal Crubadan code based on ISO 639-3 code '''
for i in self._lang_mapping_data:
@@ -171,6 +102,55 @@ class CrubadanCorpusReader(CorpusReader):
if self._lang_mapping_data[ len(self._lang_mapping_data) - 1 ] == [u'']:
self._lang_mapping_data.pop()
def _load_lang_ngrams(self, lang):
''' Load single n-gram language file given the ISO 639-3 language code
and return its FreqDist '''
crubadan_code = self.iso_to_crubadan(lang)
ngram_file = self.root + '/' + unicode(crubadan_code) + '-3grams.txt'
import os.path
if not os.path.isfile(ngram_file):
raise CrubadanError("Could not find language n-gram file for [" + lang + "].")
counts = FreqDist()
f = open(ngram_file, 'rU')
for line in f:
data = line.decode('utf-8').split(u' ')
ngram = data[1].strip('\n')
freq = int(data[0])
counts[ngram] = freq
return counts
def _load_all_ngrams(self):
''' Create a dictionary of every supported language mapping
the ISO 639-3 language code to its corresponding n-gram
FreqDist. The result can be accessed via "all_lang_freq" var '''
# Filter out non n-gram files from the corpus dir
valid_files = []
for f in self.fileids():
m = re.search('(\w+)' + re.escape("-3grams.txt"), f)
if m:
valid_files.append( m.group() )
for f in valid_files:
ngram_file = self.root + '/' + f
import os.path
if os.path.isfile(ngram_file):
crubadan_code = f.split('-')[0]
iso_code = self.crubadan_to_iso(crubadan_code)
fd = self._load_lang_ngrams(iso_code)
self.all_lang_freq[iso_code] = fd
def _is_utf8(self, str):
''' Check if a string is utf8 encoded '''
try:
......
@@ -4,8 +4,8 @@
Crubadan Corpus Reader
======================
Crubadan is an NLTK corpus reader for n-gram files provided
by the Crubadan project, and can be imported like this:
>>> import nltk
>>> from nltk.corpus import crubadan
@@ -15,56 +15,28 @@ and easy to remember code, try:
>>> from nltk.corpus import crubadan as cb
------------------------
Language support methods
------------------------
To get a list of supported languages as ISO 639-3 codes:

    >>> crubadan.langs() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    ['abk', 'abn', 'ace', ..., 'zne', 'zpa', 'zul']
To check if a single specific language is supported, check
membership in this list. For example, to check for English:

    >>> 'Yes' if 'eng' in crubadan.langs() else 'No'
    'Yes'

And for an unsupported language, say, Ghotuo ('aaa' in ISO 639-3):

    >>> 'Yes' if 'aaa' in crubadan.langs() else 'No'
    'No'
----------------------------------------
Language code mapping and helper methods
----------------------------------------
Print the human-friendly name for a language based on its ISO 639-3 code.
For reference, a combined view of the language code tables of
ISO 639 parts 1, 2, and 3 may be viewed at:
http://www-01.sil.org/iso639-3/codes.asp
>>> crubadan.iso_to_friendly('eng')
u'English'
>>> crubadan.iso_to_friendly('fra')
u'French'
An unsupported language returns None; for Ghotuo:
>>> name_found = crubadan.iso_to_friendly('aaa')
>>> name_found is None
True
Now backwards: print the ISO 639-3 code given the human-friendly name.
>>> crubadan.friendly_to_iso('English')
u'eng'
>>> crubadan.friendly_to_iso('French')
u'fra'
Unsupported language returns None, using Ghotuo again:
>>> code_found = crubadan.friendly_to_iso('Ghotuo')
>>> code_found is None
True
The web crawler that generated the 3-gram frequencies
actually works at the level of "writing systems" rather
@@ -75,26 +47,31 @@ Language code mapping and helper methods
For more info, please refer to the README in nltk/nltk_data/crubadan
folder after installing it.
To translate ISO 639-3 codes to "Crubadan Code", for example, English:

    >>> crubadan.iso_to_crubadan('eng')
    'en'

And, French:

    >>> crubadan.iso_to_crubadan('fra')
    'fr'

Unsupported language returns None, using the Ghotuo example from above:

    >>> code_found = crubadan.iso_to_crubadan('aaa')
    >>> code_found is None
    True

In reverse; print the ISO 639-3 code if we have the Crubadan Code for English:

    >>> crubadan.crubadan_to_iso('en')
    'eng'

Another example, French:

    >>> crubadan.crubadan_to_iso('fr')
    'fra'

Unsupported language returns None. Crubadan does not
support Ghotuo, so let's continue using it as an example.
Say the theoretical Crubadan Code for the Ghotuo language
would have been 'aa', then:

    >>> code_found = crubadan.crubadan_to_iso('aa')
    >>> code_found is None
    True
@@ -103,57 +80,37 @@ Language code mapping and helper methods
N-gram frequency methods
------------------------
On initialization the reader will create a dictionary of every
language supported by the Crubadan project, mapping the ISO 639-3
language code to its corresponding n-gram frequency.

The end result is a dictionary of FreqDists representing all the
languages, which can be accessed via "all_lang_freq":

    >>> crubadan.all_lang_freq # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    {'roh': FreqDist({'<da': 33783, ...})}

You can access an individual language's FreqDist and the n-grams within
it as follows. Say you're interested in knowing the frequency of the
n-gram 'the' in English:

    >>> english_fd = crubadan.all_lang_freq['eng']
    >>> english_fd['the']
    728135
Alternatively, you may use the "lang_freq" method to retrieve the FreqDist
directly: simply pass the ISO 639-3 code for the language you're interested in:

    >>> crubadan.lang_freq('eng') # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    FreqDist({'<th': 954187, 'the': 728135, 'he>': 655273, 'ing': 360758, 'nd>': 359481, 'ng>':
    351373, '<an': 345633, '<to': 333091, 'ed>': 329517, '<of': 316431, ...})
A companion convenience method to the above is "ngram_freq", which can be
used to retrieve a specific n-gram frequency more explicitly. As usual,
pass in the language of interest by its ISO 639-3 code and the n-gram
of interest. Using the example above to get the frequency of 'the' in English:

    >>> crubadan.ngram_freq('eng', 'the')
    728135

An n-gram that isn't found within the language will return 0:

    >>> crubadan.ngram_freq('eng', 'test')
    0
@@ -162,5 +119,4 @@ N-gram frequency methods
    >>> crubadan.ngram_freq('elvish', 'test')
    Traceback (most recent call last):
    ...
    CrubadanError: Unsupported language [elvish]