Commit 92e5fca0 by Steven Bird

Merge pull request #927 from avitalp/crubadan

Changes to reflect all modifications discussed in nltk#924
parents da73484f 226624b8
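
The diff below reworks two pieces of NLTK: the TextCat language guesser and the Crubadan corpus reader it reads its trigram tables from. For orientation, TextCat follows the n-gram text categorization scheme of Cavnar & Trenkle (1994): the input text and every candidate language are each reduced to a frequency-ranked trigram profile, and the guessed language is the one whose profile minimizes the summed "out-of-place" rank distance. A minimal sketch of that measure, using hypothetical rank dictionaries rather than the FreqDist-based structures in the diff:

    # Sketch only: text_ranks and lang_ranks map trigram -> rank position,
    # and penalty plays the role of sys.maxsize below. None of these names
    # are NLTK API.
    def out_of_place(text_ranks, lang_ranks, penalty):
        dist = 0
        for tri, rank in text_ranks.items():
            if tri in lang_ranks:
                # Known trigram: cost is how far its rank has moved
                dist += abs(lang_ranks[tri] - rank)
            else:
                # Trigram absent from the language profile: maximal cost
                dist += penalty
        return dist

The calc_dist and lang_dists methods changed below implement this loop one trigram at a time.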
@@ -28,33 +28,39 @@ For details about An Crubadan, see:
 http://borel.slu.edu/crubadan/index.html
 """
-# Ensure that your own literal strings default to unicode rather than str.
+# Ensure that literal strings default to unicode rather than str.
 from __future__ import print_function, unicode_literals

 import nltk
+from nltk.corpus import crubadan
 import nltk.compat
-from nltk.corpus import CrubadanCorpusReader
+from nltk.util import trigrams
 from nltk.tokenize import word_tokenize
 from nltk.probability import FreqDist
-from sys import maxint
+if nltk.compat.PY3:
+    from sys import maxsize
+else:
+    from sys import maxint

 # Note: this is NOT "re" you're likely used to. The regex module
 # is an alternative to the standard re module that supports
 # Unicode codepoint properties with the \p{} syntax.
 # You may have to "pip install regex"
 try:
-    import regex
+    import regex as re
 except ImportError:
-    pass
+    re = None

 ######################################################################
 ## Language identification using TextCat
 ######################################################################

 class TextCat(object):

     _corpus = None
     fingerprints = {}
-    _START_CHAR = "<".encode('utf8')
-    _END_CHAR = ">".encode('utf8')
+    _START_CHAR = "<"
+    _END_CHAR = ">"

     last_distances = {}
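
One note on the hunk above: the third-party regex module is needed because it supports Unicode property classes such as \p{P} (punctuation), which remove_punctuation below relies on; binding it to the name re keeps the call sites short, while re = None lets __init__ fail with a helpful message instead of a NameError. A quick illustration of the property-class syntax, assuming regex is installed:

    import regex
    # [^\P{P}\']+ matches runs of punctuation, apostrophes excepted,
    # so substituting '' strips them while keeping contractions intact.
    print(regex.sub(r"[^\P{P}\']+", "", "don't, stop!"))  # -> "don't stop"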
@@ -65,29 +71,14 @@ class TextCat(object):
                                "see https://pypi.python.org/pypi/regex for "
                                "further details.")
+        self._corpus = crubadan

-    def trigrams(self, text):
-        padded_text = self._START_CHAR + text + self._END_CHAR
-        trigrams = []
-
-        # Generate 3-grams for given text
-        for i in range(0, len(padded_text) - 2):
-            cur_trigram = padded_text[i:(i + 3)]
-            if len(cur_trigram) == 2:
-                cur_trigram = cur_trigram + self._END_CHAR
-            trigrams.append(cur_trigram)
-
-        return trigrams
-
-    def _print_trigrams(self, trigrams):
-        for t in trigrams:
-            print(t)
-
-        self._corpus = CrubadanCorpusReader(nltk.data.find('corpora/crubadan'), '.*\.txt')
+
+        # Load all language ngrams into cache
+        for lang in self._corpus.langs():
+            self._corpus.lang_freq(lang)

     def remove_punctuation(self, text):
         ''' Get rid of punctuation except apostrophes '''
-        return regex.sub(r"[^\P{P}\']+", "", text.decode('utf8'))
+        return re.sub(r"[^\P{P}\']+", "", text)

     def profile(self, text):
         ''' Create FreqDist of trigrams within text '''
@@ -96,7 +87,9 @@ class TextCat(object):
         fingerprint = FreqDist()

         for t in tokens:
-            token_trigrams = self.trigrams(t)
+            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
+            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
+
             for cur_trigram in token_trigrams:
                 if cur_trigram in fingerprint:
                     fingerprint[cur_trigram] += 1
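
The hunk above swaps the class's hand-rolled trigram generator for nltk.util.trigrams, which yields 3-tuples of characters rather than 3-character strings; the ''.join(tri) step restores the string form the fingerprint counts. For example:

    from nltk.util import trigrams
    tups = list(trigrams('<cat>'))
    # [('<', 'c', 'a'), ('c', 'a', 't'), ('a', 't', '>')]
    print([''.join(t) for t in tups])  # ['<ca', 'cat', 'at>']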
@@ -108,32 +101,41 @@ class TextCat(object):
     def calc_dist(self, lang, trigram, text_profile):
         ''' Calculate the "out-of-place" measure between the
             text and language profile for a single trigram '''
-        lang_fd = self._corpus.all_lang_freq[lang]
+        lang_fd = self._corpus.lang_freq(lang)
         dist = 0

         if trigram in lang_fd:
-            idx_lang_profile = lang_fd.keys().index(trigram)
-            idx_text = text_profile.keys().index(trigram)
+            idx_lang_profile = list(lang_fd.keys()).index(trigram)
+            idx_text = list(text_profile.keys()).index(trigram)
+
+            #print(idx_lang_profile, ", ", idx_text)
             dist = abs(idx_lang_profile - idx_text)
         else:
             # Arbitrary but should be larger than
             # any possible trigram file length
             # in terms of total lines
-            dist = maxint
+            if nltk.compat.PY3:
+                dist = maxsize
+            else:
+                dist = maxint

         return dist

     def lang_dists(self, text):
         ''' Calculate the "out-of-place" measure between
             the text and all languages '''
         distances = {}
         profile = self.profile(text)
-        for lang in self._corpus.all_lang_freq.keys():
+        # For all the languages
+        for lang in self._corpus._all_lang_freq.keys():
+            # Calculate distance metric for every trigram in
+            # input text to be identified
             lang_dist = 0
             for trigram in profile:
                 lang_dist += self.calc_dist(lang, trigram, profile)

             distances[lang] = lang_dist

         return distances
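
The list(...) wrappers above are the Python 3 fix this hunk is really about: on Python 3, dict.keys() returns a view object that is iterable but has no .index method, so rank lookup must go through an explicit list. In miniature:

    d = {'abc': 3, 'xyz': 1}
    # d.keys().index('xyz') raises AttributeError on Python 3;
    # materializing the view first restores positional lookup.
    rank = list(d.keys()).index('xyz')  # 1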
@@ -144,37 +146,43 @@ class TextCat(object):
         self.last_distances = self.lang_dists(text)

         return min(self.last_distances, key=self.last_distances.get)
+        #################################################')

+    def demo(self):
+        from nltk.corpus import udhr
+
+        langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
+                 'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
+                 'Serbian_Srpski-UTF8', 'Esperanto-UTF8']
+
+        friendly = {'kmr': 'Northern Kurdish',
+                    'abk': 'Abkhazian',
+                    'pes': 'Iranian Persian',
+                    'hin': 'Hindi',
+                    'haw': 'Hawaiian',
+                    'rus': 'Russian',
+                    'vie': 'Vietnamese',
+                    'srp': 'Serbian',
+                    'epo': 'Esperanto'}
+
-def demo():
-    ''' Demo of language guessing using a bunch of UTF-8 encoded
-    text files with snippets of text copied from news websites
-    around the web in different languages '''
-    from os import listdir
-    from os.path import isfile
-
-    path = '.'
-    lang_samples = []
-    tc = TextCat()
-
-    for f in listdir(path):
-        if isfile(f):
-            m = regex.match('sample_\w+\.txt', f)
-            if m: lang_samples.append(f)
-
-    print(lang_samples)
-
-    for f in lang_samples:
-        cur_sample = open(f, 'rU')
-        cur_data = cur_sample.read()
-
-        print('Language sample file: ' + f)
-        print('Contents snippet: ' + cur_data.decode('utf8')[0:140])
-        print('#################################################')
-        print('Language detection: ' + tc.guess_language(cur_data))
-        print('#################################################')
-
-if __name__ == '__main__':
-    import doctest
-    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE |
-                    doctest.ELLIPSIS)
+        for cur_lang in langs:
+            # Get raw data from UDHR corpus
+            raw_sentences = udhr.sents(cur_lang)
+            rows = len(raw_sentences) - 1
+            cols = list(map(len, raw_sentences))
+
+            sample = ''
+
+            # Generate a sample text of the language
+            for i in range(0, rows):
+                cur_sent = ''
+                for j in range(0, cols[i]):
+                    cur_sent += ' ' + raw_sentences[i][j]
+
+                sample += cur_sent
+
+            # Try to detect what it is
+            print('Language snippet: ' + sample[0:140] + '...')
+            guess = self.guess_language(sample)
+            print('Language detection: %s (%s)' % (guess, friendly[guess]))
+            print('#' * 140)
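
With the crubadan corpus downloaded and the regex module installed, the reworked class is exercised like this (the module path and the result code are assumptions based on NLTK's usual layout, not shown in this diff):

    import nltk
    nltk.download('crubadan')  # one-time fetch of the trigram tables
    nltk.download('udhr')      # only needed for demo()

    from nltk.classify.textcat import TextCat  # assumed module path
    tc = TextCat()
    # Returns an ISO 639-3 code, e.g. 'hun' for Hungarian input
    print(tc.guess_language('Minden emberi lény szabadon születik'))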
@@ -22,8 +22,8 @@ http://borel.slu.edu/crubadan/index.html
 from __future__ import print_function, unicode_literals

 import re
+import nltk.compat
 from os import path

 from nltk.corpus.reader import CorpusReader
 from nltk.probability import FreqDist
 from nltk.data import ZipFilePathPointer
@@ -58,13 +58,13 @@ class CrubadanCorpusReader(CorpusReader):
         ''' Return internal Crubadan code based on ISO 639-3 code '''
         for i in self._lang_mapping_data:
             if i[1].lower() == lang.lower():
-                return unicode(i[0])
+                return i[0]

     def crubadan_to_iso(self, lang):
         ''' Return ISO 639-3 code given internal Crubadan code '''
         for i in self._lang_mapping_data:
             if i[0].lower() == lang.lower():
-                return unicode(i[1])
+                return i[1]

     def _load_lang_mapping_data(self):
         ''' Load language mappings between codes and description from table.txt '''
@@ -74,8 +74,12 @@ class CrubadanCorpusReader(CorpusReader):
         mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
         if self._LANG_MAPPER_FILE not in self.fileids():
             raise RuntimeError("Could not find language mapper file: " + mapper_file)

-        raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
+        if nltk.compat.PY3:
+            raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
+        else:
+            raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
+
         self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]

     def _load_lang_ngrams(self, lang):
@@ -83,22 +87,27 @@ class CrubadanCorpusReader(CorpusReader):
             and return its FreqDist '''
         crubadan_code = self.iso_to_crubadan(lang)
-        ngram_file = path.join(self.root, unicode(crubadan_code) + '-3grams.txt')
+        ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

         if not path.isfile(ngram_file):
             raise RuntimeError("Could not find language n-gram file for " + lang)

         counts = FreqDist()
-        f = open(ngram_file, 'rU')
+        if nltk.compat.PY3:
+            f = open(ngram_file, 'r', encoding='utf-8')
+        else:
+            f = open(ngram_file, 'rU')
+
         for line in f:
-            data = line.decode('utf-8').split(' ')
+            if nltk.compat.PY3:
+                data = line.split(' ')
+            else:
+                data = line.decode('utf8').split(' ')
+
             ngram = data[1].strip('\n')
             freq = int(data[0])
             counts[ngram] = freq

         return counts
\ No newline at end of file
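
A quick smoke test of the reader changes, using only accessors that appear in the diff (iso_to_crubadan, lang_freq); the printed values are illustrative, not taken from the corpus:

    from nltk.corpus import crubadan
    print(crubadan.iso_to_crubadan('eng'))  # internal Crubadan code, e.g. 'en'
    fd = crubadan.lang_freq('eng')          # FreqDist of trigram counts
    print(fd.most_common(3))                # top trigrams with their frequencies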