edx / nltk · Commits · 92e5fca0

Commit 92e5fca0, authored May 18, 2015 by Steven Bird
Merge pull request #927 from avitalp/crubadan
Changes to reflect all modifications discussed in nltk#924
Parents: da73484f, 226624b8
Showing 2 changed files with 99 additions and 83 deletions:

  nltk/classify/textcat.py (+78, -70)
  nltk/corpus/reader/crubadan.py (+21, -13)
nltk/classify/textcat.py
@@ -28,33 +28,39 @@ For details about An Crubadan, see:
 http://borel.slu.edu/crubadan/index.html
 """

-# Ensure that your own literal strings default to unicode rather than str.
+# Ensure that literal strings default to unicode rather than str.
 from __future__ import print_function, unicode_literals

-import nltk
-from nltk.corpus import crubadan
+import nltk.compat
+from nltk.corpus import CrubadanCorpusReader
+from nltk.util import trigrams

 from nltk.tokenize import word_tokenize
 from nltk.probability import FreqDist

-from sys import maxint
+if nltk.compat.PY3:
+    from sys import maxsize
+else:
+    from sys import maxint

 # Note: this is NOT "re" you're likely used to. The regex module
 # is an alternative to the standard re module that supports
 # Unicode codepoint properties with the \p{} syntax.
 # You may have to "pip install regx"
 try:
-    import regex
+    import regex as re
 except ImportError:
-    pass
+    re = None

 ######################################################################
 ## Language identification using TextCat
 ######################################################################

 class TextCat(object):
     _corpus = None
     fingerprints = {}
-    _START_CHAR = "<".encode('utf8')
-    _END_CHAR = ">".encode('utf8')
+    _START_CHAR = "<"
+    _END_CHAR = ">"

     last_distances = {}
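The new import block exists because sys.maxint is gone on Python 3, so the code now picks sys.maxsize or sys.maxint depending on nltk.compat.PY3. A minimal standalone sketch of the same guard, using sys.version_info instead of the nltk.compat flag (the name "sentinel" is ours, purely for illustration):

    import sys

    if sys.version_info[0] >= 3:
        from sys import maxsize as sentinel   # Python 3: maxint no longer exists
    else:
        from sys import maxint as sentinel    # Python 2 only

    print(sentinel)  # 9223372036854775807 on a typical 64-bit build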
@@ -65,29 +71,14 @@ class TextCat(object):
"see https://pypi.python.org/pypi/regex for "
"further details."
)
self
.
_corpus
=
crubadan
def
trigrams
(
self
,
text
):
padded_text
=
self
.
_START_CHAR
+
text
+
self
.
_END_CHAR
trigrams
=
[]
# Generate 3-grams for given text
for
i
in
range
(
0
,
len
(
padded_text
)
-
2
):
cur_trigram
=
padded_text
[
i
:(
i
+
3
)]
if
len
(
cur_trigram
)
==
2
:
cur_trigram
=
cur_trigram
+
self
.
_END_CHAR
trigrams
.
append
(
cur_trigram
)
return
trigrams
def
_print_trigrams
(
self
,
trigrams
):
for
t
in
trigrams
:
print
(
t
)
self
.
_corpus
=
CrubadanCorpusReader
(
nltk
.
data
.
find
(
'corpora/crubadan'
),
'.*
\
.txt'
)
# Load all language ngrams into cache
for
lang
in
self
.
_corpus
.
langs
():
self
.
_corpus
.
lang_freq
(
lang
)
def
remove_punctuation
(
self
,
text
):
''' Get rid of punctuation except apostrophes '''
return
re
gex
.
sub
(
r"[^\P{P}\']+"
,
""
,
text
.
decode
(
'utf8'
)
)
return
re
.
sub
(
r"[^\P{P}\']+"
,
""
,
text
)
def
profile
(
self
,
text
):
''' Create FreqDist of trigrams within text '''
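The reworked remove_punctuation leans on the regex module's Unicode property classes: \p{P} matches any punctuation codepoint, so [^\P{P}\']+ matches runs of characters that are punctuation but not the apostrophe. A small standalone illustration, assuming regex is installed:

    import regex as re

    # Strip punctuation but keep apostrophes, as remove_punctuation does.
    text = "Don't panic!! (Really, don't.)"
    print(re.sub(r"[^\P{P}\']+", "", text))
    # -> Don't panic Really don't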
@@ -96,7 +87,9 @@ class TextCat(object):
         fingerprint = FreqDist()

         for t in tokens:
-            token_trigrams = self.trigrams(t)
+            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
+            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

             for cur_trigram in token_trigrams:
                 if cur_trigram in fingerprint:
                     fingerprint[cur_trigram] += 1
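The replacement code delegates trigram generation to nltk.util.trigrams, which yields character 3-tuples; joining each tuple reproduces the string trigrams that the removed TextCat.trigrams method built by hand. A quick sketch, assuming nltk is installed:

    from nltk.util import trigrams

    padded = '<' + 'cat' + '>'   # start/end markers, as with _START_CHAR/_END_CHAR
    print([''.join(tri) for tri in trigrams(padded)])
    # -> ['<ca', 'cat', 'at>']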
@@ -108,32 +101,41 @@ class TextCat(object):
     def calc_dist(self, lang, trigram, text_profile):
         ''' Calculate the "out-of-place" measure between the
             text and language profile for a single trigram '''

-        lang_fd = self._corpus.all_lang_freq[lang]
+        lang_fd = self._corpus.lang_freq(lang)
         dist = 0

         if trigram in lang_fd:
-            idx_lang_profile = lang_fd.keys().index(trigram)
-            idx_text = text_profile.keys().index(trigram)
+            idx_lang_profile = list(lang_fd.keys()).index(trigram)
+            idx_text = list(text_profile.keys()).index(trigram)

             #print(idx_lang_profile, ", ", idx_text)
             dist = abs(idx_lang_profile - idx_text)
         else:
             # Arbitrary but should be larger than
             # any possible trigram file length
             # in terms of total lines
-            dist = maxint
+            if nltk.compat.PY3:
+                dist = maxsize
+            else:
+                dist = maxint

         return dist

     def lang_dists(self, text):
         ''' Calculate the "out-of-place" measure between
             the text and all languages '''

         distances = {}
         profile = self.profile(text)
-        for lang in self._corpus.all_lang_freq.keys():
+        # For all the languages
+        for lang in self._corpus._all_lang_freq.keys():
             # Calculate distance metric for every trigram in
             # input text to be identified
             lang_dist = 0
             for trigram in profile:
                 lang_dist += self.calc_dist(lang, trigram, profile)

             distances[lang] = lang_dist

         return distances
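calc_dist is TextCat's "out-of-place" measure: a trigram's distance is the gap between its frequency rank in the text profile and in the language profile, with a very large penalty (maxint/maxsize) for trigrams the language model has never seen. A toy version of the rank arithmetic, assuming, as the code above does, that a profile's keys() come back ordered from most to least frequent:

    from nltk.probability import FreqDist

    # Both toy profiles are constructed already in frequency order.
    lang_profile = FreqDist({'the': 10, 'he>': 7, '<th': 5})
    text_profile = FreqDist({'he>': 4, 'the': 3, '<th': 2})

    dist = 0
    for tri in text_profile:
        rank_text = list(text_profile.keys()).index(tri)
        rank_lang = list(lang_profile.keys()).index(tri)
        dist += abs(rank_lang - rank_text)

    print(dist)  # 2: 'the' and 'he>' are each one rank out of place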
@@ -144,37 +146,43 @@ class TextCat(object):
         self.last_distances = self.lang_dists(text)

         return min(self.last_distances, key=self.last_distances.get)
+        #################################################')

-def demo():
-    ''' Demo of language guessing using a bunch of UTF-8 encoded
-    text files with snippets of text copied from news websites
-    around the web in different languages '''
-    from os import listdir
-    from os.path import isfile
-
-    path = '.'
-    lang_samples = []
-    tc = TextCat()
-
-    for f in listdir(path):
-        if isfile(f):
-            m = regex.match('sample_\w+\.txt', f)
-            if m:
-                lang_samples.append(f)
-
-    print(lang_samples)
-
-    for f in lang_samples:
-        cur_sample = open(f, 'rU')
-        cur_data = cur_sample.read()
-
-        print('Language sample file: ' + f)
-        print('Contents snippet: ' + cur_data.decode('utf8')[0:140])
-        print('#################################################')
-        print('Language detection: ' + tc.guess_language(cur_data))
-        print('#################################################')
-
-if __name__ == '__main__':
-    import doctest
-    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
+def demo(self):
+    from nltk.corpus import udhr
+
+    langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
+             'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
+             'Serbian_Srpski-UTF8', 'Esperanto-UTF8']
+
+    friendly = {'kmr': 'Northern Kurdish',
+                'abk': 'Abkhazian',
+                'pes': 'Iranian Persian',
+                'hin': 'Hindi',
+                'haw': 'Hawaiian',
+                'rus': 'Russian',
+                'vie': 'Vietnamese',
+                'srp': 'Serbian',
+                'epo': 'Esperanto'}
+
+    for cur_lang in langs:
+        # Get raw data from UDHR corpus
+        raw_sentences = udhr.sents(cur_lang)
+        rows = len(raw_sentences) - 1
+        cols = list(map(len, raw_sentences))
+
+        sample = ''
+
+        # Generate a sample text of the language
+        for i in range(0, rows):
+            cur_sent = ''
+            for j in range(0, cols[i]):
+                cur_sent += ' ' + raw_sentences[i][j]
+
+            sample += cur_sent
+
+        # Try to detect what it is
+        print('Language snippet: ' + sample[0:140] + '...')
+        guess = self.guess_language(sample)
+        print('Language detection: %s (%s)' % (guess, friendly[guess]))
+        print('#' * 140)
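With the corpus wired in through CrubadanCorpusReader, the classifier is driven roughly as below. This assumes the crubadan corpus data has been downloaded (e.g. via nltk.download('crubadan')), that the punkt tokenizer model is available for word_tokenize, and that the regex module is installed; the printed ISO 639-3 code is only indicative:

    from nltk.classify.textcat import TextCat

    tc = TextCat()   # the constructor now pre-loads every language's trigram table
    print(tc.guess_language('Le renard brun rapide'))   # e.g. 'fra'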
nltk/corpus/reader/crubadan.py
@@ -22,8 +22,8 @@ http://borel.slu.edu/crubadan/index.html
 from __future__ import print_function, unicode_literals

-import re
+import nltk.compat

 from os import path
 from nltk.corpus.reader import CorpusReader
 from nltk.probability import FreqDist
 from nltk.data import ZipFilePathPointer
@@ -58,13 +58,13 @@ class CrubadanCorpusReader(CorpusReader):
         ''' Return internal Crubadan code based on ISO 639-3 code '''
         for i in self._lang_mapping_data:
             if i[1].lower() == lang.lower():
-                return unicode(i[0])
+                return i[0]

     def crubadan_to_iso(self, lang):
         ''' Return ISO 639-3 code given internal Crubadan code '''
         for i in self._lang_mapping_data:
             if i[0].lower() == lang.lower():
-                return unicode(i[1])
+                return i[1]

     def _load_lang_mapping_data(self):
         ''' Load language mappings between codes and description from table.txt '''
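These two helpers now hand back entries from table.txt directly rather than coercing them through unicode(), which does not exist on Python 3 (the module's unicode_literals import already makes the entries text). A hypothetical round trip, with illustrative codes since the real values come from the corpus's mapping table:

    from nltk.corpus import crubadan

    internal = crubadan.iso_to_crubadan('eng')   # internal Crubadan code, e.g. 'en'
    print(crubadan.crubadan_to_iso(internal))    # back to ISO 639-3, e.g. 'eng'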
@@ -74,8 +74,12 @@ class CrubadanCorpusReader(CorpusReader):
         mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
         if self._LANG_MAPPER_FILE not in self.fileids():
             raise RuntimeError("Could not find language mapper file: " + mapper_file)

-        raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
+        if nltk.compat.PY3:
+            raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
+        else:
+            raw = open(mapper_file, 'rU').read().decode('utf-8').strip()

         self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]

     def _load_lang_ngrams(self, lang):
@@ -83,22 +87,27 @@ class CrubadanCorpusReader(CorpusReader):
         and return its FreqDist '''

         crubadan_code = self.iso_to_crubadan(lang)
-        ngram_file = path.join(self.root, unicode(crubadan_code) + '-3grams.txt')
+        ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

         if not path.isfile(ngram_file):
             raise Runtime("Could not find language n-gram file for " + lang)

         counts = FreqDist()
-        f = open(ngram_file, 'rU')
+        if nltk.compat.PY3:
+            f = open(ngram_file, 'r', encoding='utf-8')
+        else:
+            f = open(ngram_file, 'rU')

         for line in f:
-            data = line.decode('utf-8').split(' ')
+            if nltk.compat.PY3:
+                data = line.split(' ')
+            else:
+                data = line.decode('utf8').split(' ')

             ngram = data[1].strip('\n')
             freq = int(data[0])

             counts[ngram] = freq

         return counts
\ No newline at end of file
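For reference, each <code>-3grams.txt file parsed above is plain text with one "<count> <trigram>" pair per line (the classifier's ranking logic assumes most frequent first). A condensed, Python 3-only version of the parsing step, using a made-up file name:

    from nltk.probability import FreqDist

    counts = FreqDist()
    with open('en-3grams.txt', encoding='utf-8') as f:   # hypothetical file
        for line in f:
            data = line.split(' ')                       # "<count> <trigram>"
            counts[data[1].strip('\n')] = int(data[0])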