Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
N
nltk
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
nltk
Commits
dad5418c
Commit
dad5418c
authored
Feb 11, 2015
by
Steven Bird
Browse files
Options
Browse Files
Download
Plain Diff
resolve merge conflict
parents
38ee7a90
4e3f9cc1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
184 additions
and
2 deletions
+184
-2
nltk/corpus/__init__.py
+2
-0
nltk/corpus/reader/__init__.py
+2
-2
nltk/corpus/reader/langid.py
+180
-0
No files found.
nltk/corpus/__init__.py
View file @
dad5418c
...
@@ -65,6 +65,8 @@ from nltk.tokenize import RegexpTokenizer
...
@@ -65,6 +65,8 @@ from nltk.tokenize import RegexpTokenizer
from
nltk.corpus.util
import
LazyCorpusLoader
from
nltk.corpus.util
import
LazyCorpusLoader
from
nltk.corpus.reader
import
*
from
nltk.corpus.reader
import
*
# Lazily-loaded An Crubadan n-gram corpus; every ``.txt`` file under the
# corpus root is exposed as a fileid.  The pattern is a raw string so that
# ``\.`` reaches the regex engine as an escaped dot instead of being an
# invalid string escape.
crubadan = LazyCorpusLoader(
    'crubadan', CrubadanCorpusReader, r'.*\.txt')
abc
=
LazyCorpusLoader
(
abc
=
LazyCorpusLoader
(
'abc'
,
PlaintextCorpusReader
,
r'(?!\.).*\.txt'
,
encoding
=
[
'abc'
,
PlaintextCorpusReader
,
r'(?!\.).*\.txt'
,
encoding
=
[
(
'science'
,
'latin_1'
),
(
'science'
,
'latin_1'
),
...
...
nltk/corpus/reader/__init__.py
View file @
dad5418c
...
@@ -93,7 +93,7 @@ from nltk.corpus.reader.framenet import *
...
@@ -93,7 +93,7 @@ from nltk.corpus.reader.framenet import *
from
nltk.corpus.reader.udhr
import
*
from
nltk.corpus.reader.udhr
import
*
from
nltk.corpus.reader.bnc
import
*
from
nltk.corpus.reader.bnc
import
*
from
nltk.corpus.reader.sentiwordnet
import
*
from
nltk.corpus.reader.sentiwordnet
import
*
from
nltk.corpus.reader.langid
import
*
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
# the function bracket_parse() defined in nltk.tree:
from
nltk.corpus.reader
import
bracket_parse
from
nltk.corpus.reader
import
bracket_parse
...
@@ -128,5 +128,5 @@ __all__ = [
...
@@ -128,5 +128,5 @@ __all__ = [
'TimitTaggedCorpusReader'
,
'LinThesaurusCorpusReader'
,
'TimitTaggedCorpusReader'
,
'LinThesaurusCorpusReader'
,
'SemcorCorpusReader'
,
'FramenetCorpusReader'
,
'UdhrCorpusReader'
,
'SemcorCorpusReader'
,
'FramenetCorpusReader'
,
'UdhrCorpusReader'
,
'BNCCorpusReader'
,
'SentiWordNetCorpusReader'
,
'SentiSynset'
,
'BNCCorpusReader'
,
'SentiWordNetCorpusReader'
,
'SentiSynset'
,
'NKJPCorpusReader'
'NKJPCorpusReader'
,
'CrubadanCorpusReader'
]
]
nltk/corpus/reader/langid.py
0 → 100644
View file @
dad5418c
# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Crubadan N-grams Reader
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for the n-gram statistics gathered from
the corpora for each language using An Crubadan.
There are multiple potential applications for the data but
this reader was created with the goal of using it in the
context of language identification.
For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html
"""
# Ensure that your own literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals

import io
import os
import re
from re import escape, search

from nltk.corpus.reader import CorpusReader
from nltk.probability import FreqDist
######################################################################
## An Crubadan N-gram Corpus Reader
######################################################################
class CrubadanCorpusReader(CorpusReader):
    """
    A corpus reader used to access language An Crubadan n-gram files.

    Two kinds of files are read from the corpus root:

    - ``table.txt``: a tab-separated mapping table whose first three
      columns are the internal Crubadan code, the ISO 639-3 code and a
      human-friendly language name.
    - ``<crubadan code>-3grams.txt``: one file per language, each line
      holding a frequency and an n-gram separated by a single space.

    Both kinds of files are always decoded as UTF-8.
    """

    _LANG_MAPPER_FILE = 'table.txt'
    _NGRAM_FILE_SUFFIX = '-3grams.txt'

    # Column positions inside one row of the mapping table.
    _CRUBADAN_COLUMN = 0
    _ISO_COLUMN = 1
    _FRIENDLY_COLUMN = 2

    # Class-level default kept so the attribute still exists on the class;
    # __init__ gives every instance its own dictionary so that loaded
    # n-grams are not shared between readers.
    all_lang_freq = {}

    def __init__(self, root, fileids, encoding='utf8', tagset=None):
        """
        Create the reader and load the language mapping table.

        :param root: corpus root, forwarded to ``CorpusReader``.
        :param fileids: fileid specification, forwarded to ``CorpusReader``.
        :param encoding: accepted for signature compatibility only; the
            value is not used, ``'utf8'`` is always passed on.
        :param tagset: accepted for signature compatibility only; unused.
        :raises CrubadanError: if the mapping table is not among the
            corpus fileids.
        """
        super(CrubadanCorpusReader, self).__init__(
            root, fileids, encoding='utf8')
        self.all_lang_freq = {}
        self._lang_mapping_data = []
        self._load_lang_mapping_data()

    def load_all_ngrams(self):
        """
        Create a dictionary of every supported language mapping
        the ISO 639-3 language code to its corresponding n-gram
        FreqDist. The result can be accessed via "all_lang_freq" var.

        Fileids that are not n-gram files, that do not exist directly
        under the corpus root, or whose Crubadan code has no entry in
        the mapping table are skipped.
        """
        suffix = self._NGRAM_FILE_SUFFIX
        for fileid in self.fileids():
            # Filter out non n-gram files from the corpus dir
            if not fileid.endswith(suffix):
                continue

            file_name = os.path.basename(fileid)
            if not os.path.isfile(self.root + '/' + file_name):
                continue

            # Strip the suffix instead of splitting on '-', so that a
            # code which itself contains '-' is kept whole.
            crubadan_code = file_name[:-len(suffix)]
            iso_code = self.crubadan_to_iso(crubadan_code)
            if iso_code is None:
                # No mapping for this file: nothing to key the data on.
                continue

            self.all_lang_freq[iso_code] = self.load_lang_ngrams(iso_code)

    def load_lang_ngrams(self, lang):
        """
        Load single n-gram language file given the ISO 639-3 language code
        and return its FreqDist.

        :param lang: ISO 639-3 language code (matched case-insensitively
            against the mapping table).
        :return: a FreqDist mapping each n-gram to its frequency.
        :raises CrubadanError: if the language has no mapping or its
            n-gram file does not exist.
        """
        crubadan_code = self.iso_to_crubadan(lang)
        if crubadan_code is None:
            raise CrubadanError(
                "Could not find language n-gram file for [" + lang + "].")

        ngram_file = (self.root + '/' + crubadan_code +
                      self._NGRAM_FILE_SUFFIX)
        if not os.path.isfile(ngram_file):
            raise CrubadanError(
                "Could not find language n-gram file for [" + lang + "].")

        counts = FreqDist()
        # io.open decodes to text on both Python 2 and Python 3 and the
        # with-block guarantees the handle is closed.
        with io.open(ngram_file, 'r', encoding='utf-8') as ngram_stream:
            for line in ngram_stream:
                if not line.strip():
                    continue  # tolerate blank lines
                data = line.split(' ')
                ngram = data[1].strip('\n')
                counts[ngram] = int(data[0])

        return counts

    def lang_freq(self, lang):
        """
        Return n-gram FreqDist for a specific language
        given ISO 639-3 language code.

        The FreqDist is loaded on first use and kept in
        ``all_lang_freq`` for later calls.

        :raises CrubadanError: if the language data cannot be found.
        """
        if lang not in self.all_lang_freq:
            self.all_lang_freq[lang] = self.load_lang_ngrams(lang)
        return self.all_lang_freq[lang]

    def ngram_freq(self, lang, ngram):
        """
        Return n-gram frequency as integer given
        an ISO 639-3 language code and n-gram.

        The language data is loaded on demand if it is not cached yet.
        """
        return self.lang_freq(lang)[ngram]

    def supported_langs(self):
        """ Return a list of supported languages in human-friendly form """
        return [row[self._FRIENDLY_COLUMN]
                for row in self._lang_mapping_data]

    def lang_supported(self, lang):
        """ Check if a language is supported (language passed in as ISO 639-3 code) """
        found = self._lookup(lang, self._ISO_COLUMN, self._ISO_COLUMN)
        return found is not None

    def iso_to_friendly(self, lang):
        """ Return human-friendly name for a language based on ISO 639-3 code,
            or None if the code is unknown """
        return self._lookup(lang, self._ISO_COLUMN, self._FRIENDLY_COLUMN)

    def friendly_to_iso(self, lang):
        """ Return ISO 639-3 code from human-friendly language name
            (eg: "English" -> "eng"), or None if the name is unknown """
        return self._lookup(lang, self._FRIENDLY_COLUMN, self._ISO_COLUMN)

    def iso_to_crubadan(self, lang):
        """ Return internal Crubadan code based on ISO 639-3 code,
            or None if the code is unknown """
        return self._lookup(lang, self._ISO_COLUMN, self._CRUBADAN_COLUMN)

    def crubadan_to_iso(self, lang):
        """ Return ISO 639-3 code given internal Crubadan code,
            or None if the code is unknown """
        return self._lookup(lang, self._CRUBADAN_COLUMN, self._ISO_COLUMN)

    def _lookup(self, value, key_column, result_column):
        """ Return ``result_column`` of the first mapping row whose
            ``key_column`` equals ``value`` ignoring case, else None """
        wanted = value.lower()
        for row in self._lang_mapping_data:
            if row[key_column].lower() == wanted:
                return row[result_column]
        return None

    def _load_lang_mapping_data(self):
        """
        Load language mappings between codes and description from table.txt

        Replaces ``_lang_mapping_data`` with one list of tab-separated
        fields per non-blank line of the table.

        :raises CrubadanError: if the table is not among the corpus
            fileids.
        """
        mapper_file = self.root + '/' + self._LANG_MAPPER_FILE
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise CrubadanError(
                "Could not find language mapper file [" + mapper_file + "].")

        rows = []
        with io.open(mapper_file, 'r', encoding='utf-8') as mapper_stream:
            for line in mapper_stream:
                # Blank lines anywhere in the file (not only the last
                # one) would otherwise become rows without columns.
                if not line.strip():
                    continue
                rows.append(line.rstrip('\n').split('\t'))
        self._lang_mapping_data = rows

    def _is_utf8(self, str):
        """ Check if a string is utf8 encoded

            Returns True when ``str.decode('utf-8')`` succeeds and False
            when it raises UnicodeError.
            NOTE(review): this relies on the argument having a
            ``decode`` method, i.e. a byte string; confirm no caller
            passes already-decoded text on Python 3. """
        try:
            str.decode('utf-8')
            return True
        except UnicodeError:
            return False
class CrubadanError(Exception):
    """
    Error type for problems specific to the Crubadan n-gram reader,
    such as a language mapper file or n-gram file that cannot be found.
    """
    pass
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment