edx / nltk

Commit e7ba3080
authored May 18, 2015 by Steven Bird
parent 92e5fca0

stylistic fixes, cleaned imports, fixed cyclic imports

Showing 8 changed files with 64 additions and 54 deletions
nltk/align/ibm1.py                            +1  -1
nltk/align/ibm2.py                            +1  -1
nltk/classify/tadm.py                         +1  -1
nltk/classify/textcat.py                      +46 -41
nltk/corpus/reader/crubadan.py                +4  -4
nltk/parse/malt.py                            +1  -1
nltk/parse/nonprojectivedependencyparser.py   +2  -1
nltk/parse/transitionparser.py                +8  -4
nltk/align/ibm1.py

@@ -15,7 +15,6 @@
 from __future__ import division
 from collections import defaultdict
 from nltk.align import AlignedSent
-from nltk.corpus import comtrans

 class IBMModel1(object):
     """
...
@@ -28,6 +27,7 @@ class IBMModel1(object):
     Step 2 - Estimate the probability of translation according to the
     evidence from Step 1.

+    >>> from nltk.corpus import comtrans
     >>> bitexts = comtrans.aligned_sents()[:100]
     >>> ibm = IBMModel1(bitexts, 20)
...
nltk/align/ibm2.py

@@ -9,7 +9,6 @@
 from __future__ import division
 from collections import defaultdict
 from nltk.align import AlignedSent
-from nltk.corpus import comtrans
 from nltk.align.ibm1 import IBMModel1

 class IBMModel2(object):
...
@@ -26,6 +25,7 @@ class IBMModel2(object):
     Step 3 - Estimate the probability of translation and alignment according
     to the evidence from Step 2.

+    >>> from nltk.corpus import comtrans
     >>> bitexts = comtrans.aligned_sents()[:100]
     >>> ibm = IBMModel2(bitexts, 5)
     >>> aligned_sent = ibm.align(bitexts[0])
...
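Note: both align modules break the cycle between nltk.align and nltk.corpus by deferring the comtrans import from module scope into the doctest, so nltk.corpus is no longer touched while nltk.align is still initialising. A minimal sketch of the same deferred-import pattern outside a doctest (the helper function is illustrative, not part of the commit, and assumes the comtrans corpus data is installed):

    # Deferred import: nltk.corpus is only imported when the function runs,
    # after both packages have finished loading, so no circular-import error.
    def train_ibm1_demo():
        from nltk.corpus import comtrans
        from nltk.align.ibm1 import IBMModel1

        bitexts = comtrans.aligned_sents()[:100]   # small training sample
        return IBMModel1(bitexts, 20)              # 20 EM iterations, as in the doctest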
nltk/classify/tadm.py

@@ -14,7 +14,7 @@ from nltk.internals import find_binary
 try:
     import numpy
 except ImportError:
-    numpy = None
+    pass

 _tadm_bin = None
 def config_tadm(bin=None):
...
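The tadm.py guard now leaves the name numpy unbound when the import fails, instead of binding it to None. A short sketch of the practical difference between the two styles (the comments are mine, not from the commit):

    try:
        import numpy
    except ImportError:
        # Old style: numpy = None  -> later code can test "if numpy is None",
        #   but a stray numpy.array(...) call fails as an AttributeError on None.
        # New style: pass          -> the name stays unbound, so any later use of
        #   numpy without the dependency installed raises NameError immediately.
        pass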
nltk/classify/textcat.py

@@ -31,14 +31,10 @@ http://borel.slu.edu/crubadan/index.html
 # Ensure that literal strings default to unicode rather than str.
 from __future__ import print_function, unicode_literals

-import nltk
-import nltk.compat
-from nltk.corpus import CrubadanCorpusReader
+from nltk.compat import PY3
 from nltk.util import trigrams
-from nltk.tokenize import word_tokenize
-from nltk.probability import FreqDist

-if nltk.compat.PY3:
+if PY3:
     from sys import maxsize
 else:
     from sys import maxint
...
@@ -71,7 +67,8 @@ class TextCat(object):
                                "see https://pypi.python.org/pypi/regex for "
                                "further details.")
-        self._corpus = CrubadanCorpusReader(nltk.data.find('corpora/crubadan'), '.*\.txt')
+        from nltk.corpus import crubadan
+        self._corpus = crubadan

         # Load all language ngrams into cache
         for lang in self._corpus.langs():
             self._corpus.lang_freq(lang)
...
@@ -82,6 +79,8 @@ class TextCat(object):
     def profile(self, text):
         ''' Create FreqDist of trigrams within text '''
+        from nltk import word_tokenize, FreqDist
+
         clean_text = self.remove_punctuation(text)
         tokens = word_tokenize(clean_text)
...
@@ -115,7 +114,7 @@ class TextCat(object):
             # Arbitrary but should be larger than
             # any possible trigram file length
             # in terms of total lines
-            if nltk.compat.PY3:
+            if PY3:
                 dist = maxsize
             else:
                 dist = maxint
...
@@ -148,41 +147,47 @@ class TextCat(object):
         return min(self.last_distances, key=self.last_distances.get)
 #################################################')

-    def demo(self):
+def demo():
     from nltk.corpus import udhr

     langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
              'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
              'Serbian_Srpski-UTF8', 'Esperanto-UTF8']

     friendly = {'kmr': 'Northern Kurdish',
                 'abk': 'Abkhazian',
                 'pes': 'Iranian Persian',
                 'hin': 'Hindi',
                 'haw': 'Hawaiian',
                 'rus': 'Russian',
                 'vie': 'Vietnamese',
                 'srp': 'Serbian',
                 'epo': 'Esperanto'}

+    tc = TextCat()
+
     for cur_lang in langs:
         # Get raw data from UDHR corpus
         raw_sentences = udhr.sents(cur_lang)
         rows = len(raw_sentences) - 1
         cols = list(map(len, raw_sentences))

         sample = ''

         # Generate a sample text of the language
         for i in range(0, rows):
             cur_sent = ''
             for j in range(0, cols[i]):
                 cur_sent += ' ' + raw_sentences[i][j]

             sample += cur_sent

         # Try to detect what it is
         print('Language snippet: ' + sample[0:140] + '...')
-        guess = self.guess_language(sample)
+        guess = tc.guess_language(sample)
         print('Language detection: %s (%s)' % (guess, friendly[guess]))
         print('#' * 140)
+
+if __name__ == '__main__':
+    demo()
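textcat.py now uses the shared loader exposed as nltk.corpus.crubadan instead of constructing its own CrubadanCorpusReader, imports word_tokenize and FreqDist lazily inside profile(), and gains a runnable __main__ entry point. A hedged usage sketch of the refactored class (assumes the crubadan corpus data and, per the warning in the hunk above, the regex package are installed; the sample sentence is made up):

    from nltk.classify.textcat import TextCat

    tc = TextCat()                       # loads the Crubadan n-gram profiles
    code = tc.guess_language('Bonjour tout le monde, comment allez-vous ?')
    print(code)                          # prints the Crubadan code of the best-matching language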
nltk/corpus/reader/crubadan.py

@@ -22,7 +22,7 @@ http://borel.slu.edu/crubadan/index.html
 from __future__ import print_function, unicode_literals
 import re
-import nltk.compat
+from nltk.compat import PY3
 from os import path
 from nltk.corpus.reader import CorpusReader
 from nltk.probability import FreqDist
...
@@ -75,7 +75,7 @@ class CrubadanCorpusReader(CorpusReader):
         if self._LANG_MAPPER_FILE not in self.fileids():
             raise RuntimeError("Could not find language mapper file: " + mapper_file)

-        if nltk.compat.PY3:
+        if PY3:
             raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
         else:
             raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
...
@@ -93,13 +93,13 @@ class CrubadanCorpusReader(CorpusReader):
             raise Runtime("Could not find language n-gram file for " + lang)

         counts = FreqDist()
-        if nltk.compat.PY3:
+        if PY3:
             f = open(ngram_file, 'r', encoding='utf-8')
         else:
             f = open(ngram_file, 'rU')

         for line in f:
-            if nltk.compat.PY3:
+            if PY3:
                 data = line.split(' ')
             else:
                 data = line.decode('utf8').split(' ')
...
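The crubadan.py changes only swap nltk.compat.PY3 for the directly imported PY3 flag, but the surrounding pattern (branching on PY3 to obtain decoded text on both Python 2 and Python 3) is the point of those lines. A minimal sketch of that pattern, assuming the nltk.compat module of this NLTK era; the helper and file name are hypothetical:

    from nltk.compat import PY3

    def read_text(filename):
        # Return the file contents as unicode text on both Python versions.
        if PY3:
            # Python 3: open() decodes to str using the given encoding.
            with open(filename, 'r', encoding='utf-8') as f:
                return f.read().strip()
        else:
            # Python 2: read bytes ('rU' = universal newlines), decode by hand.
            with open(filename, 'rU') as f:
                return f.read().decode('utf-8').strip()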
nltk/parse/malt.py

@@ -15,7 +15,6 @@ from functools import reduce
 import subprocess
 from nltk.data import ZipFilePathPointer
-from nltk.tag import RegexpTagger
 from nltk.tokenize import word_tokenize
 from nltk.internals import find_binary
...
@@ -43,6 +42,7 @@ class MaltParser(ParserI):
         if tagger is not None:
             self.tagger = tagger
         else:
+            from nltk.tag import RegexpTagger
             self.tagger = RegexpTagger(
                 [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
                  (r'(The|the|A|a|An|an)$', 'AT'),   # articles
...
nltk/parse/nonprojectivedependencyparser.py

@@ -14,7 +14,6 @@ import logging
 from nltk.compat import xrange
 from nltk.parse.dependencygraph import DependencyGraph
-from nltk.classify import NaiveBayesClassifier

 logger = logging.getLogger(__name__)
...
@@ -111,6 +110,8 @@ class NaiveBayesDependencyScorer(DependencyScorerI):
         :param graphs: A list of dependency graphs to train the scorer.
         """
+        from nltk.classify import NaiveBayesClassifier
+
         # Create training labeled training examples
         labeled_examples = []
         for graph in graphs:
...
nltk/parse/transitionparser.py

@@ -12,10 +12,14 @@ import pickle
 from os import remove
 from copy import deepcopy
 from operator import itemgetter
-from scipy import sparse
-from numpy import array
-from sklearn.datasets import load_svmlight_file
-from sklearn import svm
+
+try:
+    from numpy import array
+    from scipy import sparse
+    from sklearn.datasets import load_svmlight_file
+    from sklearn import svm
+except ImportError:
+    pass

 from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator
...
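The transitionparser.py hunk wraps the numpy/scipy/scikit-learn imports in a guard, so importing nltk.parse no longer hard-fails on machines without those libraries. A minimal sketch of how such a guard typically interacts with later code (the _check_deps helper is hypothetical, not part of the commit):

    try:
        from numpy import array
        from scipy import sparse
        from sklearn.datasets import load_svmlight_file
        from sklearn import svm
    except ImportError:
        # Optional stack is missing; importing this module still succeeds.
        pass

    def _check_deps():
        # Hypothetical helper: raise a clear error only when the optional
        # dependencies are actually needed (e.g. before training a model).
        if 'svm' not in globals():
            raise ImportError('transitionparser requires numpy, scipy and '
                              'scikit-learn; install them to train or parse')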