Commit 10a99af5 authored Apr 29, 2015 by Ewan Klein
updates to tweets corpus reading
parent fbbcf3a5
Showing 4 changed files with 23 additions and 20 deletions (+23 -20):

  nltk/corpus/reader/tweets.py    +1  -1
  nltk/twitter/__init__.py        +0  -9
  nltk/twitter/twitter_demo.py    +13 -5
  nltk/twitter/twitterclient.py   +9  -5
nltk/corpus/reader/tweets.py

@@ -48,7 +48,7 @@ class TwitterCorpusReader(CorpusReader):
     The corpus view class used by this reader.
     """
-    def __init__(self, root,
+    def __init__(self, root, fileids=None,
                  word_tokenizer=TweetTokenizer(), encoding='utf8'):
         """
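The net effect of this one-line change is that fileids gains a default of None in the TwitterCorpusReader constructor. A minimal sketch of calling the updated signature, assuming the class is importable from the nltk/corpus/reader/tweets.py module shown above; the corpus root 'twitter' and the sample file name are illustrative placeholders borrowed from twitter_demo.py:

# Sketch only: 'twitter' and the sample file name are placeholders taken
# from twitter_demo.py; substitute a real corpus root and fileid locally.
from nltk.corpus.reader.tweets import TwitterCorpusReader

# fileids now defaults to None, so passing it is optional; the
# word_tokenizer and encoding keywords keep their previous defaults.
reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
for text in reader.strings()[:5]:
    print(text)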
nltk/twitter/__init__.py

@@ -13,15 +13,6 @@ This package contains classes for retrieving Tweet documents using the
 Twitter API.
 """
-try:
-    from twython import Twython, TwythonStreamer
-except ImportError as err:
-    import textwrap
-    MSG = """The NLTK twitterclient module requires the Twython package. See \
-https://twython.readthedocs.org/ for installation instructions."""
-    err.msg = textwrap.fill(MSG)
-    raise
-
 from nltk.twitter.util import credsfromfile
 from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer, \
                                        TweetWriter
nltk/twitter/twitter_demo.py

@@ -191,29 +191,37 @@ def corpusreader_demo():
     * the result of tokenising the raw strings.
     """
-    from nltk.corpus import TwitterCorpusReader
+    #from nltk.corpus import TwitterCorpusReader
+    from nltk.corpus import tweets
+    tweets.fileids()
+    #root = os.environ['TWITTER']
+    #reader = TwitterCorpusReader(root, '1k_sample.json')
-    reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
+    # reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
     print()
     print("Complete tweet documents")
     print(SPACER)
-    for tweet in reader.docs()[:2]:
+    for tweet in tweets.docs()[:2]:
         print(json.dumps(tweet, indent=1, sort_keys=True))
     print()
     print("Raw tweet strings:")
     print(SPACER)
-    for text in reader.strings()[:15]:
+    for text in tweets.strings()[:15]:
         print(text)
     print()
     print("Tokenized tweet strings:")
     print(SPACER)
-    for text in reader.tokenized()[:15]:
+    for text in tweets.tokenized()[:15]:
         print(text)
+
+#def corpusreader_demo():
+    #from nltk.corpus import brown
+    #brown.words()

 ALL = range(12)
 DEMOS = ALL[11:]
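Pulled out of the hunk above, the revised demo body amounts to the following self-contained sketch, assuming the tweets corpus loader referenced by the demo is available in this checkout. The json import and the SPACER constant are supplied here only because the demo module defines them elsewhere:

import json
from nltk.corpus import tweets

SPACER = '#' * 40  # stand-in for the demo module's own SPACER constant

print("Complete tweet documents")
print(SPACER)
for tweet in tweets.docs()[:2]:
    print(json.dumps(tweet, indent=1, sort_keys=True))

print("Raw tweet strings:")
print(SPACER)
for text in tweets.strings()[:15]:
    print(text)

print("Tokenized tweet strings:")
print(SPACER)
for text in tweets.tokenized()[:15]:
    print(text)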
nltk/twitter/twitterclient.py

@@ -6,11 +6,14 @@
 # Lorenzo Rubio <lrnzcig@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
-from twython.exceptions import TwythonRateLimitError
 """
-NLTK Twitter client.
+NLTK Twitter client

 This module offers methods for collecting and processing tweets. Most of the
 functionality depends on access to the Twitter APIs, and this is handled via
 the third party Twython library.

 If one of the methods below returns an integer, it is probably a `Twitter
 error code <https://dev.twitter.com/overview/api/response-codes>`_. For
@@ -31,6 +34,7 @@ from nltk.compat import UTC
 try:
     from twython import Twython, TwythonStreamer
+    from twython.exceptions import TwythonRateLimitError
 except ImportError as err:
     import textwrap
     MSG = """The NLTK twitterclient module requires the Twython package. See \
@@ -204,10 +208,10 @@ class Query(Twython):
         results = self.search(q=keywords, count=min(100, count), lang=lang)
         count_from_query = results['search_metadata']['count']
         self.handler.handle_chunk(results['statuses'])
         '''
         pagination loop: keep fetching tweets until the count requested is reached,
-        dealing with twitter rate limits
+        dealing with twitter rate limits
         '''
         while count_from_query < count:
             max_id = results['search_metadata']['max_id']
@@ -217,7 +221,7 @@ class Query(Twython):
             except TwythonRateLimitError as e:
                 print("Waiting for 15 minutes -{0}".format(e))
                 time.sleep(15*60) # wait 15 minutes
-                continue
+                continue
             count_from_query += results['search_metadata']['count']
             self.handler.handle_chunk(results['statuses'])
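For context, the pattern these last two hunks touch (keep paging through search results until the requested count is reached, and sleep out Twitter's rate-limit window when Twython raises TwythonRateLimitError) can be sketched in isolation as below. This is only an illustration of the loop shape: the fetch_tweets function, its max_id arithmetic, and the bare client argument are hypothetical stand-ins, not the Query class's actual implementation.

import time
from twython.exceptions import TwythonRateLimitError


def fetch_tweets(client, keywords, count, lang='en'):
    """Sketch of a count-bounded search with rate-limit backoff."""
    collected = []
    results = client.search(q=keywords, count=min(100, count), lang=lang)
    collected.extend(results['statuses'])
    count_from_query = results['search_metadata']['count']
    while count_from_query < count:
        max_id = results['search_metadata']['max_id']
        try:
            # Ask for tweets older than those already seen; using max_id for
            # paging is an assumption here, the real Query class handles this.
            results = client.search(q=keywords,
                                     count=min(100, count - count_from_query),
                                     lang=lang, max_id=max_id - 1)
        except TwythonRateLimitError as e:
            print("Waiting for 15 minutes -{0}".format(e))
            time.sleep(15 * 60)  # wait out the rate-limit window, then retry
            continue
        collected.extend(results['statuses'])
        count_from_query += results['search_metadata']['count']
    return collected

The client argument is assumed to be an authenticated twython.Twython instance; a caller would typically build it from stored credentials (for example via the credsfromfile helper imported in nltk/twitter/__init__.py above) and pass it in.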