Commit fc44d4d1 by Ewan Klein

minor edits to corpus reader demo

parent 10a99af5
...@@ -33,13 +33,14 @@ class TwitterCorpusReader(CorpusReader):
     If you made your own tweet collection in a directory called
     `twitter-files`, then you can initialise the reader as::

-        >>> reader = TwitterCorpusReader(root='twitter-files', '.*\.json') # doctest: +SKIP
+        from nltk.corpus import TwitterCorpusReader
+        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

-    The recommended approach is to use this directory as the value of the
+    However, the recommended approach is to use this directory as the value of the
     environmental variable `TWITTER`, and then invoke the reader as::

-        >>> root = os.environ['TWITTER']
-        >>> reader = TwitterCorpusReader(root, '.*\.json') # doctest: +SKIP
+        root = os.environ['TWITTER']
+        reader = TwitterCorpusReader(root, '.*\.json')
     """
......
...@@ -191,16 +191,14 @@ def corpusreader_demo():
     * the result of tokenising the raw strings.
     """
-    #from nltk.corpus import TwitterCorpusReader
     from nltk.corpus import tweets
-    tweets.fileids()
-    #root = os.environ['TWITTER']
     #reader = TwitterCorpusReader(root, '1k_sample.json')
     #reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
     print()
     print("Complete tweet documents")
     print(SPACER)
-    for tweet in tweets.docs()[:2]:
+    for tweet in tweets.docs()[:1]:
         print(json.dumps(tweet, indent=1, sort_keys=True))
     print()
...@@ -212,13 +210,11 @@ def corpusreader_demo():
     print()
     print("Tokenized tweet strings:")
     print(SPACER)
-    for text in tweets.tokenized()[:15]:
-        print(text)
+    for toks in tweets.tokenized()[:15]:
+        print(toks)
-
-#def corpusreader_demo():
-    #from nltk.corpus import brown
-    #brown.words()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment