Commit b4386729 by Steven Bird

added basic documentation for sentiwordnet

parent 76187b5a
...@@ -17,13 +17,28 @@ For details about SentiWordNet see: ...@@ -17,13 +17,28 @@ For details about SentiWordNet see:
http://sentiwordnet.isti.cnr.it/ http://sentiwordnet.isti.cnr.it/
>>> from nltk.corpus import sentiwordnet as swn >>> from nltk.corpus import sentiwordnet as swn
>>> print(swn.senti_synset('breakdown.n.03'))
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
>>> list(swn.senti_synsets('slow'))
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
SentiSynset('slow.a.02'), SentiSynset('slow.a.04'),\
SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]
>>> happy = swn.senti_synsets('happy', 'a')
>>> happy0 = list(happy)[0]
>>> happy0.pos_score()
0.875
>>> happy0.neg_score()
0.0
>>> happy0.obj_score()
0.125
""" """
import re import re
from nltk.compat import python_2_unicode_compatible
from nltk.corpus.reader import CorpusReader from nltk.corpus.reader import CorpusReader
@python_2_unicode_compatible
class SentiWordNetCorpusReader(CorpusReader): class SentiWordNetCorpusReader(CorpusReader):
def __init__(self, root, fileids, encoding='utf-8'): def __init__(self, root, fileids, encoding='utf-8'):
""" """
...@@ -34,10 +49,10 @@ class SentiWordNetCorpusReader(CorpusReader): ...@@ -34,10 +49,10 @@ class SentiWordNetCorpusReader(CorpusReader):
encoding=encoding) encoding=encoding)
if len(self._fileids) != 1: if len(self._fileids) != 1:
raise ValueError('Exactly one file must be specified') raise ValueError('Exactly one file must be specified')
self.db = {} self._db = {}
self.parse_src_file() self._parse_src_file()
def parse_src_file(self): def _parse_src_file(self):
lines = self.open(self._fileids[0]).read().splitlines() lines = self.open(self._fileids[0]).read().splitlines()
lines = filter((lambda x : not re.search(r"^\s*#", x)), lines) lines = filter((lambda x : not re.search(r"^\s*#", x)), lines)
for i, line in enumerate(lines): for i, line in enumerate(lines):
...@@ -48,12 +63,12 @@ class SentiWordNetCorpusReader(CorpusReader): ...@@ -48,12 +63,12 @@ class SentiWordNetCorpusReader(CorpusReader):
raise ValueError('Line %s formatted incorrectly: %s\n' % (i, line)) raise ValueError('Line %s formatted incorrectly: %s\n' % (i, line))
if pos and offset: if pos and offset:
offset = int(offset) offset = int(offset)
self.db[(pos, offset)] = (float(pos_score), float(neg_score)) self._db[(pos, offset)] = (float(pos_score), float(neg_score))
def senti_synset(self, *vals): def senti_synset(self, *vals):
from nltk.corpus import wordnet as wn from nltk.corpus import wordnet as wn
if tuple(vals) in self.db: if tuple(vals) in self._db:
pos_score, neg_score = self.db[tuple(vals)] pos_score, neg_score = self._db[tuple(vals)]
pos, offset = vals pos, offset = vals
synset = wn._synset_from_pos_and_offset(pos, offset) synset = wn._synset_from_pos_and_offset(pos, offset)
return SentiSynset(pos_score, neg_score, synset) return SentiSynset(pos_score, neg_score, synset)
...@@ -61,8 +76,8 @@ class SentiWordNetCorpusReader(CorpusReader): ...@@ -61,8 +76,8 @@ class SentiWordNetCorpusReader(CorpusReader):
synset = wn.synset(vals[0]) synset = wn.synset(vals[0])
pos = synset.pos() pos = synset.pos()
offset = synset.offset() offset = synset.offset()
if (pos, offset) in self.db: if (pos, offset) in self._db:
pos_score, neg_score = self.db[(pos, offset)] pos_score, neg_score = self._db[(pos, offset)]
return SentiSynset(pos_score, neg_score, synset) return SentiSynset(pos_score, neg_score, synset)
else: else:
return None return None
...@@ -78,28 +93,42 @@ class SentiWordNetCorpusReader(CorpusReader): ...@@ -78,28 +93,42 @@ class SentiWordNetCorpusReader(CorpusReader):
def all_senti_synsets(self): def all_senti_synsets(self):
from nltk.corpus import wordnet as wn from nltk.corpus import wordnet as wn
for key, fields in self.db.items(): for key, fields in self._db.items():
pos, offset = key pos, offset = key
pos_score, neg_score = fields pos_score, neg_score = fields
synset = wn._synset_from_pos_and_offset(pos, offset) synset = wn._synset_from_pos_and_offset(pos, offset)
yield SentiSynset(pos_score, neg_score, synset) yield SentiSynset(pos_score, neg_score, synset)
@python_2_unicode_compatible
class SentiSynset(object): class SentiSynset(object):
def __init__(self, pos_score, neg_score, synset): def __init__(self, pos_score, neg_score, synset):
self.pos_score = pos_score self._pos_score = pos_score
self.neg_score = neg_score self._neg_score = neg_score
self.obj_score = 1.0 - (self.pos_score + self.neg_score) self._obj_score = 1.0 - (self._pos_score + self._neg_score)
self.synset = synset self.synset = synset
def pos_score(self):
return self._pos_score
def neg_score(self):
return self._neg_score
def obj_score(self):
return self._obj_score
def __str__(self): def __str__(self):
"""Prints just the Pos/Neg scores for now.""" """Prints just the Pos/Neg scores for now."""
s = "" s = "<"
s += self.synset.name() + "\t" s += self.synset.name() + ": "
s += "PosScore: %s\t" % self.pos_score s += "PosScore=%s " % self._pos_score
s += "NegScore: %s" % self.neg_score s += "NegScore=%s" % self._neg_score
s += ">"
return s return s
def __repr__(self): def __repr__(self):
return "Senti" + repr(self.synset) return "Senti" + repr(self.synset)
if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
...@@ -479,6 +479,11 @@ PropBank ...@@ -479,6 +479,11 @@ PropBank
Please see the separate PropBank howto. Please see the separate PropBank howto.
SentiWordNet
============
Please see the separate SentiWordNet howto.
Categorized Corpora Categorized Corpora
=================== ===================
......
.. Copyright (C) 2001-2014 NLTK Project
.. For license information, see LICENSE.TXT
======================
SentiWordNet Interface
======================
SentiWordNet can be imported like this:
>>> from nltk.corpus import sentiwordnet as swn
------------
SentiSynsets
------------
>>> breakdown = swn.senti_synset('breakdown.n.03')
>>> print(breakdown)
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
>>> breakdown.pos_score()
0.0
>>> breakdown.neg_score()
0.25
>>> breakdown.obj_score()
0.75
------
Lookup
------
>>> list(swn.senti_synsets('slow'))
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),
SentiSynset('slow.a.02'), SentiSynset('slow.a.04'),
SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]
>>> happy = swn.senti_synsets('happy', 'a')
>>> all_synsets = swn.all_senti_synsets()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment