Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
N
nltk
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
nltk
Commits
ad9c5384
Commit
ad9c5384
authored
Mar 06, 2015
by
Steven Bird
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #896 from longdt219/UpgradeMallet_104
Upgrade mallet #104
parents
15b64cdb
dc5ca7a0
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
200 additions
and
0 deletions
+200
-0
nltk/tag/crfsuite.py
+199
-0
pip-req.txt
+1
-0
No files found.
nltk/tag/crfsuite.py
0 → 100644
View file @
ad9c5384
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the CRFSuite Tagger
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Long Duong <longdt219@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for POS tagging using CRFSuite
"""
from
__future__
import
absolute_import
from
__future__
import
unicode_literals
import
unicodedata
import
re
from
nltk.tag.api
import
TaggerI
import
pycrfsuite
class CRFTagger(TaggerI):
    """
    A module for POS tagging using CRFSuite https://pypi.python.org/pypi/python-crfsuite

    >>> from nltk.tag.crfsuite import CRFTagger
    >>> ct = CRFTagger()

    >>> train_data = [[('University','Noun'), ('is','Verb'), ('a','Det'), ('good','Adj'), ('place','Noun')],
    ... [('dog','Noun'),('eat','Verb'),('meat','Noun')]]

    >>> ct.train(train_data,'model.crf.tagger')
    >>> ct.tag_sents([['dog','is','good'], ['Cat','eat','meat']])
    [[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]]

    >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]]
    >>> ct.evaluate(gold_sentences)
    1.0

    Setting learned model file

    >>> ct = CRFTagger()
    >>> ct.set_model_file('model.crf.tagger')
    >>> ct.evaluate(gold_sentences)
    1.0
    """

    # Unicode general categories that mark punctuation (unicodedata.category).
    # Built once at class-creation time instead of on every _get_features call.
    _PUNCTUATION_CATEGORIES = frozenset(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])

    def __init__(self, feature_func=None, verbose=False, training_opt=None):
        """
        Initialize the CRFSuite tagger.

        :param feature_func: The function that extracts features for each token of a sentence. This function should take
        2 parameters: tokens and index which extract features at index position from tokens list. See the build in
        _get_features function for more detail.
        :param verbose: output the debugging messages during training.
        :type verbose: boolean
        :param training_opt: python-crfsuite training options
        :type training_opt: dict

        Set of possible training options (using LBFGS training algorithm).
         'feature.minfreq' : The minimum frequency of features.
         'feature.possible_states' : Force to generate possible state features.
         'feature.possible_transitions' : Force to generate possible transition features.
         'c1' : Coefficient for L1 regularization.
         'c2' : Coefficient for L2 regularization.
         'max_iterations' : The maximum number of iterations for L-BFGS optimization.
         'num_memories' : The number of limited memories for approximating the inverse hessian matrix.
         'epsilon' : Epsilon for testing the convergence of the objective.
         'period' : The duration of iterations to test the stopping criterion.
         'delta' : The threshold for the stopping criterion; an L-BFGS iteration stops when the
                   improvement of the log likelihood over the last ${period} iterations is no greater than this threshold.
         'linesearch' : The line search algorithm used in L-BFGS updates:
                        { 'MoreThuente': More and Thuente's method,
                          'Backtracking': Backtracking method with regular Wolfe condition,
                          'StrongBacktracking': Backtracking method with strong Wolfe condition }
         'max_linesearch' : The maximum number of trials for the line search algorithm.
        """
        self._model_file = ''
        self._tagger = pycrfsuite.Tagger()

        # Fall back to the built-in feature extractor when none is supplied.
        if feature_func is None:
            self._feature_func = self._get_features
        else:
            self._feature_func = feature_func

        self._verbose = verbose
        # A None default avoids the shared-mutable-default pitfall the previous
        # ``training_opt={}`` signature had: every instance now gets its own dict.
        self._training_options = {} if training_opt is None else training_opt
        # Pre-compiled digit matcher used by _get_features.
        self._pattern = re.compile(r'\d')

    def set_model_file(self, model_file):
        """
        Load a pre-trained model from *model_file* and remember its path.

        :param model_file: path of a model previously produced by ``train``.
        :type model_file: str
        """
        self._model_file = model_file
        self._tagger.open(self._model_file)

    def _get_features(self, tokens, idx):
        """
        Extract basic features about this word including
             - Current Word
             - Is Capitalized ?
             - Has Punctuation ?
             - Has Number ?
             - Suffixes up to length 3

        Note that : we might include feature over previous word, next word etc.

        :return: a list which contains the features
        :rtype: list(str)
        """
        token = tokens[idx]
        feature_list = []

        # Guard against empty tokens: the original token[0] access raised
        # IndexError, and all() over an empty string would have (vacuously)
        # marked it as punctuation.
        if not token:
            return feature_list

        # Capitalization
        if token[0].isupper():
            feature_list.append('CAPITALIZATION')

        # Number: any digit anywhere in the token.
        if self._pattern.search(token) is not None:
            feature_list.append('HAS_NUM')

        # Punctuation: every character belongs to a Unicode punctuation category.
        if all(unicodedata.category(x) in self._PUNCTUATION_CATEGORIES for x in token):
            feature_list.append('PUNCTUATION')

        # Suffixes up to length 3, only when shorter than the whole token.
        for suffix_len in (1, 2, 3):
            if len(token) > suffix_len:
                feature_list.append('SUF_' + token[-suffix_len:])

        feature_list.append('WORD_' + token)

        return feature_list

    def tag_sents(self, sents):
        """
        Tag a list of sentences. NB before using this function, user should specify the mode_file either by
         - Train a new model using ``train`` function
         - Use the pre-trained model which is set via ``set_model_file`` function

        :param sents: list of sentences needed to tag.
        :type sents: list(list(str))
        :return: list of tagged sentences.
        :rtype: list(list(tuple(str,str)))
        :raises Exception: if no model has been trained or loaded, or if the
            tagger returns a label sequence of unexpected length.
        """
        if self._model_file == '':
            raise Exception(' No model file is found !! Please use train or set_model_file function')

        # We need the list of sentences instead of the list generator for matching the input and output
        result = []
        for tokens in sents:
            features = [self._feature_func(tokens, i) for i in range(len(tokens))]
            labels = self._tagger.tag(features)

            if len(labels) != len(tokens):
                raise Exception(' Predicted Length Not Matched, Expect Errors !')

            tagged_sent = list(zip(tokens, labels))
            result.append(tagged_sent)

        return result

    def train(self, train_data, model_file):
        """
        Train the CRF tagger using CRFSuite.

        :param train_data: the list of annotated sentences.
        :type train_data: list(list(tuple(str,str)))
        :param model_file: the model will be saved to this file.
        :type model_file: str
        """
        trainer = pycrfsuite.Trainer(verbose=self._verbose)
        trainer.set_params(self._training_options)

        for sent in train_data:
            tokens, labels = zip(*sent)
            features = [self._feature_func(tokens, i) for i in range(len(tokens))]
            trainer.append(features, labels)

        # Now train the model, the output should be model_file
        trainer.train(model_file)
        # Save the model file and open it for tagging.
        self.set_model_file(model_file)

    def tag(self, tokens):
        """
        Tag a sentence using Python CRFSuite Tagger. NB before using this function, user should specify the mode_file either by
         - Train a new model using ``train`` function
         - Use the pre-trained model which is set via ``set_model_file`` function

        :param tokens: list of tokens needed to tag.
        :type tokens: list(str)
        :return: list of tagged tokens.
        :rtype: list(tuple(str,str))
        """
        return self.tag_sents([tokens])[0]
if __name__ == "__main__":
    # Run this module's doctests directly; whitespace differences in the
    # expected output are ignored.
    import doctest

    flags = doctest.NORMALIZE_WHITESPACE
    doctest.testmod(optionflags=flags)
pip-req.txt
View file @
ad9c5384
...
@@ -6,3 +6,4 @@ numpy>=1.8.0
...
@@ -6,3 +6,4 @@ numpy>=1.8.0
scipy>=0.13.2
scipy>=0.13.2
matplotlib>=1.3.1
matplotlib>=1.3.1
scikit-learn>=0.14.1
scikit-learn>=0.14.1
python-crfsuite>=0.8.2
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment