Commit d99bd97f by Steven Bird

resolve merge conflict

parents bfbbcaab fcc70d8b
......@@ -43,7 +43,39 @@ if [[ ! -d $senna_folder_name ]]; then
rm ${senna_file_name}
fi
# Setup the Enviroment variable
# Download and Install Liblbfgs
lbfgs_file_name=$(curl -s 'http://www.chokkan.org/software/liblbfgs/' | grep -o 'liblbfgs-.*\.tar.gz' | head -n1)
[[ ${lbfgs_file_name=} =~ (.+)\.tar.gz ]]
lbfgs_folder_name=${BASH_REMATCH[1]}
if [[ ! -d $lbfgs_folder_name ]]; then
wget -nv "https://github.com/downloads/chokkan/liblbfgs/$lbfgs_file_name"
tar -xvzf ${lbfgs_file_name}
rm ${lbfgs_file_name}
fi
cd $lbfgs_folder_name
./configure --prefix=$HOME/third/liblbfgs
make
make install
cd ..
# Download and install crfsuite
crfsuite_file_name=$(curl -s 'http://www.chokkan.org/software/crfsuite/' | grep -o 'crfsuite-.*\.tar.gz' | head -n1)
[[ ${crfsuite_file_name=} =~ (.+)\.tar.gz ]]
crfsuite_folder_name=${BASH_REMATCH[1]}
if [[ ! -d $crfsuite_folder_name ]]; then
wget -nv "https://github.com/downloads/chokkan/crfsuite/$crfsuite_file_name"
tar -xvzf ${crfsuite_file_name}
rm ${crfsuite_file_name}
fi
cd $crfsuite_folder_name
./configure --prefix=$HOME/third/crfsuite --with-liblbfgs=$HOME/third/liblbfgs
make
make install
cd ..
# Set up the enviroment variables
export CRFSUITE=$(pwd)'/crfsuite/bin'
export STANFORD_PARSER=$(pwd)'/stanford-parser'
export STANFORD_MODELS=$(pwd)'/stanford-parser'
export STANFORD_POSTAGGER=$(pwd)'/stanford-postagger'
......
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the CRFSuite Tagger
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Long Duong <longdt219@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for interfacing with the CRFSuite taggers.
"""
from __future__ import absolute_import
from __future__ import unicode_literals
from os import path, environ, sep
import os
from platform import system
import tempfile
from subprocess import PIPE
import unicodedata
import re
from subprocess import Popen
from nltk.tag.api import TaggerI
_crfsuite_url = 'http://www.chokkan.org/software/crfsuite'
class Error(Exception):
"""Basic error handling class to be extended by the module specific
exceptions"""
class ExecutableNotFound(Error):
"""Raised if the crfsuite executable does not exist"""
class CRFTagger(TaggerI):
"""
An interface to CRFSuite taggers. http://www.chokkan.org/software/crfsuite/tutorial.html
>>> from nltk.tag.crfsuite import CRFTagger
>>> ct = CRFTagger()
>>> train_data = [[('University','Noun'), ('is','Verb'), ('a','Det'), ('good','Adj'), ('place','Noun')],
... [('dog','Noun'),('eat','Verb'),('meat','Noun')]]
>>> ct.train(train_data,'model.crf.tagger')
>>> ct.tag_sents([['dog','is','good'], ['Cat','eat','meat']])
[[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]]
>>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]]
>>> ct.evaluate(gold_sentences)
1.0
Setting learned model file
>>> ct = CRFTagger()
>>> ct.set_model_file('model.crf.tagger')
>>> ct.evaluate(gold_sentences)
1.0
"""
def __init__(self, file_path=''):
self._path = path.normpath(file_path) + sep
exe_file_1 = self.executable(self._path)
# Verifies the existence of the executable on the self._path first
if not path.isfile(exe_file_1):
# Check for the system environment
if 'CRFSUITE' in environ:
self._path = path.normpath(environ['CRFSUITE']) + sep
exe_file_2 = self.executable(self._path)
if not path.isfile(exe_file_2):
raise ExecutableNotFound("CRFSuite executable expected at %s or %s but not found" % (exe_file_1,exe_file_2))
self._model_file = ''
def executable(self, base_path):
"""
The function that determines the system specific binary that should be
used in the pipeline. In case, the system is not known the default crfsuite binary will
be used.
"""
os_name = system()
if os_name == 'Linux':
return path.join(base_path, 'crfsuite')
if os_name == 'Windows':
return path.join(base_path, 'crfsuite.exe')
return path.join(base_path, 'crfsuite')
def set_model_file(self, model_file):
self._model_file = model_file
def _get_features(self,data):
"""
Extract basic features about this word including
- Current Word
- Is Capitalized ?
- Has Punctuation ?
- Has Number ?
- Suffixes up to length 3
:return : a string which contains the features
"""
feature_list = ''
# Capitalization
if data[0].isupper():
feature_list += 'CAPITALIZATION\t'
# Number
pattern = re.compile('\\d')
if re.search(pattern, data) is not None:
feature_list += 'HAS_NUM\t'
# Punctuation
punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
if all (unicodedata.category(x) in punc_cat for x in data):
feature_list += 'PUNCTUATION\t'
# Suffix up to length 3
if len(data) > 1:
feature_list += ('SUF_' + data[-1:] + '\t')
if len(data) > 2:
feature_list += ('SUF_' + data[-2:] + '\t')
if len(data) > 3:
feature_list += ('SUF_' + data[-3:] + '\t')
feature_list += 'WORD_' + data + '\t'
return feature_list.strip()
def tag_sents(self, sents):
'''
Tag a list of sentences. NB before using this function, user should specify the mode_file either by
- Train a new model using ``train'' function
- Use the pre-trained model which is set via ``set_model_file'' function
:params sentences : list of sentences needed to tag.
:type sentences : list(list(str))
:return : list of tagged sentences.
:rtype : list (list (tuple(str,str)))
'''
# We need the list of sentences instead of the list generator for matching the input and output
sentences = list(sents)
# First, build the test file
input_file = tempfile.NamedTemporaryFile(
prefix='crf_tagger.test',
dir=tempfile.gettempdir(),
delete=False)
for sent in sentences:
for token in sent:
#data = unicode(token)
data = token
input_file.write(('DUMMY_LABEL\t' + self._get_features(data) + '\n').encode('utf-8'))
input_file.write('\n'.encode('utf-8'))
input_file.close()
# Now use the model to tag
_crf_cmd = [self.executable(self._path), 'tag', '-m', self._model_file, input_file.name]
# Run the tagger and get the output
p = Popen(_crf_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
(stdout, stderr) = p.communicate()
# Check the return code.
if p.returncode != 0:
raise Exception('CRFSuite command failed! Details: %s' % stderr)
# Remove the temp file
#print ('Test file : ' + input_file.name)
os.remove(input_file.name)
return self.parse_output(stdout, sentences)
def train(self, train_data, model_file):
'''
Train the CRF tagger using CRFSuite
:params train : is the list of annotated sentences.
:type train : list (list(tuple(str,str)))
:params model_file : the model will be saved to this file.
'''
try:
input_file = tempfile.NamedTemporaryFile(
prefix='crf_tagger.train',
dir=tempfile.gettempdir(),
delete=False)
for sent in train_data:
for data,label in sent:
#data = unicode(data)
input_file.write((label + '\t' + self._get_features(data) + '\n').encode('utf-8'))
input_file.write('\n'.encode('utf-8'))
input_file.close()
# Now train the model, the output should be model_file
_crf_cmd = [self.executable(self._path), 'learn', '-m', model_file, input_file.name]
# Serialize the actual sentences to a temporary string
# Run the tagger and get the output
p = Popen(_crf_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
(stdout, stderr) = p.communicate()
# Check the return code.
if p.returncode != 0:
raise Exception('CRFSuite command failed! Details: %s' % stderr)
# Save the model file
self._model_file = model_file
finally:
os.remove(input_file.name)
#print ('Training data : ' + input_file.name)
def tag(self, tokens):
'''
Tag a sentence using Mallet CRF Tagger. NB before using this function, user should specify the mode_file either by
- Train a new model using ``train'' function
- Use the pre-trained model which is set via ``set_model_file'' function
:params tokens : list of tokens needed to tag.
:type tokens : list(str)
:return : list of tagged tokens.
:rtype : list (tuple(str,str))
'''
return self.tag_sents([tokens])[0]
def parse_output(self, text, sentences):
labels= []
sent_labels = []
text = text.decode('utf-8')
for label in text.strip().split("\n"):
label = label.strip()
if label == '':
sent_labels.append(labels)
labels = []
continue
labels.append(label)
if len(labels) > 0:
sent_labels.append(labels)
# Match labels with word
if len(sentences) != len(sent_labels):
raise ValueError(' Expecting error, number of sentence is not matched' + str(len(sentences)) + ' vs ' + str(len(sent_labels)))
tagged_sentences = []
for i in range(len(sentences)):
words = sentences[i]
labels = sent_labels[i]
if len(words) != len(labels):
raise ValueError(' Expecting error, sentence length is not matched')
tagged_sentence = []
for j in range(len(words)):
tagged_sentence.append((words[j],labels[j]))
tagged_sentences.append(tagged_sentence)
return tagged_sentences
if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment