Commit afcab733 by Steven Bird

Merge pull request #290 from pflaquerre/malt-thread-safety

Make MaltParser safe to use in parallel
parents c09ae1a1 86b3909c
...@@ -10,7 +10,9 @@ import os ...@@ -10,7 +10,9 @@ import os
import tempfile import tempfile
import glob import glob
from operator import add from operator import add
import subprocess
from nltk.data import ZipFilePathPointer
from nltk.tag import RegexpTagger from nltk.tag import RegexpTagger
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from nltk.internals import find_binary from nltk.internals import find_binary
...@@ -24,8 +26,9 @@ class MaltParser(ParserI): ...@@ -24,8 +26,9 @@ class MaltParser(ParserI):
""" """
An interface for parsing with the Malt Parser. An interface for parsing with the Malt Parser.
:param mco: The full path to a pre-trained model. If :param mco: The name of the pre-trained model. If provided, training
provided, then training will not be needed. will not be required, and MaltParser will use the model file in
${working_dir}/${mco}.mco.
:type mco: str :type mco: str
""" """
self.config_malt() self.config_malt()
...@@ -123,31 +126,35 @@ class MaltParser(ParserI): ...@@ -123,31 +126,35 @@ class MaltParser(ParserI):
if not self._trained: if not self._trained:
raise Exception("Parser has not been trained. Call train() first.") raise Exception("Parser has not been trained. Call train() first.")
input_file = os.path.join(tempfile.gettempdir(), 'malt_input.conll') input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
output_file = os.path.join(tempfile.gettempdir(), 'malt_output.conll') dir=self.working_dir,
delete=False)
output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
dir=self.working_dir,
delete=False)
execute_string = 'java -jar %s -w %s -c %s -i %s -o %s -m parse'
if not verbose:
execute_string += ' > ' + os.path.join(tempfile.gettempdir(), "malt.out")
f = None
try: try:
f = open(input_file, 'w') for (i, (word, tag)) in enumerate(sentence, start=1):
input_file.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
for (i, (word,tag)) in enumerate(sentence): (i, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % input_file.write('\n')
(i+1, word, '_', tag, tag, '_', '0', 'a', '_', '_')) input_file.close()
f.write('\n')
f.close() cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir,
'-c', self.mco, '-i', input_file.name,
cmd = ['java', '-jar %s' % self._malt_bin, '-w %s' % tempfile.gettempdir(), '-o', output_file.name, '-m', 'parse']
'-c %s' % self.mco, '-i %s' % input_file, '-o %s' % output_file, '-m parse']
ret = self._execute(cmd, verbose)
self._execute(cmd, 'parse', verbose) if ret != 0:
raise Exception("MaltParser parsing (%s) failed with exit "
return DependencyGraph.load(output_file) "code %d" % (' '.join(cmd), ret))
return DependencyGraph.load(output_file.name)
finally: finally:
if f: f.close() input_file.close()
os.remove(input_file.name)
output_file.close()
os.remove(output_file.name)
def train(self, depgraphs, verbose=False): def train(self, depgraphs, verbose=False):
""" """
...@@ -155,16 +162,16 @@ class MaltParser(ParserI): ...@@ -155,16 +162,16 @@ class MaltParser(ParserI):
:param depgraphs: list of ``DependencyGraph`` objects for training input data :param depgraphs: list of ``DependencyGraph`` objects for training input data
""" """
input_file = os.path.join(tempfile.gettempdir(),'malt_train.conll') input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
dir=self.working_dir,
f = None delete=False)
try: try:
f = open(input_file, 'w') input_file.write('\n'.join([dg.to_conll(10) for dg in depgraphs]))
f.write('\n'.join([dg.to_conll(10) for dg in depgraphs])) input_file.close()
self.train_from_file(input_file.name, verbose=verbose)
finally: finally:
if f: f.close() input_file.close()
os.remove(input_file.name)
self.train_from_file(input_file, verbose=verbose)
def train_from_file(self, conll_file, verbose=False): def train_from_file(self, conll_file, verbose=False):
""" """
...@@ -175,33 +182,38 @@ class MaltParser(ParserI): ...@@ -175,33 +182,38 @@ class MaltParser(ParserI):
if not self._malt_bin: if not self._malt_bin:
raise Exception("MaltParser location is not configured. Call config_malt() first.") raise Exception("MaltParser location is not configured. Call config_malt() first.")
# If conll_file is a ZipFilePathPointer, then we need to do some extra massaging # If conll_file is a ZipFilePathPointer, then we need to do some extra
f = None # massaging
if hasattr(conll_file, 'zipfile'): if isinstance(conll_file, ZipFilePathPointer):
zip_conll_file = conll_file input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
conll_file = os.path.join(tempfile.gettempdir(),'malt_train.conll') dir=self.working_dir,
conll_str = zip_conll_file.open().read() delete=False)
f = open(conll_file,'w') try:
f.write(conll_str) conll_str = conll_file.open().read()
f.close() conll_file.close()
input_file.write(conll_str)
cmd = ['java', '-jar %s' % self._malt_bin, '-w %s' % tempfile.gettempdir(), input_file.close()
'-c %s' % self.mco, '-i %s' % conll_file, '-m learn'] return self.train_from_file(input_file.name, verbose=verbose)
finally:
# p = subprocess.Popen(cmd, stdout=subprocess.PIPE, input_file.close()
# stderr=subprocess.STDOUT, os.remove(input_file.name)
# stdin=subprocess.PIPE)
# (stdout, stderr) = p.communicate() cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir,
'-c', self.mco, '-i', conll_file, '-m', 'learn']
self._execute(cmd, 'train', verbose)
ret = self._execute(cmd, verbose)
if ret != 0:
raise Exception("MaltParser training (%s) "
"failed with exit code %d" %
(' '.join(cmd), ret))
self._trained = True self._trained = True
def _execute(self, cmd, type, verbose=False): @staticmethod
if not verbose: def _execute(cmd, verbose=False):
temp_dir = os.path.join(tempfile.gettempdir(), '') output = None if verbose else subprocess.PIPE
cmd.append(' > %smalt_%s.out 2> %smalt_%s.err' % ((temp_dir, type)*2)) p = subprocess.Popen(cmd, stdout=output, stderr=output)
malt_exit = os.system(' '.join(cmd)) return p.wait()
def demo(): def demo():
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment