177 lines
6.6 KiB
Python
177 lines
6.6 KiB
Python
# Natural Language Toolkit: Senna Interface
|
|
#
|
|
# Copyright (C) 2001-2023 NLTK Project
|
|
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
|
|
# URL: <https://www.nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
A general interface to the SENNA pipeline that supports any of the
|
|
operations specified in SUPPORTED_OPERATIONS.
|
|
|
|
Applying multiple operations at once has a speed advantage. For example,
|
|
Senna will automatically determine POS tags if you are extracting named
|
|
entities. Applying both of the operations will cost only the time of
|
|
extracting the named entities.
|
|
|
|
The SENNA pipeline has a fixed maximum size of the sentences that it can read.
|
|
By default it is 1024 tokens per sentence. If you have longer sentences, changing
|
|
the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your
|
|
system specific binary should be rebuilt. Otherwise this could introduce
|
|
misalignment errors.
|
|
|
|
The input is:
|
|
|
|
- path to the directory that contains SENNA executables. If the path is incorrect,
|
|
Senna will automatically search for the executable in the directory given by the SENNA environment variable
|
|
- List of the operations needed to be performed.
|
|
- (optionally) the encoding of the input data (default: utf-8)
|
|
|
|
Note: Unit tests for this module can be found in test/unit/test_senna.py
|
|
|
|
>>> from nltk.classify import Senna
|
|
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) # doctest: +SKIP
|
|
>>> sent = 'Dusseldorf is an international business center'.split()
|
|
>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
|
|
[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
|
|
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
|
|
"""
|
|
|
|
from os import environ, path, sep
|
|
from platform import architecture, system
|
|
from subprocess import PIPE, Popen
|
|
|
|
from nltk.tag.api import TaggerI
|
|
|
|
|
|
class Senna(TaggerI):
    """
    Tagger that runs the SENNA binary as a subprocess and parses its output.

    The operations to run are chosen from ``SUPPORTED_OPERATIONS``. Each
    tagged token is returned as a dictionary holding the original word under
    the key ``"word"`` plus one entry per requested supported operation.
    """

    # The order of this list is also the order used by ``_map`` to assign
    # output columns to operations.
    SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]

    def __init__(self, senna_path, operations, encoding="utf-8"):
        """
        Locate the SENNA executable and remember the requested operations.

        :param senna_path: directory expected to contain the SENNA executables.
        :param operations: names of the operations to run; each one is passed
            to the binary as a ``-<operation>`` command-line flag.
        :param encoding: encoding used to encode the input sent to SENNA and
            to decode its output (default ``"utf-8"``).
        :raises LookupError: if the executable is not found under
            ``senna_path`` and the ``SENNA`` environment variable is set but
            the directory it names does not contain the executable either.

        If the executable is missing from ``senna_path`` and ``SENNA`` is not
        set, no error is raised here; ``tag_sents`` re-checks and raises.
        """
        self._encoding = encoding
        self._path = path.normpath(senna_path) + sep

        # Look in the explicitly supplied directory first.
        exe_file_1 = self.executable(self._path)
        if not path.isfile(exe_file_1):
            # Fall back to the directory named by the SENNA environment variable.
            if "SENNA" in environ:
                self._path = path.normpath(environ["SENNA"]) + sep
                exe_file_2 = self.executable(self._path)
                if not path.isfile(exe_file_2):
                    raise LookupError(
                        "Senna executable expected at %s or %s but not found"
                        % (exe_file_1, exe_file_2)
                    )

        self.operations = operations

    def executable(self, base_path):
        """
        Return the path of the platform-specific SENNA binary in ``base_path``.

        The binary name is selected from ``platform.system()`` (and, on Linux,
        from the interpreter's bit width). For any other system the generic
        ``senna`` binary name is used.
        """
        os_name = system()
        if os_name == "Linux":
            bits = architecture()[0]
            if bits == "64bit":
                return path.join(base_path, "senna-linux64")
            return path.join(base_path, "senna-linux32")
        if os_name == "Windows":
            return path.join(base_path, "senna-win32.exe")
        if os_name == "Darwin":
            return path.join(base_path, "senna-osx")
        return path.join(base_path, "senna")

    def _map(self):
        """
        Return a dict mapping each requested operation to its output column.

        Column 0 holds the token itself, so tag columns are numbered from 1,
        following the order of ``SUPPORTED_OPERATIONS`` rather than the order
        of ``self.operations``. Operations that are not listed in
        ``SUPPORTED_OPERATIONS`` get no entry.
        """
        requested = [
            operation
            for operation in Senna.SUPPORTED_OPERATIONS
            if operation in self.operations
        ]
        return {operation: column for column, operation in enumerate(requested, 1)}

    def tag(self, tokens):
        """
        Apply the configured operation(s) to a single list of tokens.

        :param tokens: the tokens of one sentence.
        :return: a list with one dictionary per token (see ``tag_sents``).
        """
        return self.tag_sents([tokens])[0]

    def tag_sents(self, sentences):
        """
        Apply the configured operation(s) to a list of tokenized sentences.

        :param sentences: a sequence of sentences, each a sequence of tokens.
        :return: a list with one entry per sentence; each entry is a list of
            dictionaries, one per token, containing the key ``"word"`` plus
            one key per requested supported operation.
        :raises LookupError: if the SENNA executable cannot be found.
        :raises RuntimeError: if the SENNA process exits with a non-zero code.
        :raises IndexError: if SENNA's output cannot be aligned with the input
            tokens (e.g. a sentence exceeded SENNA's maximum sentence size).
        """
        encoding = self._encoding

        # Resolve the binary once; it is needed for the check and the command.
        senna_executable = self.executable(self._path)
        if not path.isfile(senna_executable):
            raise LookupError(
                "Senna executable expected at %s but not found" % senna_executable
            )

        # Nothing to tag: avoid spawning a process whose empty output would
        # otherwise be parsed into spurious empty sentences.
        if not sentences:
            return []

        # Build the senna command to run the tagger
        _senna_cmd = [
            senna_executable,
            "-path",
            self._path,
            "-usrtokens",
            "-iobtags",
        ]
        _senna_cmd.extend("-" + op for op in self.operations)

        # Serialize the sentences: one sentence per line, tokens separated by
        # single spaces.
        _input = "\n".join(" ".join(x) for x in sentences) + "\n"
        if encoding:
            _input = _input.encode(encoding)

        # Run the tagger and get the output
        p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate(input=_input)

        # Check the return code.
        if p.returncode != 0:
            if encoding:
                # Decode so the message shows text, not a bytes repr; never
                # let a decoding problem mask the real failure.
                stderr = stderr.decode(encoding, errors="replace")
            raise RuntimeError("Senna command failed! Details: %s" % stderr)

        senna_output = stdout.decode(encoding) if encoding else stdout

        # Output the tagged sentences
        map_ = self._map()
        tagged_sentences = [[]]
        sentence_index = 0
        token_index = 0
        for tagged_word in senna_output.strip().split("\n"):
            # A blank line separates sentences. Strip first so that a line
            # holding only "\r" (CRLF output) is still seen as a separator.
            if not tagged_word.strip():
                tagged_sentences.append([])
                sentence_index += 1
                token_index = 0
                continue
            tags = tagged_word.split("\t")
            result = {name: tags[column].strip() for name, column in map_.items()}
            try:
                # Take the word from the caller's input, not SENNA's echo.
                result["word"] = sentences[sentence_index][token_index]
            except IndexError as e:
                raise IndexError(
                    "Misalignment error occurred at sentence number %d. Possible reason"
                    " is that the sentence size exceeded the maximum size. Check the "
                    "documentation of Senna class for more information."
                    % sentence_index
                ) from e
            tagged_sentences[-1].append(result)
            token_index += 1
        return tagged_sentences
|