# Natural Language Toolkit: Interface to the CoreNLP REST API.
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Dmitrijs Milajevs <dimazest@gmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import json
import os  # required for doctests
import re
import socket
import time
from typing import List, Tuple

from nltk.internals import _java_options, config_java, find_jar_iter, java
from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tag.api import TaggerI
from nltk.tokenize.api import TokenizerI
from nltk.tree import Tree

_stanford_url = "https://stanfordnlp.github.io/CoreNLP/"


class CoreNLPServerError(EnvironmentError):
    """Exceptions associated with the Core NLP server."""


def try_port(port=0):
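    """Return a usable TCP port number.

    Binds a throwaway socket to ``port`` (raising ``OSError`` if the port is
    already taken) and returns the port number that was bound; with the
    default ``port=0`` the operating system picks a free port.
    """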
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("", port))

    p = sock.getsockname()[1]
    sock.close()

    return p


class CoreNLPServer:
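    """Start and manage a Stanford CoreNLP HTTP server in a subprocess.

    A minimal usage sketch (it assumes the CoreNLP distribution jars can be
    found, e.g. via the ``CORENLP`` and ``CORENLP_MODELS`` environment
    variables):

    >>> with CoreNLPServer() as server:  # doctest: +SKIP
    ...     parser = CoreNLPParser(url=server.url)
    ...     tree = next(parser.raw_parse('The quick brown fox jumps over the lazy dog.'))
    """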

    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar"
    _JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar"

    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        verbose=False,
        java_options=None,
        corenlp_options=None,
        port=None,
    ):

        if corenlp_options is None:
            corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"]

        jars = list(
            find_jar_iter(
                self._JAR,
                path_to_jar,
                env_vars=("CORENLP",),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            )
        )

        # Find the most recent code jar. The version triple is parsed out of
        # the file name and compared numerically; raw `re.Match` objects are
        # not orderable and cannot be used as a sort key directly.
        stanford_jar = max(
            jars,
            key=lambda jar_path: tuple(
                int(part)
                for part in re.match(self._JAR, os.path.basename(jar_path)).groups()
            ),
        )

        if port is None:
            try:
                port = try_port(9000)
            except OSError:
                port = try_port()
        else:
            try_port(port)
        corenlp_options.extend(["-port", str(port)])

        self.url = f"http://localhost:{port}"

        # Find the most recent model jar, using the same numeric version ordering.
        model_jar = max(
            find_jar_iter(
                self._MODEL_JAR_PATTERN,
                path_to_models_jar,
                env_vars=("CORENLP_MODELS",),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: tuple(
                int(part)
                for part in re.match(
                    self._MODEL_JAR_PATTERN, os.path.basename(model_path)
                ).groups()
            ),
        )

        self.verbose = verbose

        self._classpath = stanford_jar, model_jar

        self.corenlp_options = corenlp_options
        self.java_options = java_options or ["-mx2g"]

    def start(self, stdout="devnull", stderr="devnull"):
        """Start the CoreNLP server.

        :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
        """
        import requests

        cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java.
        default_options = " ".join(_java_options)
        config_java(options=self.java_options, verbose=self.verbose)

        try:
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout=stdout,
                stderr=stderr,
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()
            raise CoreNLPServerError(
                returncode,
                "Could not start the server. "
                "The error was: {}".format(stderrdata.decode("ascii")),
            )

        # Poll the server's liveness endpoint for up to 30 seconds.
        for i in range(30):
            try:
                response = requests.get(requests.compat.urljoin(self.url, "live"))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError("Could not connect to the server.")

        # Poll the readiness endpoint for up to 60 seconds while the
        # preloaded annotators finish loading.
        for i in range(60):
            try:
                response = requests.get(requests.compat.urljoin(self.url, "ready"))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError("The server is not ready.")

    def stop(self):
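        """Terminate the server process and wait for it to exit."""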
        self.popen.terminate()
        self.popen.wait()

    def __enter__(self):
        self.start()

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()
        return False


class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
    """Interface to the CoreNLP Parser."""

    def __init__(
        self,
        url="http://localhost:9000",
        encoding="utf8",
        tagtype=None,
        strict_json=True,
    ):
        import requests

        self.url = url
        self.encoding = encoding

        if tagtype not in ["pos", "ner", None]:
            raise ValueError("tagtype must be either 'pos', 'ner' or None")

        self.tagtype = tagtype
        self.strict_json = strict_json

        self.session = requests.Session()

    def parse_sents(self, sentences, *args, **kwargs):
        """Parse multiple sentences.

        Takes multiple sentences as a list where each sentence is a list of
        words. Each sentence will be automatically tagged with this
        CoreNLPParser instance's tagger.

        If a token contains any whitespace, it will be split into several
        tokens.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :rtype: iter(iter(Tree))
        """
        # Converting list(list(str)) -> list(str)
        sentences = (" ".join(words) for words in sentences)
        return self.raw_parse_sents(sentences, *args, **kwargs)

    def raw_parse(self, sentence, properties=None, *args, **kwargs):
        """Parse a sentence.

        Takes a sentence as a string; before parsing, it will be automatically
        tokenized and tagged by the CoreNLP Parser.

        :param sentence: Input sentence to parse
        :type sentence: str
        :rtype: iter(Tree)
        """
        default_properties = {"tokenize.whitespace": "false"}
        default_properties.update(properties or {})

        return next(
            self.raw_parse_sents(
                [sentence], properties=default_properties, *args, **kwargs
            )
        )

    def api_call(self, data, properties=None, timeout=60):
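        """POST ``data`` to the CoreNLP server and return the decoded JSON.

        ``properties`` are merged over the default annotator settings and sent
        to CoreNLP as the ``properties`` URL parameter, for example::

            parser.api_call("Hello, world.", properties={"annotators": "tokenize"})

        :param data: raw text to annotate
        :param properties: CoreNLP properties overriding the defaults
        :param timeout: request timeout in seconds
        """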
        default_properties = {
            "outputFormat": "json",
            "annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format(
                parser_annotator=self.parser_annotator
            ),
        }

        default_properties.update(properties or {})

        response = self.session.post(
            self.url,
            params={"properties": json.dumps(default_properties)},
            data=data.encode(self.encoding),
            headers={"Content-Type": f"text/plain; charset={self.encoding}"},
            timeout=timeout,
        )

        response.raise_for_status()

        return response.json(strict=self.strict_json)

    def raw_parse_sents(
        self, sentences, verbose=False, properties=None, *args, **kwargs
    ):
        """Parse multiple sentences.

        Takes multiple sentences as a list of strings. Each sentence will be
        automatically tokenized and tagged.

        :param sentences: Input sentences to parse.
        :type sentences: list(str)
        :rtype: iter(iter(Tree))

        """
        default_properties = {
            # Only splits on '\n', never inside the sentence.
            "ssplit.eolonly": "true"
        }

        default_properties.update(properties or {})

        # All sentences are sent to the server in a single request, joined by
        # newlines; `ssplit.eolonly` guarantees that CoreNLP splits exactly on
        # those newlines, so one parse comes back per input sentence.
        parsed_data = self.api_call("\n".join(sentences), properties=default_properties)
        for parsed_sent in parsed_data["sentences"]:
            tree = self.make_tree(parsed_sent)
            yield iter([tree])

    def parse_text(self, text, *args, **kwargs):
        """Parse a piece of text.

        The text might contain several sentences which will be split by CoreNLP.

        :param str text: text to be split.
        :returns: an iterable of syntactic structures.  # TODO: should it be an iterable of iterables?

        """
        parsed_data = self.api_call(text, *args, **kwargs)

        for parse in parsed_data["sentences"]:
            yield self.make_tree(parse)

    def tokenize(self, text, properties=None):
        """Tokenize a string of text.

        Skip these tests if CoreNLP is likely not ready.
        >>> from nltk.test.setup_fixt import check_jar
        >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)

        The CoreNLP server can be started using the following notation, although
        we recommend the `with CoreNLPServer() as server:` context manager notation
        to ensure that the server is always stopped.
        >>> server = CoreNLPServer()
        >>> server.start()
        >>> parser = CoreNLPParser(url=server.url)

        >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'
        >>> list(parser.tokenize(text))
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

        >>> s = "The colour of the wall is blue."
        >>> list(
        ...     parser.tokenize(
        ...         'The colour of the wall is blue.',
        ...         properties={'tokenize.options': 'americanize=true'},
        ...     )
        ... )
        ['The', 'colour', 'of', 'the', 'wall', 'is', 'blue', '.']
        >>> server.stop()

        """
        default_properties = {"annotators": "tokenize,ssplit"}

        default_properties.update(properties or {})

        result = self.api_call(text, properties=default_properties)

        for sentence in result["sentences"]:
            for token in sentence["tokens"]:
                yield token["originalText"] or token["word"]

    def tag_sents(self, sentences):
        """
        Tag multiple sentences.

        Takes multiple sentences as a list where each sentence is a list of
        tokens.

        :param sentences: Input sentences to tag
        :type sentences: list(list(str))
        :rtype: list(list(tuple(str, str)))
        """
        # Converting list(list(str)) -> list(str)
        sentences = (" ".join(words) for words in sentences)
        return [tagged_sents[0] for tagged_sents in self.raw_tag_sents(sentences)]

    def tag(self, sentence: List[str]) -> List[Tuple[str, str]]:
        """
        Tag a list of tokens.

        :rtype: list(tuple(str, str))

        Skip these tests if CoreNLP is likely not ready.
        >>> from nltk.test.setup_fixt import check_jar
        >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)

        The CoreNLP server can be started using the following notation, although
        we recommend the `with CoreNLPServer() as server:` context manager notation
        to ensure that the server is always stopped.
        >>> server = CoreNLPServer()
        >>> server.start()
        >>> parser = CoreNLPParser(url=server.url, tagtype='ner')
        >>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        >>> parser.tag(tokens)  # doctest: +NORMALIZE_WHITESPACE
        [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'),
        ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]

        >>> parser = CoreNLPParser(url=server.url, tagtype='pos')
        >>> tokens = "What is the airspeed of an unladen swallow ?".split()
        >>> parser.tag(tokens)  # doctest: +NORMALIZE_WHITESPACE
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
        ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
        ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        >>> server.stop()
        """
        return self.tag_sents([sentence])[0]

    def raw_tag_sents(self, sentences):
        """
        Tag multiple sentences.

        Takes multiple sentences as a list where each sentence is a string.

        :param sentences: Input sentences to tag
        :type sentences: list(str)
        :rtype: list(list(list(tuple(str, str))))
        """
        default_properties = {
            "ssplit.isOneSentence": "true",
            "annotators": "tokenize,ssplit,",
        }

        # Supports only 'pos' or 'ner' tags.
        assert self.tagtype in ["pos", "ner"]
        default_properties["annotators"] += self.tagtype
        for sentence in sentences:
            tagged_data = self.api_call(sentence, properties=default_properties)
            yield [
                [
                    (token["word"], token[self.tagtype])
                    for token in tagged_sentence["tokens"]
                ]
                for tagged_sentence in tagged_data["sentences"]
            ]


class CoreNLPParser(GenericCoreNLPParser):
    """
    Skip these tests if CoreNLP is likely not ready.
    >>> from nltk.test.setup_fixt import check_jar
    >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)

    The recommended usage of `CoreNLPParser` is using the context manager notation:
    >>> with CoreNLPServer() as server:
    ...     parser = CoreNLPParser(url=server.url)
    ...     next(
    ...         parser.raw_parse('The quick brown fox jumps over the lazy dog.')
    ...     ).pretty_print()  # doctest: +NORMALIZE_WHITESPACE
                         ROOT
                          |
                          S
           _______________|__________________________
          |                         VP               |
          |                _________|___             |
          |               |             PP           |
          |               |     ________|___         |
          NP              |    |            NP       |
      ____|__________     |    |     _______|____    |
     DT   JJ    JJ   NN  VBZ   IN   DT      JJ   NN  .
     |    |     |    |    |    |    |       |    |    |
    The quick brown fox jumps over the     lazy dog   .

    Alternatively, the server can be started using the following notation.
    Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started
    outside of Python.
    >>> server = CoreNLPServer()
    >>> server.start()
    >>> parser = CoreNLPParser(url=server.url)

    >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents(
    ...     [
    ...         'The quick brown fox jumps over the lazy dog.',
    ...         'The quick grey wolf jumps over the lazy fox.',
    ...     ]
    ... )

    >>> parse_fox.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
                         ROOT
                          |
                          S
           _______________|__________________________
          |                         VP               |
          |                _________|___             |
          |               |             PP           |
          |               |     ________|___         |
          NP              |    |            NP       |
      ____|__________     |    |     _______|____    |
     DT   JJ    JJ   NN  VBZ   IN   DT      JJ   NN  .
     |    |     |    |    |    |    |       |    |    |
    The quick brown fox jumps over the     lazy dog   .

    >>> parse_wolf.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
                         ROOT
                          |
                          S
           _______________|__________________________
          |                         VP               |
          |                _________|___             |
          |               |             PP           |
          |               |     ________|___         |
          NP              |    |            NP       |
      ____|_________      |    |     _______|____    |
     DT   JJ   JJ   NN   VBZ   IN   DT      JJ   NN  .
     |    |    |    |     |    |    |       |    |    |
    The quick grey wolf jumps over the     lazy fox   .

    >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents(
    ...     [
    ...         "I 'm a dog".split(),
    ...         "This is my friends ' cat ( the tabby )".split(),
    ...     ]
    ... )

    >>> parse_dog.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
            ROOT
             |
             S
      _______|____
     |            VP
     |    ________|___
     NP  |            NP
     |   |         ___|___
    PRP VBP       DT      NN
     |   |        |       |
     I  'm        a      dog

    >>> parse_friends.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
          ROOT
           |
           S
       ____|___________
      |                VP
      |     ___________|_____________
      |    |                         NP
      |    |     _______|________________________
      |    |    NP               |    |          |
      |    |     _____|_______   |    |          |
      NP   |    NP            |  |    NP         |
      |    |   ______|_________  |  |  ___|____  |
      DT  VBZ PRP$   NNS    POS  NN -LRB- DT  NN -RRB-
      |    |   |      |      |    |   |   |   |    |
     This  is  my  friends   '   cat -LRB- the tabby -RRB-

    >>> parse_john, parse_mary, = parser.parse_text(
    ...     'John loves Mary. Mary walks.'
    ... )

    >>> parse_john.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
          ROOT
           |
           S
      _____|_____________
     |          VP       |
     |      ____|___     |
     NP    |        NP   |
     |     |        |    |
     NNP  VBZ      NNP   .
     |     |        |    |
    John loves     Mary  .

    >>> parse_mary.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
          ROOT
           |
           S
      _____|____
     NP    VP   |
     |     |    |
     NNP  VBZ   .
     |     |    |
    Mary walks  .

    Special cases

    >>> next(
    ...     parser.raw_parse(
    ...         'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war '
    ...         'Jessica Lynch have angrily dismissed claims made in her biography '
    ...         'that she was raped by her Iraqi captors.'
    ...     )
    ... ).height()
    14

    >>> next(
    ...     parser.raw_parse(
    ...         "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or "
    ...         '0.05 percent, at 997.02.'
    ...     )
    ... ).height()
    11

    >>> server.stop()
    """

    _OUTPUT_FORMAT = "penn"
    parser_annotator = "parse"

    def make_tree(self, result):
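        """Build an ``nltk.tree.Tree`` from the ``parse`` field of one CoreNLP sentence."""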
        return Tree.fromstring(result["parse"])


class CoreNLPDependencyParser(GenericCoreNLPParser):
    """Dependency parser.

    Skip these tests if CoreNLP is likely not ready.
    >>> from nltk.test.setup_fixt import check_jar
    >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)

    The recommended usage of `CoreNLPDependencyParser` is using the context manager notation:
    >>> with CoreNLPServer() as server:
    ...     dep_parser = CoreNLPDependencyParser(url=server.url)
    ...     parse, = dep_parser.raw_parse(
    ...         'The quick brown fox jumps over the lazy dog.'
    ...     )
    ...     print(parse.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    The     DT      4       det
    quick   JJ      4       amod
    brown   JJ      4       amod
    fox     NN      5       nsubj
    jumps   VBZ     0       ROOT
    over    IN      9       case
    the     DT      9       det
    lazy    JJ      9       amod
    dog     NN      5       obl
    .       .       5       punct

    Alternatively, the server can be started using the following notation.
    Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started
    outside of Python.
    >>> server = CoreNLPServer()
    >>> server.start()
    >>> dep_parser = CoreNLPDependencyParser(url=server.url)
    >>> parse, = dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.')
    >>> print(parse.tree())  # doctest: +NORMALIZE_WHITESPACE
    (jumps (fox The quick brown) (dog over the lazy) .)

    >>> for governor, dep, dependent in parse.triples():
    ...     print(governor, dep, dependent)  # doctest: +NORMALIZE_WHITESPACE
    ('jumps', 'VBZ') nsubj ('fox', 'NN')
    ('fox', 'NN') det ('The', 'DT')
    ('fox', 'NN') amod ('quick', 'JJ')
    ('fox', 'NN') amod ('brown', 'JJ')
    ('jumps', 'VBZ') obl ('dog', 'NN')
    ('dog', 'NN') case ('over', 'IN')
    ('dog', 'NN') det ('the', 'DT')
    ('dog', 'NN') amod ('lazy', 'JJ')
    ('jumps', 'VBZ') punct ('.', '.')

    >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents(
    ...     [
    ...         'The quick brown fox jumps over the lazy dog.',
    ...         'The quick grey wolf jumps over the lazy fox.',
    ...     ]
    ... )
    >>> print(parse_fox.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    The     DT      4       det
    quick   JJ      4       amod
    brown   JJ      4       amod
    fox     NN      5       nsubj
    jumps   VBZ     0       ROOT
    over    IN      9       case
    the     DT      9       det
    lazy    JJ      9       amod
    dog     NN      5       obl
    .       .       5       punct

    >>> print(parse_dog.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    The     DT      4       det
    quick   JJ      4       amod
    grey    JJ      4       amod
    wolf    NN      5       nsubj
    jumps   VBZ     0       ROOT
    over    IN      9       case
    the     DT      9       det
    lazy    JJ      9       amod
    fox     NN      5       obl
    .       .       5       punct

    >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents(
    ...     [
    ...         "I 'm a dog".split(),
    ...         "This is my friends ' cat ( the tabby )".split(),
    ...     ]
    ... )
    >>> print(parse_dog.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    I       PRP     4       nsubj
    'm      VBP     4       cop
    a       DT      4       det
    dog     NN      0       ROOT

    >>> print(parse_friends.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    This    DT      6       nsubj
    is      VBZ     6       cop
    my      PRP$    4       nmod:poss
    friends NNS     6       nmod:poss
    '       POS     4       case
    cat     NN      0       ROOT
    (       -LRB-   9       punct
    the     DT      9       det
    tabby   NN      6       dep
    )       -RRB-   9       punct

    >>> parse_john, parse_mary, = dep_parser.parse_text(
    ...     'John loves Mary. Mary walks.'
    ... )

    >>> print(parse_john.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    John    NNP     2       nsubj
    loves   VBZ     0       ROOT
    Mary    NNP     2       obj
    .       .       2       punct

    >>> print(parse_mary.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    Mary    NNP     2       nsubj
    walks   VBZ     0       ROOT
    .       .       2       punct

    Special cases

    Non-breaking space inside of a token.

    >>> len(
    ...     next(
    ...         dep_parser.raw_parse(
    ...             'Anhalt said children typically treat a 20-ounce soda bottle as one '
    ...             'serving, while it actually contains 2 1/2 servings.'
    ...         )
    ...     ).nodes
    ... )
    23

    Phone numbers.

    >>> len(
    ...     next(
    ...         dep_parser.raw_parse('This is not going to crash: 01 111 555.')
    ...     ).nodes
    ... )
    10

    >>> print(
    ...     next(
    ...         dep_parser.raw_parse('The underscore _ should not simply disappear.')
    ...     ).to_conll(4)
    ... )  # doctest: +NORMALIZE_WHITESPACE
    The         DT   2   det
    underscore  NN   7   nsubj
    _           NFP  7   punct
    should      MD   7   aux
    not         RB   7   advmod
    simply      RB   7   advmod
    disappear   VB   0   ROOT
    .           .    7   punct

    >>> print(
    ...     next(
    ...         dep_parser.raw_parse(
    ...             'for all of its insights into the dream world of teen life , and its electronic expression through '
    ...             'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 '
    ...             '1/2-hour running time .'
    ...         )
    ...     ).to_conll(4)
    ... )  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    for         IN      2    case
    all         DT      24   obl
    of          IN      5    case
    its         PRP$    5    nmod:poss
    insights    NNS     2    nmod
    into        IN      9    case
    the         DT      9    det
    dream       NN      9    compound
    world       NN      5    nmod
    of          IN      12   case
    teen        NN      12   compound
    ...

    >>> server.stop()
    """

    _OUTPUT_FORMAT = "conll2007"
    parser_annotator = "depparse"

    def make_tree(self, result):
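        """Build a ``DependencyGraph`` from one CoreNLP sentence dictionary."""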
        return DependencyGraph(
            (
                " ".join(n_items[1:])  # NLTK expects an iterable of strings...
                for n_items in sorted(transform(result))
            ),
            cell_separator=" ",  # To make sure that a non-breaking space is kept inside of a token.
        )


def transform(sentence):
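    """Yield CoNLL-style rows for one CoreNLP-annotated sentence.

    For every entry of ``basicDependencies`` this yields a tuple whose first
    element is the 1-based index of the dependent token (used only for
    sorting) followed by the CoNLL-2007 cells that ``DependencyGraph``
    expects.
    """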
    for dependency in sentence["basicDependencies"]:

        dependent_index = dependency["dependent"]
        token = sentence["tokens"][dependent_index - 1]

        # Return values that we don't know as '_'. Also, consider tag and ctag
        # to be equal.
        yield (
            dependent_index,
            "_",
            token["word"],
            token["lemma"],
            token["pos"],
            token["pos"],
            "_",
            str(dependency["governor"]),
            dependency["dep"],
            "_",
            "_",
        )