591 lines
22 KiB
Python
591 lines
22 KiB
Python
# Natural Language Toolkit: Text Trees
|
|
#
|
|
# Copyright (C) 2001-2023 NLTK Project
|
|
# Author: Edward Loper <edloper@gmail.com>
|
|
# Steven Bird <stevenbird1@gmail.com>
|
|
# Peter Ljunglöf <peter.ljunglof@gu.se>
|
|
# Tom Aarsen <>
|
|
# URL: <https://www.nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
import warnings
|
|
from abc import ABCMeta, abstractmethod
|
|
|
|
from nltk.tree.tree import Tree
|
|
from nltk.util import slice_bounds
|
|
|
|
|
|
######################################################################
|
|
## Parented trees
|
|
######################################################################
|
|
class AbstractParentedTree(Tree, metaclass=ABCMeta):
|
|
"""
|
|
An abstract base class for a ``Tree`` that automatically maintains
|
|
pointers to parent nodes. These parent pointers are updated
|
|
whenever any change is made to a tree's structure. Two subclasses
|
|
are currently defined:
|
|
|
|
- ``ParentedTree`` is used for tree structures where each subtree
|
|
has at most one parent. This class should be used in cases
|
|
where there is no"sharing" of subtrees.
|
|
|
|
- ``MultiParentedTree`` is used for tree structures where a
|
|
subtree may have zero or more parents. This class should be
|
|
used in cases where subtrees may be shared.
|
|
|
|
Subclassing
|
|
===========
|
|
The ``AbstractParentedTree`` class redefines all operations that
|
|
modify a tree's structure to call two methods, which are used by
|
|
subclasses to update parent information:
|
|
|
|
- ``_setparent()`` is called whenever a new child is added.
|
|
- ``_delparent()`` is called whenever a child is removed.
|
|
"""
|
|
|
|
def __init__(self, node, children=None):
|
|
super().__init__(node, children)
|
|
# If children is None, the tree is read from node, and
|
|
# all parents will be set during parsing.
|
|
if children is not None:
|
|
# Otherwise we have to set the parent of the children.
|
|
# Iterate over self, and *not* children, because children
|
|
# might be an iterator.
|
|
for i, child in enumerate(self):
|
|
if isinstance(child, Tree):
|
|
self._setparent(child, i, dry_run=True)
|
|
for i, child in enumerate(self):
|
|
if isinstance(child, Tree):
|
|
self._setparent(child, i)
|
|
|
|
# ////////////////////////////////////////////////////////////
|
|
# Parent management
|
|
# ////////////////////////////////////////////////////////////
|
|
@abstractmethod
|
|
def _setparent(self, child, index, dry_run=False):
|
|
"""
|
|
Update the parent pointer of ``child`` to point to ``self``. This
|
|
method is only called if the type of ``child`` is ``Tree``;
|
|
i.e., it is not called when adding a leaf to a tree. This method
|
|
is always called before the child is actually added to the
|
|
child list of ``self``.
|
|
|
|
:type child: Tree
|
|
:type index: int
|
|
:param index: The index of ``child`` in ``self``.
|
|
:raise TypeError: If ``child`` is a tree with an impropriate
|
|
type. Typically, if ``child`` is a tree, then its type needs
|
|
to match the type of ``self``. This prevents mixing of
|
|
different tree types (single-parented, multi-parented, and
|
|
non-parented).
|
|
:param dry_run: If true, the don't actually set the child's
|
|
parent pointer; just check for any error conditions, and
|
|
raise an exception if one is found.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def _delparent(self, child, index):
|
|
"""
|
|
Update the parent pointer of ``child`` to not point to self. This
|
|
method is only called if the type of ``child`` is ``Tree``; i.e., it
|
|
is not called when removing a leaf from a tree. This method
|
|
is always called before the child is actually removed from the
|
|
child list of ``self``.
|
|
|
|
:type child: Tree
|
|
:type index: int
|
|
:param index: The index of ``child`` in ``self``.
|
|
"""
|
|
|
|
# ////////////////////////////////////////////////////////////
|
|
# Methods that add/remove children
|
|
# ////////////////////////////////////////////////////////////
|
|
# Every method that adds or removes a child must make
|
|
# appropriate calls to _setparent() and _delparent().
|
|
|
|
def __delitem__(self, index):
|
|
# del ptree[start:stop]
|
|
if isinstance(index, slice):
|
|
start, stop, step = slice_bounds(self, index, allow_step=True)
|
|
# Clear all the children pointers.
|
|
for i in range(start, stop, step):
|
|
if isinstance(self[i], Tree):
|
|
self._delparent(self[i], i)
|
|
# Delete the children from our child list.
|
|
super().__delitem__(index)
|
|
|
|
# del ptree[i]
|
|
elif isinstance(index, int):
|
|
if index < 0:
|
|
index += len(self)
|
|
if index < 0:
|
|
raise IndexError("index out of range")
|
|
# Clear the child's parent pointer.
|
|
if isinstance(self[index], Tree):
|
|
self._delparent(self[index], index)
|
|
# Remove the child from our child list.
|
|
super().__delitem__(index)
|
|
|
|
elif isinstance(index, (list, tuple)):
|
|
# del ptree[()]
|
|
if len(index) == 0:
|
|
raise IndexError("The tree position () may not be deleted.")
|
|
# del ptree[(i,)]
|
|
elif len(index) == 1:
|
|
del self[index[0]]
|
|
# del ptree[i1, i2, i3]
|
|
else:
|
|
del self[index[0]][index[1:]]
|
|
|
|
else:
|
|
raise TypeError(
|
|
"%s indices must be integers, not %s"
|
|
% (type(self).__name__, type(index).__name__)
|
|
)
|
|
|
|
def __setitem__(self, index, value):
|
|
# ptree[start:stop] = value
|
|
if isinstance(index, slice):
|
|
start, stop, step = slice_bounds(self, index, allow_step=True)
|
|
# make a copy of value, in case it's an iterator
|
|
if not isinstance(value, (list, tuple)):
|
|
value = list(value)
|
|
# Check for any error conditions, so we can avoid ending
|
|
# up in an inconsistent state if an error does occur.
|
|
for i, child in enumerate(value):
|
|
if isinstance(child, Tree):
|
|
self._setparent(child, start + i * step, dry_run=True)
|
|
# clear the child pointers of all parents we're removing
|
|
for i in range(start, stop, step):
|
|
if isinstance(self[i], Tree):
|
|
self._delparent(self[i], i)
|
|
# set the child pointers of the new children. We do this
|
|
# after clearing *all* child pointers, in case we're e.g.
|
|
# reversing the elements in a tree.
|
|
for i, child in enumerate(value):
|
|
if isinstance(child, Tree):
|
|
self._setparent(child, start + i * step)
|
|
# finally, update the content of the child list itself.
|
|
super().__setitem__(index, value)
|
|
|
|
# ptree[i] = value
|
|
elif isinstance(index, int):
|
|
if index < 0:
|
|
index += len(self)
|
|
if index < 0:
|
|
raise IndexError("index out of range")
|
|
# if the value is not changing, do nothing.
|
|
if value is self[index]:
|
|
return
|
|
# Set the new child's parent pointer.
|
|
if isinstance(value, Tree):
|
|
self._setparent(value, index)
|
|
# Remove the old child's parent pointer
|
|
if isinstance(self[index], Tree):
|
|
self._delparent(self[index], index)
|
|
# Update our child list.
|
|
super().__setitem__(index, value)
|
|
|
|
elif isinstance(index, (list, tuple)):
|
|
# ptree[()] = value
|
|
if len(index) == 0:
|
|
raise IndexError("The tree position () may not be assigned to.")
|
|
# ptree[(i,)] = value
|
|
elif len(index) == 1:
|
|
self[index[0]] = value
|
|
# ptree[i1, i2, i3] = value
|
|
else:
|
|
self[index[0]][index[1:]] = value
|
|
|
|
else:
|
|
raise TypeError(
|
|
"%s indices must be integers, not %s"
|
|
% (type(self).__name__, type(index).__name__)
|
|
)
|
|
|
|
def append(self, child):
|
|
if isinstance(child, Tree):
|
|
self._setparent(child, len(self))
|
|
super().append(child)
|
|
|
|
def extend(self, children):
|
|
for child in children:
|
|
if isinstance(child, Tree):
|
|
self._setparent(child, len(self))
|
|
super().append(child)
|
|
|
|
def insert(self, index, child):
|
|
# Handle negative indexes. Note that if index < -len(self),
|
|
# we do *not* raise an IndexError, unlike __getitem__. This
|
|
# is done for consistency with list.__getitem__ and list.index.
|
|
if index < 0:
|
|
index += len(self)
|
|
if index < 0:
|
|
index = 0
|
|
# Set the child's parent, and update our child list.
|
|
if isinstance(child, Tree):
|
|
self._setparent(child, index)
|
|
super().insert(index, child)
|
|
|
|
def pop(self, index=-1):
|
|
if index < 0:
|
|
index += len(self)
|
|
if index < 0:
|
|
raise IndexError("index out of range")
|
|
if isinstance(self[index], Tree):
|
|
self._delparent(self[index], index)
|
|
return super().pop(index)
|
|
|
|
# n.b.: like `list`, this is done by equality, not identity!
|
|
# To remove a specific child, use del ptree[i].
|
|
def remove(self, child):
|
|
index = self.index(child)
|
|
if isinstance(self[index], Tree):
|
|
self._delparent(self[index], index)
|
|
super().remove(child)
|
|
|
|
# We need to implement __getslice__ and friends, even though
|
|
# they're deprecated, because otherwise list.__getslice__ will get
|
|
# called (since we're subclassing from list). Just delegate to
|
|
# __getitem__ etc., but use max(0, start) and max(0, stop) because
|
|
# because negative indices are already handled *before*
|
|
# __getslice__ is called; and we don't want to double-count them.
|
|
if hasattr(list, "__getslice__"):
|
|
|
|
def __getslice__(self, start, stop):
|
|
return self.__getitem__(slice(max(0, start), max(0, stop)))
|
|
|
|
def __delslice__(self, start, stop):
|
|
return self.__delitem__(slice(max(0, start), max(0, stop)))
|
|
|
|
def __setslice__(self, start, stop, value):
|
|
return self.__setitem__(slice(max(0, start), max(0, stop)), value)
|
|
|
|
def __getnewargs__(self):
|
|
"""Method used by the pickle module when un-pickling.
|
|
This method provides the arguments passed to ``__new__``
|
|
upon un-pickling. Without this method, ParentedTree instances
|
|
cannot be pickled and unpickled in Python 3.7+ onwards.
|
|
|
|
:return: Tuple of arguments for ``__new__``, i.e. the label
|
|
and the children of this node.
|
|
:rtype: Tuple[Any, List[AbstractParentedTree]]
|
|
"""
|
|
return (self._label, list(self))
|
|
|
|
|
|
class ParentedTree(AbstractParentedTree):
|
|
"""
|
|
A ``Tree`` that automatically maintains parent pointers for
|
|
single-parented trees. The following are methods for querying
|
|
the structure of a parented tree: ``parent``, ``parent_index``,
|
|
``left_sibling``, ``right_sibling``, ``root``, ``treeposition``.
|
|
|
|
Each ``ParentedTree`` may have at most one parent. In
|
|
particular, subtrees may not be shared. Any attempt to reuse a
|
|
single ``ParentedTree`` as a child of more than one parent (or
|
|
as multiple children of the same parent) will cause a
|
|
``ValueError`` exception to be raised.
|
|
|
|
``ParentedTrees`` should never be used in the same tree as ``Trees``
|
|
or ``MultiParentedTrees``. Mixing tree implementations may result
|
|
in incorrect parent pointers and in ``TypeError`` exceptions.
|
|
"""
|
|
|
|
def __init__(self, node, children=None):
|
|
self._parent = None
|
|
"""The parent of this Tree, or None if it has no parent."""
|
|
super().__init__(node, children)
|
|
if children is None:
|
|
# If children is None, the tree is read from node.
|
|
# After parsing, the parent of the immediate children
|
|
# will point to an intermediate tree, not self.
|
|
# We fix this by brute force:
|
|
for i, child in enumerate(self):
|
|
if isinstance(child, Tree):
|
|
child._parent = None
|
|
self._setparent(child, i)
|
|
|
|
def _frozen_class(self):
|
|
from nltk.tree.immutable import ImmutableParentedTree
|
|
|
|
return ImmutableParentedTree
|
|
|
|
def copy(self, deep=False):
|
|
if not deep:
|
|
warnings.warn(
|
|
f"{self.__class__.__name__} objects do not support shallow copies. Defaulting to a deep copy."
|
|
)
|
|
return super().copy(deep=True)
|
|
|
|
# /////////////////////////////////////////////////////////////////
|
|
# Methods
|
|
# /////////////////////////////////////////////////////////////////
|
|
|
|
def parent(self):
|
|
"""The parent of this tree, or None if it has no parent."""
|
|
return self._parent
|
|
|
|
def parent_index(self):
|
|
"""
|
|
The index of this tree in its parent. I.e.,
|
|
``ptree.parent()[ptree.parent_index()] is ptree``. Note that
|
|
``ptree.parent_index()`` is not necessarily equal to
|
|
``ptree.parent.index(ptree)``, since the ``index()`` method
|
|
returns the first child that is equal to its argument.
|
|
"""
|
|
if self._parent is None:
|
|
return None
|
|
for i, child in enumerate(self._parent):
|
|
if child is self:
|
|
return i
|
|
assert False, "expected to find self in self._parent!"
|
|
|
|
def left_sibling(self):
|
|
"""The left sibling of this tree, or None if it has none."""
|
|
parent_index = self.parent_index()
|
|
if self._parent and parent_index > 0:
|
|
return self._parent[parent_index - 1]
|
|
return None # no left sibling
|
|
|
|
def right_sibling(self):
|
|
"""The right sibling of this tree, or None if it has none."""
|
|
parent_index = self.parent_index()
|
|
if self._parent and parent_index < (len(self._parent) - 1):
|
|
return self._parent[parent_index + 1]
|
|
return None # no right sibling
|
|
|
|
def root(self):
|
|
"""
|
|
The root of this tree. I.e., the unique ancestor of this tree
|
|
whose parent is None. If ``ptree.parent()`` is None, then
|
|
``ptree`` is its own root.
|
|
"""
|
|
root = self
|
|
while root.parent() is not None:
|
|
root = root.parent()
|
|
return root
|
|
|
|
def treeposition(self):
|
|
"""
|
|
The tree position of this tree, relative to the root of the
|
|
tree. I.e., ``ptree.root[ptree.treeposition] is ptree``.
|
|
"""
|
|
if self.parent() is None:
|
|
return ()
|
|
else:
|
|
return self.parent().treeposition() + (self.parent_index(),)
|
|
|
|
# /////////////////////////////////////////////////////////////////
|
|
# Parent Management
|
|
# /////////////////////////////////////////////////////////////////
|
|
|
|
def _delparent(self, child, index):
|
|
# Sanity checks
|
|
assert isinstance(child, ParentedTree)
|
|
assert self[index] is child
|
|
assert child._parent is self
|
|
|
|
# Delete child's parent pointer.
|
|
child._parent = None
|
|
|
|
def _setparent(self, child, index, dry_run=False):
|
|
# If the child's type is incorrect, then complain.
|
|
if not isinstance(child, ParentedTree):
|
|
raise TypeError("Can not insert a non-ParentedTree into a ParentedTree")
|
|
|
|
# If child already has a parent, then complain.
|
|
if hasattr(child, "_parent") and child._parent is not None:
|
|
raise ValueError("Can not insert a subtree that already has a parent.")
|
|
|
|
# Set child's parent pointer & index.
|
|
if not dry_run:
|
|
child._parent = self
|
|
|
|
|
|
class MultiParentedTree(AbstractParentedTree):
|
|
"""
|
|
A ``Tree`` that automatically maintains parent pointers for
|
|
multi-parented trees. The following are methods for querying the
|
|
structure of a multi-parented tree: ``parents()``, ``parent_indices()``,
|
|
``left_siblings()``, ``right_siblings()``, ``roots``, ``treepositions``.
|
|
|
|
Each ``MultiParentedTree`` may have zero or more parents. In
|
|
particular, subtrees may be shared. If a single
|
|
``MultiParentedTree`` is used as multiple children of the same
|
|
parent, then that parent will appear multiple times in its
|
|
``parents()`` method.
|
|
|
|
``MultiParentedTrees`` should never be used in the same tree as
|
|
``Trees`` or ``ParentedTrees``. Mixing tree implementations may
|
|
result in incorrect parent pointers and in ``TypeError`` exceptions.
|
|
"""
|
|
|
|
def __init__(self, node, children=None):
|
|
self._parents = []
|
|
"""A list of this tree's parents. This list should not
|
|
contain duplicates, even if a parent contains this tree
|
|
multiple times."""
|
|
super().__init__(node, children)
|
|
if children is None:
|
|
# If children is None, the tree is read from node.
|
|
# After parsing, the parent(s) of the immediate children
|
|
# will point to an intermediate tree, not self.
|
|
# We fix this by brute force:
|
|
for i, child in enumerate(self):
|
|
if isinstance(child, Tree):
|
|
child._parents = []
|
|
self._setparent(child, i)
|
|
|
|
def _frozen_class(self):
|
|
from nltk.tree.immutable import ImmutableMultiParentedTree
|
|
|
|
return ImmutableMultiParentedTree
|
|
|
|
# /////////////////////////////////////////////////////////////////
|
|
# Methods
|
|
# /////////////////////////////////////////////////////////////////
|
|
|
|
def parents(self):
|
|
"""
|
|
The set of parents of this tree. If this tree has no parents,
|
|
then ``parents`` is the empty set. To check if a tree is used
|
|
as multiple children of the same parent, use the
|
|
``parent_indices()`` method.
|
|
|
|
:type: list(MultiParentedTree)
|
|
"""
|
|
return list(self._parents)
|
|
|
|
def left_siblings(self):
|
|
"""
|
|
A list of all left siblings of this tree, in any of its parent
|
|
trees. A tree may be its own left sibling if it is used as
|
|
multiple contiguous children of the same parent. A tree may
|
|
appear multiple times in this list if it is the left sibling
|
|
of this tree with respect to multiple parents.
|
|
|
|
:type: list(MultiParentedTree)
|
|
"""
|
|
return [
|
|
parent[index - 1]
|
|
for (parent, index) in self._get_parent_indices()
|
|
if index > 0
|
|
]
|
|
|
|
def right_siblings(self):
|
|
"""
|
|
A list of all right siblings of this tree, in any of its parent
|
|
trees. A tree may be its own right sibling if it is used as
|
|
multiple contiguous children of the same parent. A tree may
|
|
appear multiple times in this list if it is the right sibling
|
|
of this tree with respect to multiple parents.
|
|
|
|
:type: list(MultiParentedTree)
|
|
"""
|
|
return [
|
|
parent[index + 1]
|
|
for (parent, index) in self._get_parent_indices()
|
|
if index < (len(parent) - 1)
|
|
]
|
|
|
|
def _get_parent_indices(self):
|
|
return [
|
|
(parent, index)
|
|
for parent in self._parents
|
|
for index, child in enumerate(parent)
|
|
if child is self
|
|
]
|
|
|
|
def roots(self):
|
|
"""
|
|
The set of all roots of this tree. This set is formed by
|
|
tracing all possible parent paths until trees with no parents
|
|
are found.
|
|
|
|
:type: list(MultiParentedTree)
|
|
"""
|
|
return list(self._get_roots_helper({}).values())
|
|
|
|
def _get_roots_helper(self, result):
|
|
if self._parents:
|
|
for parent in self._parents:
|
|
parent._get_roots_helper(result)
|
|
else:
|
|
result[id(self)] = self
|
|
return result
|
|
|
|
def parent_indices(self, parent):
|
|
"""
|
|
Return a list of the indices where this tree occurs as a child
|
|
of ``parent``. If this child does not occur as a child of
|
|
``parent``, then the empty list is returned. The following is
|
|
always true::
|
|
|
|
for parent_index in ptree.parent_indices(parent):
|
|
parent[parent_index] is ptree
|
|
"""
|
|
if parent not in self._parents:
|
|
return []
|
|
else:
|
|
return [index for (index, child) in enumerate(parent) if child is self]
|
|
|
|
def treepositions(self, root):
|
|
"""
|
|
Return a list of all tree positions that can be used to reach
|
|
this multi-parented tree starting from ``root``. I.e., the
|
|
following is always true::
|
|
|
|
for treepos in ptree.treepositions(root):
|
|
root[treepos] is ptree
|
|
"""
|
|
if self is root:
|
|
return [()]
|
|
else:
|
|
return [
|
|
treepos + (index,)
|
|
for parent in self._parents
|
|
for treepos in parent.treepositions(root)
|
|
for (index, child) in enumerate(parent)
|
|
if child is self
|
|
]
|
|
|
|
# /////////////////////////////////////////////////////////////////
|
|
# Parent Management
|
|
# /////////////////////////////////////////////////////////////////
|
|
|
|
def _delparent(self, child, index):
|
|
# Sanity checks
|
|
assert isinstance(child, MultiParentedTree)
|
|
assert self[index] is child
|
|
assert len([p for p in child._parents if p is self]) == 1
|
|
|
|
# If the only copy of child in self is at index, then delete
|
|
# self from child's parent list.
|
|
for i, c in enumerate(self):
|
|
if c is child and i != index:
|
|
break
|
|
else:
|
|
child._parents.remove(self)
|
|
|
|
def _setparent(self, child, index, dry_run=False):
|
|
# If the child's type is incorrect, then complain.
|
|
if not isinstance(child, MultiParentedTree):
|
|
raise TypeError(
|
|
"Can not insert a non-MultiParentedTree into a MultiParentedTree"
|
|
)
|
|
|
|
# Add self as a parent pointer if it's not already listed.
|
|
if not dry_run:
|
|
for parent in child._parents:
|
|
if parent is self:
|
|
break
|
|
else:
|
|
child._parents.append(self)
|
|
|
|
|
|
__all__ = [
|
|
"ParentedTree",
|
|
"MultiParentedTree",
|
|
]
|