339 lines
9.3 KiB
Python
339 lines
9.3 KiB
Python
|
# Natural Language Toolkit: Combinatory Categorial Grammar
|
||
|
#
|
||
|
# Copyright (C) 2001-2023 NLTK Project
|
||
|
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
||
|
# URL: <https://www.nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
"""
|
||
|
CCG Lexicons
|
||
|
"""
|
||
|
|
||
|
import re
|
||
|
from collections import defaultdict
|
||
|
|
||
|
from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory
|
||
|
from nltk.internals import deprecated
|
||
|
from nltk.sem.logic import Expression
|
||
|
|
||
|
# ------------
|
||
|
# Regular expressions used for parsing components of the lexicon
|
||
|
# ------------
|
||
|
|
||
|
# Parses a primitive category and subscripts
|
||
|
PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")
|
||
|
|
||
|
# Separates the next primitive category from the remainder of the
|
||
|
# string
|
||
|
NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")
|
||
|
|
||
|
# Separates the next application operator from the remainder
|
||
|
APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")
|
||
|
|
||
|
# Parses the definition of the right-hand side (rhs) of either a word or a family
|
||
|
LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)
|
||
|
|
||
|
# Parses the right hand side that contains category and maybe semantic predicate
|
||
|
RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)
|
||
|
|
||
|
# Parses the semantic predicate
|
||
|
SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)
|
||
|
|
||
|
# Strips comments from a line
|
||
|
COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
|
||
|
|
||
|
|
||
|
class Token:
|
||
|
"""
|
||
|
Class representing a token.
|
||
|
|
||
|
token => category {semantics}
|
||
|
e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}
|
||
|
|
||
|
* `token` (string)
|
||
|
* `categ` (string)
|
||
|
* `semantics` (Expression)
|
||
|
"""
|
||
|
|
||
|
def __init__(self, token, categ, semantics=None):
|
||
|
self._token = token
|
||
|
self._categ = categ
|
||
|
self._semantics = semantics
|
||
|
|
||
|
def categ(self):
|
||
|
return self._categ
|
||
|
|
||
|
def semantics(self):
|
||
|
return self._semantics
|
||
|
|
||
|
def __str__(self):
|
||
|
semantics_str = ""
|
||
|
if self._semantics is not None:
|
||
|
semantics_str = " {" + str(self._semantics) + "}"
|
||
|
return "" + str(self._categ) + semantics_str
|
||
|
|
||
|
def __cmp__(self, other):
|
||
|
if not isinstance(other, Token):
|
||
|
return -1
|
||
|
return cmp((self._categ, self._semantics), other.categ(), other.semantics())
|
||
|
|
||
|
|
||
|
class CCGLexicon:
|
||
|
"""
|
||
|
Class representing a lexicon for CCG grammars.
|
||
|
|
||
|
* `primitives`: The list of primitive categories for the lexicon
|
||
|
* `families`: Families of categories
|
||
|
* `entries`: A mapping of words to possible categories
|
||
|
"""
|
||
|
|
||
|
def __init__(self, start, primitives, families, entries):
|
||
|
self._start = PrimitiveCategory(start)
|
||
|
self._primitives = primitives
|
||
|
self._families = families
|
||
|
self._entries = entries
|
||
|
|
||
|
def categories(self, word):
|
||
|
"""
|
||
|
Returns all the possible categories for a word
|
||
|
"""
|
||
|
return self._entries[word]
|
||
|
|
||
|
def start(self):
|
||
|
"""
|
||
|
Return the target category for the parser
|
||
|
"""
|
||
|
return self._start
|
||
|
|
||
|
def __str__(self):
|
||
|
"""
|
||
|
String representation of the lexicon. Used for debugging.
|
||
|
"""
|
||
|
string = ""
|
||
|
first = True
|
||
|
for ident in sorted(self._entries):
|
||
|
if not first:
|
||
|
string = string + "\n"
|
||
|
string = string + ident + " => "
|
||
|
|
||
|
first = True
|
||
|
for cat in self._entries[ident]:
|
||
|
if not first:
|
||
|
string = string + " | "
|
||
|
else:
|
||
|
first = False
|
||
|
string = string + "%s" % cat
|
||
|
return string
|
||
|
|
||
|
|
||
|
# -----------
|
||
|
# Parsing lexicons
|
||
|
# -----------
|
||
|
|
||
|
|
||
|
def matchBrackets(string):
|
||
|
"""
|
||
|
Separate the contents matching the first set of brackets from the rest of
|
||
|
the input.
|
||
|
"""
|
||
|
rest = string[1:]
|
||
|
inside = "("
|
||
|
|
||
|
while rest != "" and not rest.startswith(")"):
|
||
|
if rest.startswith("("):
|
||
|
(part, rest) = matchBrackets(rest)
|
||
|
inside = inside + part
|
||
|
else:
|
||
|
inside = inside + rest[0]
|
||
|
rest = rest[1:]
|
||
|
if rest.startswith(")"):
|
||
|
return (inside + ")", rest[1:])
|
||
|
raise AssertionError("Unmatched bracket in string '" + string + "'")
|
||
|
|
||
|
|
||
|
def nextCategory(string):
|
||
|
"""
|
||
|
Separate the string for the next portion of the category from the rest
|
||
|
of the string
|
||
|
"""
|
||
|
if string.startswith("("):
|
||
|
return matchBrackets(string)
|
||
|
return NEXTPRIM_RE.match(string).groups()
|
||
|
|
||
|
|
||
|
def parseApplication(app):
|
||
|
"""
|
||
|
Parse an application operator
|
||
|
"""
|
||
|
return Direction(app[0], app[1:])
|
||
|
|
||
|
|
||
|
def parseSubscripts(subscr):
|
||
|
"""
|
||
|
Parse the subscripts for a primitive category
|
||
|
"""
|
||
|
if subscr:
|
||
|
return subscr[1:-1].split(",")
|
||
|
return []
|
||
|
|
||
|
|
||
|
def parsePrimitiveCategory(chunks, primitives, families, var):
|
||
|
"""
|
||
|
Parse a primitive category
|
||
|
|
||
|
If the primitive is the special category 'var', replace it with the
|
||
|
correct `CCGVar`.
|
||
|
"""
|
||
|
if chunks[0] == "var":
|
||
|
if chunks[1] is None:
|
||
|
if var is None:
|
||
|
var = CCGVar()
|
||
|
return (var, var)
|
||
|
|
||
|
catstr = chunks[0]
|
||
|
if catstr in families:
|
||
|
(cat, cvar) = families[catstr]
|
||
|
if var is None:
|
||
|
var = cvar
|
||
|
else:
|
||
|
cat = cat.substitute([(cvar, var)])
|
||
|
return (cat, var)
|
||
|
|
||
|
if catstr in primitives:
|
||
|
subscrs = parseSubscripts(chunks[1])
|
||
|
return (PrimitiveCategory(catstr, subscrs), var)
|
||
|
raise AssertionError(
|
||
|
"String '" + catstr + "' is neither a family nor primitive category."
|
||
|
)
|
||
|
|
||
|
|
||
|
def augParseCategory(line, primitives, families, var=None):
|
||
|
"""
|
||
|
Parse a string representing a category, and returns a tuple with
|
||
|
(possibly) the CCG variable for the category
|
||
|
"""
|
||
|
(cat_string, rest) = nextCategory(line)
|
||
|
|
||
|
if cat_string.startswith("("):
|
||
|
(res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
|
||
|
|
||
|
else:
|
||
|
(res, var) = parsePrimitiveCategory(
|
||
|
PRIM_RE.match(cat_string).groups(), primitives, families, var
|
||
|
)
|
||
|
|
||
|
while rest != "":
|
||
|
app = APP_RE.match(rest).groups()
|
||
|
direction = parseApplication(app[0:3])
|
||
|
rest = app[3]
|
||
|
|
||
|
(cat_string, rest) = nextCategory(rest)
|
||
|
if cat_string.startswith("("):
|
||
|
(arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
|
||
|
else:
|
||
|
(arg, var) = parsePrimitiveCategory(
|
||
|
PRIM_RE.match(cat_string).groups(), primitives, families, var
|
||
|
)
|
||
|
res = FunctionalCategory(res, arg, direction)
|
||
|
|
||
|
return (res, var)
|
||
|
|
||
|
|
||
|
def fromstring(lex_str, include_semantics=False):
|
||
|
"""
|
||
|
Convert string representation into a lexicon for CCGs.
|
||
|
"""
|
||
|
CCGVar.reset_id()
|
||
|
primitives = []
|
||
|
families = {}
|
||
|
entries = defaultdict(list)
|
||
|
for line in lex_str.splitlines():
|
||
|
# Strip comments and leading/trailing whitespace.
|
||
|
line = COMMENTS_RE.match(line).groups()[0].strip()
|
||
|
if line == "":
|
||
|
continue
|
||
|
|
||
|
if line.startswith(":-"):
|
||
|
# A line of primitive categories.
|
||
|
# The first one is the target category
|
||
|
# ie, :- S, N, NP, VP
|
||
|
primitives = primitives + [
|
||
|
prim.strip() for prim in line[2:].strip().split(",")
|
||
|
]
|
||
|
else:
|
||
|
# Either a family definition, or a word definition
|
||
|
(ident, sep, rhs) = LEX_RE.match(line).groups()
|
||
|
(catstr, semantics_str) = RHS_RE.match(rhs).groups()
|
||
|
(cat, var) = augParseCategory(catstr, primitives, families)
|
||
|
|
||
|
if sep == "::":
|
||
|
# Family definition
|
||
|
# ie, Det :: NP/N
|
||
|
families[ident] = (cat, var)
|
||
|
else:
|
||
|
semantics = None
|
||
|
if include_semantics is True:
|
||
|
if semantics_str is None:
|
||
|
raise AssertionError(
|
||
|
line
|
||
|
+ " must contain semantics because include_semantics is set to True"
|
||
|
)
|
||
|
else:
|
||
|
semantics = Expression.fromstring(
|
||
|
SEMANTICS_RE.match(semantics_str).groups()[0]
|
||
|
)
|
||
|
# Word definition
|
||
|
# ie, which => (N\N)/(S/NP)
|
||
|
entries[ident].append(Token(ident, cat, semantics))
|
||
|
return CCGLexicon(primitives[0], primitives, families, entries)
|
||
|
|
||
|
|
||
|
@deprecated("Use fromstring() instead.")
|
||
|
def parseLexicon(lex_str):
|
||
|
return fromstring(lex_str)
|
||
|
|
||
|
|
||
|
openccg_tinytiny = fromstring(
|
||
|
"""
|
||
|
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
|
||
|
# Only incorporates a subset of the morphological subcategories, however.
|
||
|
:- S,NP,N # Primitive categories
|
||
|
Det :: NP/N # Determiners
|
||
|
Pro :: NP
|
||
|
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
|
||
|
IntransVpl :: S\\NP[pl] # Plural
|
||
|
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
|
||
|
TransVpl :: S\\NP[pl]/NP # Plural
|
||
|
|
||
|
the => NP[sg]/N[sg]
|
||
|
the => NP[pl]/N[pl]
|
||
|
|
||
|
I => Pro
|
||
|
me => Pro
|
||
|
we => Pro
|
||
|
us => Pro
|
||
|
|
||
|
book => N[sg]
|
||
|
books => N[pl]
|
||
|
|
||
|
peach => N[sg]
|
||
|
peaches => N[pl]
|
||
|
|
||
|
policeman => N[sg]
|
||
|
policemen => N[pl]
|
||
|
|
||
|
boy => N[sg]
|
||
|
boys => N[pl]
|
||
|
|
||
|
sleep => IntransVsg
|
||
|
sleep => IntransVpl
|
||
|
|
||
|
eat => IntransVpl
|
||
|
eat => TransVpl
|
||
|
eats => IntransVsg
|
||
|
eats => TransVsg
|
||
|
|
||
|
see => TransVpl
|
||
|
sees => TransVsg
|
||
|
"""
|
||
|
)
|