# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Graeme Gange
# URL:
# For license information, see LICENSE.TXT
"""
CCG Lexicons
"""

import re
from collections import defaultdict

from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory
from nltk.internals import deprecated
from nltk.sem.logic import Expression

# ------------
# Regular expressions used for parsing components of the lexicon
# ------------

# Parses a primitive category and subscripts
PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")

# Separates the next primitive category from the remainder of the
# string
NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")

# Separates the next application operator from the remainder
APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")

# Parses the definition of the right-hand side (rhs) of either a word or a family
LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)

# Parses the right hand side that contains category and maybe semantic predicate
RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)

# Parses the semantic predicate
SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)

# Strips comments from a line
COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")


class Token:
    """
    Class representing a token.

    token => category {semantics}
    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}

    * `token` (string)
    * `categ` (string)
    * `semantics` (Expression)
    """

    def __init__(self, token, categ, semantics=None):
        self._token = token
        self._categ = categ
        self._semantics = semantics

    def categ(self):
        """Return the category this token was constructed with."""
        return self._categ

    def semantics(self):
        """Return the semantics of this token, or None if it has none."""
        return self._semantics

    def __str__(self):
        """Render as ``category`` or ``category {semantics}``."""
        if self._semantics is None:
            return str(self._categ)
        return str(self._categ) + " {" + str(self._semantics) + "}"

    def __cmp__(self, other):
        """
        Three-way comparison on ``(category, semantics)``.

        Returns -1 when `other` is not a Token; otherwise a negative, zero
        or positive integer in the manner of the Python 2 ``cmp`` builtin.
        """
        if not isinstance(other, Token):
            return -1
        mine = (self._categ, self._semantics)
        theirs = (other.categ(), other.semantics())
        # The ``cmp`` builtin no longer exists in Python 3; this is its
        # documented replacement. Both sides are compared as tuples.
        return (mine > theirs) - (mine < theirs)


class CCGLexicon:
    """
    Class representing a lexicon for CCG grammars.

    * `primitives`: The list of primitive categories for the lexicon
    * `families`: Families of categories
    * `entries`: A mapping of words to possible categories
    """

    def __init__(self, start, primitives, families, entries):
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        """
        Returns all the possible categories for a word
        """
        return self._entries[word]

    def start(self):
        """
        Return the target category for the parser
        """
        return self._start

    def __str__(self):
        """
        String representation of the lexicon. Used for debugging.

        One line per word (sorted), in the form ``word => cat1 | cat2``.
        """
        # Joining at both levels keeps the line separator independent of the
        # alternative separator, so a word whose category list is empty still
        # gets a line of its own.
        return "\n".join(
            ident + " => " + " | ".join(str(cat) for cat in self._entries[ident])
            for ident in sorted(self._entries)
        )


# -----------
# Parsing lexicons
# -----------


def matchBrackets(string):
    """
    Separate the contents matching the first set of brackets from the rest of
    the input.

    The first character of `string` is skipped and taken to be the opening
    bracket. Returns a tuple ``(bracketed, rest)`` where `bracketed` includes
    the enclosing parentheses.

    :raises AssertionError: if no matching closing bracket is found.
    """
    rest = string[1:]
    inside = "("

    while rest != "" and not rest.startswith(")"):
        if rest.startswith("("):
            # Nested group: consume it whole, including its own brackets.
            (part, rest) = matchBrackets(rest)
            inside = inside + part
        else:
            inside = inside + rest[0]
            rest = rest[1:]
    if rest.startswith(")"):
        return (inside + ")", rest[1:])
    raise AssertionError("Unmatched bracket in string '" + string + "'")


def nextCategory(string):
    """
    Separate the string for the next portion of the category from the rest
    of the string

    Returns a pair ``(category_string, rest)``. A leading bracketed group is
    returned with its brackets; otherwise the split is done by NEXTPRIM_RE.
    """
    if string.startswith("("):
        return matchBrackets(string)
    return NEXTPRIM_RE.match(string).groups()


def parseApplication(app):
    """
    Parse an application operator

    `app` is a sequence whose first item is the slash character and whose
    remaining items are passed to `Direction` as its second argument.
    """
    return Direction(app[0], app[1:])


def parseSubscripts(subscr):
    """
    Parse the subscripts for a primitive category

    `subscr` is the bracketed text (e.g. ``[sg,pl]``) or a falsy value.
    Returns the comma-separated items as a list, or an empty list.
    """
    if subscr:
        # Drop the surrounding brackets before splitting.
        return subscr[1:-1].split(",")
    return []


def parsePrimitiveCategory(chunks, primitives, families, var):
    """
    Parse a primitive category

    If the primitive is the special category 'var', replace it with the
    correct `CCGVar`.

    :param chunks: ``(name, subscripts)`` as produced by PRIM_RE.
    :param primitives: names of the declared primitive categories.
    :param families: mapping of family name to ``(category, variable)``.
    :param var: the variable in use for the category being parsed, or None.
    :return: a pair ``(category, var)``.
    :raises AssertionError: if the name is neither a family nor a primitive.
    """
    if chunks[0] == "var":
        if chunks[1] is None:
            if var is None:
                var = CCGVar()
            return (var, var)
        # NOTE(review): a subscripted 'var' (e.g. ``var[pl]``) falls through
        # to the family/primitive lookup below.

    catstr = chunks[0]
    if catstr in families:
        (cat, cvar) = families[catstr]
        if var is None:
            var = cvar
        else:
            # Rename the family's variable to the one already in use.
            cat = cat.substitute([(cvar, var)])
        return (cat, var)

    if catstr in primitives:
        subscrs = parseSubscripts(chunks[1])
        return (PrimitiveCategory(catstr, subscrs), var)
    raise AssertionError(
        "String '" + catstr + "' is neither a family nor primitive category."
    )


def _parse_operand(cat_string, primitives, families, var):
    """Parse one operand: a bracketed sub-category or a primitive category."""
    if cat_string.startswith("("):
        return augParseCategory(cat_string[1:-1], primitives, families, var)
    return parsePrimitiveCategory(
        PRIM_RE.match(cat_string).groups(), primitives, families, var
    )


def augParseCategory(line, primitives, families, var=None):
    """
    Parse a string representing a category, and returns a tuple with
    (possibly) the CCG variable for the category

    Operators are applied left to right: each new argument wraps the result
    built so far in a `FunctionalCategory`.
    """
    (cat_string, rest) = nextCategory(line)
    (res, var) = _parse_operand(cat_string, primitives, families, var)

    while rest != "":
        app = APP_RE.match(rest).groups()
        direction = parseApplication(app[0:3])
        rest = app[3]

        (cat_string, rest) = nextCategory(rest)
        (arg, var) = _parse_operand(cat_string, primitives, families, var)
        res = FunctionalCategory(res, arg, direction)

    return (res, var)


def fromstring(lex_str, include_semantics=False):
    """
    Convert string representation into a lexicon for CCGs.

    The input is processed line by line. ``#`` starts a comment, lines
    starting with ``:-`` declare primitive categories (the first one declared
    becomes the start category), ``name :: category`` defines a family, and
    any other separator matched by LEX_RE defines a word entry.

    :param lex_str: the lexicon text.
    :param include_semantics: when True, every word entry must carry a
        ``{...}`` semantics part, which is parsed with
        `Expression.fromstring`.
    :return: the resulting `CCGLexicon`.
    :raises AssertionError: if `include_semantics` is True and a word entry
        has no semantics.
    """
    CCGVar.reset_id()
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = COMMENTS_RE.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(":-"):
            # A line of primitive categories.
            # The first one is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(",")
            ]
        else:
            # Either a family definition, or a word definition
            (ident, sep, rhs) = LEX_RE.match(line).groups()
            (catstr, semantics_str) = RHS_RE.match(rhs).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)

            if sep == "::":
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                semantics = None
                if include_semantics is True:
                    if semantics_str is None:
                        raise AssertionError(
                            line
                            + " must contain semantics because include_semantics is set to True"
                        )
                    else:
                        semantics = Expression.fromstring(
                            SEMANTICS_RE.match(semantics_str).groups()[0]
                        )
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(Token(ident, cat, semantics))
    return CCGLexicon(primitives[0], primitives, families, entries)


@deprecated("Use fromstring() instead.")
def parseLexicon(lex_str):
    """Deprecated alias for `fromstring` (without semantics)."""
    return fromstring(lex_str)


openccg_tinytiny = fromstring(
    """
    # Rather minimal lexicon based on the openccg `tinytiny' grammar.
    # Only incorporates a subset of the morphological subcategories, however.
    :- S,NP,N                    # Primitive categories
    Det :: NP/N                  # Determiners
    Pro :: NP
    IntransVsg :: S\\NP[sg]    # Tensed intransitive verbs (singular)
    IntransVpl :: S\\NP[pl]    # Plural
    TransVsg :: S\\NP[sg]/NP   # Tensed transitive verbs (singular)
    TransVpl :: S\\NP[pl]/NP   # Plural

    the => NP[sg]/N[sg]
    the => NP[pl]/N[pl]

    I => Pro
    me => Pro
    we => Pro
    us => Pro

    book => N[sg]
    books => N[pl]

    peach => N[sg]
    peaches => N[pl]

    policeman => N[sg]
    policemen => N[pl]

    boy => N[sg]
    boys => N[pl]

    sleep => IntransVsg
    sleep => IntransVpl

    eat => IntransVpl
    eat => TransVpl
    eats => IntransVsg
    eats => TransVsg

    see => TransVpl
    sees => TransVsg
    """
)