# Natural Language Toolkit: Tokenizer Utilities
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

from re import finditer
from xml.sax.saxutils import escape, unescape


def string_span_tokenize(s, sep):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each occurrence of *sep*.

        >>> from nltk.tokenize.util import string_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(string_span_tokenize(s, " ")) # doctest: +NORMALIZE_WHITESPACE
        [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
        (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param sep: the token separator
    :type sep: str
    :rtype: iter(tuple(int, int))
    """
    if len(sep) == 0:
        raise ValueError("Token delimiter must not be empty")
    left = 0
    while True:
        try:
            right = s.index(sep, left)
            if right != 0:
                yield left, right
        except ValueError:
            if left != len(s):
                yield left, len(s)
            break

        left = right + len(sep)


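# A quick sketch of the empty-span behaviour above (illustrative, not an
# official doctest): consecutive separators yield zero-width spans, like the
# (37, 37) span produced by the double space in the docstring example.
#
#     list(string_span_tokenize("a,b,,c", ","))
#     # -> [(0, 1), (2, 3), (4, 4), (5, 6)]

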
def regexp_span_tokenize(s, regexp):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each successive match of *regexp*.

        >>> from nltk.tokenize.util import regexp_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(regexp_span_tokenize(s, r'\s')) # doctest: +NORMALIZE_WHITESPACE
        [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
        (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param regexp: regular expression that matches token separators (must not be empty)
    :type regexp: str
    :rtype: iter(tuple(int, int))
    """
    left = 0
    for m in finditer(regexp, s):
        # The match spans the separator: its start is the right edge of the
        # current token, its end is the left edge of the next one.
        right, next_left = m.span()
        if right != left:
            yield left, right
        left = next_left
    yield left, len(s)


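# Illustrative sketch: with a quantified pattern such as r"\s+", a run of
# separators is consumed as one match, so no empty spans appear; note that
# the trailing ``yield left, len(s)`` always emits a final span, which is
# zero-width when the string ends with a separator.
#
#     list(regexp_span_tokenize("to  be", r"\s+"))
#     # -> [(0, 2), (4, 6)]

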
def spans_to_relative(spans):
    r"""
    Return a sequence of relative spans, given a sequence of spans.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> from nltk.tokenize.util import spans_to_relative
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) # doctest: +NORMALIZE_WHITESPACE
        [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
        (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]

    :param spans: a sequence of (start, end) offsets of the tokens
    :type spans: iter(tuple(int, int))
    :rtype: iter(tuple(int, int))
    """
    prev = 0
    for left, right in spans:
        yield left - prev, right - left
        prev = right


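# Each relative pair is (gap since the previous token's end, token length),
# a compact delta encoding of the absolute offsets. A small sketch:
#
#     list(spans_to_relative([(0, 4), (5, 12), (13, 17)]))
#     # -> [(0, 4), (1, 7), (1, 4)]

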
class CJKChars:
    """
    An object that enumerates the code points of the CJK characters as listed on
    https://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    This is a Python port of the CJK code point enumerations of the Moses tokenizer:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
    """

    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (4352, 4607)  # (ord(u"\u1100"), ord(u"\u11ff"))

    # CJK Radicals Supplement (2E80–2EFF)
    # Kangxi Radicals (2F00–2FDF)
    # Ideographic Description Characters (2FF0–2FFF)
    # CJK Symbols and Punctuation (3000–303F)
    # Hiragana (3040–309F)
    # Katakana (30A0–30FF)
    # Bopomofo (3100–312F)
    # Hangul Compatibility Jamo (3130–318F)
    # Kanbun (3190–319F)
    # Bopomofo Extended (31A0–31BF)
    # CJK Strokes (31C0–31EF)
    # Katakana Phonetic Extensions (31F0–31FF)
    # Enclosed CJK Letters and Months (3200–32FF)
    # CJK Compatibility (3300–33FF)
    # CJK Unified Ideographs Extension A (3400–4DBF)
    # Yijing Hexagram Symbols (4DC0–4DFF)
    # CJK Unified Ideographs (4E00–9FFF)
    # Yi Syllables (A000–A48F)
    # Yi Radicals (A490–A4CF)
    CJK_Radicals = (11904, 42191)  # (ord(u"\u2e80"), ord(u"\ua4cf"))

    # Phags-pa (A840–A87F)
    Phags_Pa = (43072, 43135)  # (ord(u"\ua840"), ord(u"\ua87f"))

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (44032, 55215)  # (ord(u"\uAC00"), ord(u"\uD7AF"))

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (63744, 64255)  # (ord(u"\uF900"), ord(u"\uFAFF"))

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (65072, 65103)  # (ord(u"\uFE30"), ord(u"\uFE4F"))

    # Range U+FF65–FFDC encodes halfwidth forms of Katakana and Hangul characters
    Katakana_Hangul_Halfwidth = (65381, 65500)  # (ord(u"\uFF65"), ord(u"\uFFDC"))

    # Supplementary Ideographic Plane 20000–2FFFF
    Supplementary_Ideographic_Plane = (
        131072,
        196607,
    )  # (ord(u"\U00020000"), ord(u"\U0002FFFF"))

    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Supplementary_Ideographic_Plane,
    ]


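# Illustrative: ord("\u4e00") == 19968 (the first CJK Unified Ideograph)
# falls inside CJK_Radicals == (11904, 42191), so is_cjk("\u4e00") below
# returns True.

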
def is_cjk(character):
    """
    Python port of Moses' code to check for CJK character.

        >>> CJKChars().ranges
        [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
        >>> is_cjk(u'\u33fe')
        True
        >>> is_cjk(u'\uFE5F')
        False

    :param character: The character that needs to be checked.
    :type character: char
    :return: bool
    """
    # Test the code point against each CJK range; the ranges are the same
    # eight (start, end) pairs enumerated by CJKChars above.
    return any(start <= ord(character) <= end for start, end in CJKChars.ranges)


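# Usage sketch: picking the CJK characters out of a mixed string.
#
#     [c for c in "NLTK 自然言語" if is_cjk(c)]
#     # -> ['自', '然', '言', '語']

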
def xml_escape(text):
    """
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.

    Note that the default xml.sax.saxutils.escape() function doesn't escape
    some characters that Moses does, so we have to add them to the entities
    dictionary manually.

        >>> input_str = ''')| & < > ' " ] ['''
        >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
        >>> escape(input_str) == expected_output
        True
        >>> xml_escape(input_str)
        ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    """
    return escape(
        text,
        entities={
            r"'": r"&apos;",
            r'"': r"&quot;",
            r"|": r"&#124;",
            r"[": r"&#91;",
            r"]": r"&#93;",
        },
    )


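# Illustrative: only the five extra Moses entities are added on top of the
# defaults (&amp;, &lt;, &gt;) handled by escape() itself, e.g.
#
#     xml_escape("a|b")
#     # -> 'a&#124;b'

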
def xml_unescape(text):
    """
    This function transforms the "escaped" version suitable for well-formed
    XML formatting back into a human-readable string.

    Note that the default xml.sax.saxutils.unescape() function doesn't
    unescape some characters that Moses does, so we have to add them to the
    entities dictionary manually.

        >>> from xml.sax.saxutils import unescape
        >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
        >>> expected = ''')| & < > \' " ] ['''
        >>> xml_unescape(s) == expected
        True

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    """
    return unescape(
        text,
        entities={
            r"&apos;": r"'",
            r"&quot;": r'"',
            r"&#124;": r"|",
            r"&#91;": r"[",
            r"&#93;": r"]",
        },
    )


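# Round-trip sketch: xml_unescape inverts xml_escape on plain text.
#
#     xml_unescape(xml_escape("[a] | 'b'")) == "[a] | 'b'"
#     # -> True

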
def align_tokens(tokens, sentence):
    """
    Attempt to find the offsets of the tokens in *sentence*, as a sequence
    of ``(start, end)`` tuples, given the tokens and the source string.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> from nltk.tokenize.util import align_tokens
        >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
        ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
        ... "on Saturday.")
        >>> tokens = TreebankWordTokenizer().tokenize(s)
        >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
        ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
        ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
        ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
        ... (123, 131), (131, 132)]
        >>> output = list(align_tokens(tokens, s))
        >>> len(tokens) == len(expected) == len(output)  # Check that the lengths of the tokens and tuples are the same.
        True
        >>> expected == list(align_tokens(tokens, s))  # Check that the output is as expected.
        True
        >>> tokens == [s[start:end] for start, end in output]  # Check that the slices of the string correspond to the tokens.
        True

    :param tokens: The list of strings that are the result of tokenization
    :type tokens: list(str)
    :param sentence: The original string
    :type sentence: str
    :rtype: list(tuple(int,int))
    """
    point = 0
    offsets = []
    for token in tokens:
        try:
            start = sentence.index(token, point)
        except ValueError as e:
            raise ValueError(f'substring "{token}" not found in "{sentence}"') from e
        point = start + len(token)
        offsets.append((start, point))
    return offsets


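# Minimal sketch: tokens must occur left-to-right in the source string;
# the search resumes after each match, so repeated tokens align correctly.
#
#     align_tokens(["a", "b", "a"], "a b a")
#     # -> [(0, 1), (2, 3), (4, 5)]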