# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# Contributors: matthewmc, clouds56
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

r"""
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings. For example,
tokenizers can be used to find the words and punctuation in a string:

    >>> from nltk.tokenize import word_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

    >>> from nltk.tokenize import wordpunct_tokenize
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

    >>> from nltk.tokenize import sent_tokenize, word_tokenize
    >>> sent_tokenize(s)
    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
    >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).

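For example, bytes read from a UTF-8 encoded source can be decoded before
tokenizing (a small illustration reusing part of the sample text above):

    >>> data = b"Good muffins cost $3.88"
    >>> word_tokenize(data.decode("utf8"))
    ['Good', 'muffins', 'cost', '$', '3.88']
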
NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers. (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

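Because the spans follow string-slice semantics, the original substrings can be
recovered by slicing (a small illustration; note that the whitespace tokenizer
keeps trailing punctuation attached to its tokens):

    >>> [s[start:end] for (start, end) in WhitespaceTokenizer().span_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy',
    'me', 'two', 'of', 'them.', 'Thanks.']
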
There are numerous ways to tokenize text. If you need more control over
tokenization, see the other methods provided in this package.

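For instance, ``RegexpTokenizer`` accepts a custom token pattern (a small
illustration; the pattern shown is just one possible choice):

    >>> from nltk.tokenize import RegexpTokenizer
    >>> RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 'Please',
    'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
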
For further information, please see Chapter 3 of the NLTK book.
"""

import re

from nltk.data import load
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
from nltk.tokenize.destructive import NLTKWordTokenizer
from nltk.tokenize.legality_principle import LegalitySyllableTokenizer
from nltk.tokenize.mwe import MWETokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.regexp import (
    BlanklineTokenizer,
    RegexpTokenizer,
    WhitespaceTokenizer,
    WordPunctTokenizer,
    blankline_tokenize,
    regexp_tokenize,
    wordpunct_tokenize,
)
from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.simple import (
    LineTokenizer,
    SpaceTokenizer,
    TabTokenizer,
    line_tokenize,
)
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


# Standard sentence tokenizer.
def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus

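    A small usage example (the output shown assumes the English Punkt model is
    available, e.g. after running ``nltk.download("punkt")``):

        >>> from nltk.tokenize import sent_tokenize
        >>> sent_tokenize("Good muffins cost $3.88 in New York. Please buy me two of them.")
        ['Good muffins cost $3.88 in New York.', 'Please buy me two of them.']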
"""
|
||
|
tokenizer = load(f"tokenizers/punkt/{language}.pickle")
|
||
|
return tokenizer.tokenize(text)
|
||
|
|
||
|
|
||
|
# Standard word tokenizer.
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: if True, skip sentence tokenization and treat *text* as a single sentence
    :type preserve_line: bool

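    A small usage example (Treebank-style contraction splitting; the output
    shown assumes the Punkt model is available for the sentence-splitting step):

        >>> from nltk.tokenize import word_tokenize
        >>> word_tokenize("They'll save and invest more.")
        ['They', "'ll", 'save', 'and', 'invest', 'more', '.']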
"""
|
||
|
sentences = [text] if preserve_line else sent_tokenize(text, language)
|
||
|
return [
|
||
|
token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
|
||
|
]
|