# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer. # # Copyright (C) 2001-2015 NLTK Project # Author: Jon Dehdari # Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters # # URL: # For license information, see LICENSE.TXT """ The tok-tok tokenizer is a simple, general tokenizer, where the input has one sentence per line; thus only final period is tokenized. Tok-tok has been tested on, and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. The input should be in UTF-8 encoding. Reference: Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University. """ import re from nltk.tokenize.api import TokenizerI class ToktokTokenizer(TokenizerI): """ This is a Python port of the tok-tok.pl from https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl >>> toktok = ToktokTokenizer() >>> text = u'Is 9.5 or 525,600 my favorite number?' >>> print(toktok.tokenize(text, return_str=True)) Is 9.5 or 525,600 my favorite number ? >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things' >>> print(toktok.tokenize(text, return_str=True)) The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf' >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf' >>> assert toktok.tokenize(text, return_str=True) == expected >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf'] True """ # Replace non-breaking spaces with normal spaces. NON_BREAKING = re.compile("\u00A0"), " " # Pad some funky punctuation. FUNKY_PUNCT_1 = re.compile(r'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 " # Pad more funky punctuation. FUNKY_PUNCT_2 = re.compile(r"([({\[“‘„‚«‹「『])"), r" \1 " # Pad En dash and em dash EN_EM_DASHES = re.compile("([–—])"), r" \1 " # Replace problematic character with numeric character reference. AMPERCENT = re.compile("& "), "& " TAB = re.compile("\t"), " " PIPE = re.compile(r"\|"), " | " # Pad numbers with commas to keep them from further tokenization. COMMA_IN_NUM = re.compile(r"(? "something ..." # "something." -> "something ." FINAL_PERIOD_1 = re.compile(r"(? "... stuff ." FINAL_PERIOD_2 = re.compile(r"""(?