ai-content-maker/.venv/Lib/site-packages/pysbd/processor.py

# -*- coding: utf-8 -*-
import re
from pysbd.utils import Text
from pysbd.lists_item_replacer import ListItemReplacer
from pysbd.exclamation_words import ExclamationWords
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.abbreviation_replacer import AbbreviationReplacer

class Processor(object):

    def __init__(self, text, lang, char_span=False):
        """Process a text - do pre and post processing - to get proper sentences

        Parameters
        ----------
        text : str
            Original text
        language : object
            Language module
        char_span : bool, optional
            Get start & end character offsets of each sentences
            within original text, by default False
        """
        self.text = text
        self.lang = lang
        self.char_span = char_span

    def process(self):
        if not self.text:
            return self.text
        self.text = self.text.replace('\n', '\r')
        li = ListItemReplacer(self.text)
        self.text = li.add_line_break()
        self.replace_abbreviations()
        self.replace_numbers()
        self.replace_continuous_punctuation()
        self.replace_periods_before_numeric_references()
        self.text = Text(self.text).apply(
            self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
            self.lang.GeoLocationRule, self.lang.FileFormatRule)
        postprocessed_sents = self.split_into_segments()
        return postprocessed_sents

    def rm_none_flatten(self, sents):
        """Remove None values and unpack list of list sents

        Parameters
        ----------
        sents : list
            list of sentences

        Returns
        -------
        list
            unpacked and None removed list of sents
        """
        sents = list(filter(None, sents))
        if not any(isinstance(s, list) for s in sents):
            return sents
        new_sents = []
        for sent in sents:
            if isinstance(sent, list):
                for s in sent:
                    new_sents.append(s)
            else:
                new_sents.append(sent)
        return new_sents

    def split_into_segments(self):
        self.check_for_parens_between_quotes()
        sents = self.text.split('\r')
        # remove empty and none values
        sents = self.rm_none_flatten(sents)
        sents = [
            Text(s).apply(self.lang.SingleNewLineRule, *self.lang.EllipsisRules.All)
            for s in sents
        ]
        sents = [self.check_for_punctuation(s) for s in sents]
        # flatten list of list of sentences
        sents = self.rm_none_flatten(sents)
        postprocessed_sents = []
        for sent in sents:
            sent = Text(sent).apply(*self.lang.SubSymbolsRules.All)
            post_process_sent = self.post_process_segments(sent)
            if post_process_sent and isinstance(post_process_sent, str):
                postprocessed_sents.append(post_process_sent)
            elif isinstance(post_process_sent, list):
                for pps in post_process_sent:
                    postprocessed_sents.append(pps)
        postprocessed_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule)
                               for ns in postprocessed_sents]
        return postprocessed_sents

    def post_process_segments(self, txt):
        if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):
            return txt

        # below condition present in pragmatic segmenter
        # dont know significance of it yet.
        # if self.consecutive_underscore(txt) or len(txt) < 2:
        #     return txt

        if re.match(r'\t', txt):
            pass

        # TODO:
        # Decide on keeping or removing Standard.ExtraWhiteSpaceRule
        # removed to retain original text spans
        # txt = Text(txt).apply(*ReinsertEllipsisRules.All,
        #                       Standard.ExtraWhiteSpaceRule)
        txt = Text(txt).apply(*self.lang.ReinsertEllipsisRules.All)
        if re.search(self.lang.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt):
            txt = re.split(
                self.lang.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt)
            return txt
        else:
            txt = txt.replace('\n', '')
            return txt.strip()

    def check_for_parens_between_quotes(self):
        def paren_replace(match):
            match = match.group()
            sub1 = re.sub(r'\s(?=\()', '\r', match)
            sub2 = re.sub(r'(?<=\))\s', '\r', sub1)
            return sub2
        self.text = re.sub(self.lang.PARENS_BETWEEN_DOUBLE_QUOTES_REGEX,
                      paren_replace, self.text)

    def replace_continuous_punctuation(self):
        def continuous_puncs_replace(match):
            match = match.group()
            sub1 = re.sub(re.escape('!'), '&ᓴ&', match)
            sub2 = re.sub(re.escape('?'), '&ᓷ&', sub1)
            return sub2
        self.text = re.sub(self.lang.CONTINUOUS_PUNCTUATION_REGEX,
                        continuous_puncs_replace, self.text)

    def replace_periods_before_numeric_references(self):
         # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
        self.text = re.sub(self.lang.NUMBERED_REFERENCE_REGEX,
                      r"∯\2\r\7", self.text)

    def consecutive_underscore(self, txt):
        # Rubular: http://rubular.com/r/fTF2Ff3WBL
        txt = re.sub(r'_{3,}', '', txt)
        return len(txt) == 0

    def check_for_punctuation(self, txt):
        if any(p in txt for p in self.lang.Punctuations):
            sents = self.process_text(txt)
            return sents
        else:
            # NOTE: next steps of check_for_punctuation will unpack this list
            return [txt]

    def process_text(self, txt):
        """Split a chunk into sentence candidates.

        Parameters
        ----------
        txt : str
            Non-empty chunk of text (its last character is inspected).

        Returns
        -------
        list
            Whatever ``sentence_boundary_punctuation`` returns for the
            transformed chunk.
        """
        lang = self.lang
        # A chunk not ending in one of the language's punctuations gets 'ȸ'
        # appended (presumably an end marker that SENTENCE_BOUNDARY_REGEX
        # recognises - confirm against the language module).
        segment = txt if txt[-1] in lang.Punctuations else txt + 'ȸ'
        segment = ExclamationWords.apply_rules(segment)
        segment = self.between_punctuation(segment)
        # handle text having only doublepunctuations
        double_punct = lang.DoublePunctuationRules
        if re.match(double_punct.DoublePunctuation, segment) is None:
            segment = Text(segment).apply(*double_punct.All)
        segment = Text(segment).apply(lang.QuestionMarkInQuotationRule,
                                      *lang.ExclamationPointRules.All)
        segment = ListItemReplacer(segment).replace_parens()
        return self.sentence_boundary_punctuation(segment)

    def replace_numbers(self):
        """Apply every rule in ``self.lang.Numbers.All`` to ``self.text``.

        ``self.text`` is replaced by the result.
        """
        wrapped = Text(self.text)
        self.text = wrapped.apply(*self.lang.Numbers.All)

    def abbreviations_replacer(self):
        if hasattr(self.lang, "AbbreviationReplacer"):
            return self.lang.AbbreviationReplacer(self.text, self.lang)
        else:
            return AbbreviationReplacer(self.text, self.lang)

    def replace_abbreviations(self):
        self.text = self.abbreviations_replacer().replace()

    def between_punctuation_processor(self, txt):
        if hasattr(self.lang, "BetweenPunctuation"):
            return self.lang.BetweenPunctuation(txt)
        else:
            return BetweenPunctuation(txt)

    def between_punctuation(self, txt):
        txt = self.between_punctuation_processor(txt).replace()
        return txt

    def sentence_boundary_punctuation(self, txt):
        if hasattr(self.lang, 'ReplaceColonBetweenNumbersRule'):
            txt = Text(txt).apply(
                self.lang.ReplaceColonBetweenNumbersRule)
        if hasattr(self.lang, 'ReplaceNonSentenceBoundaryCommaRule'):
            txt = Text(txt).apply(
                self.lang.ReplaceNonSentenceBoundaryCommaRule)
        # retain exclamation mark if it is an ending character of a given text
        txt = re.sub(r'&ᓴ&$', '!', txt)
        txt = [
            m.group() for m in re.finditer(self.lang.SENTENCE_BOUNDARY_REGEX, txt)
            ]
        return txt
first commit 2024-05-03 04:18:51 +03:00			`# -- coding: utf-8 --`
			`import re`
			`from pysbd.utils import Text`
			`from pysbd.lists_item_replacer import ListItemReplacer`
			`from pysbd.exclamation_words import ExclamationWords`
			`from pysbd.between_punctuation import BetweenPunctuation`
			`from pysbd.abbreviation_replacer import AbbreviationReplacer`

			`class Processor(object):`

			`def __init__(self, text, lang, char_span=False):`
			`"""Process a text - do pre and post processing - to get proper sentences`

			`Parameters`
			`----------`
			`text : str`
			`Original text`
			`language : object`
			`Language module`
			`char_span : bool, optional`
			`Get start & end character offsets of each sentences`
			`within original text, by default False`
			`"""`
			`self.text = text`
			`self.lang = lang`
			`self.char_span = char_span`

			`def process(self):`
			`if not self.text:`
			`return self.text`
			`self.text = self.text.replace('\n', '\r')`
			`li = ListItemReplacer(self.text)`
			`self.text = li.add_line_break()`
			`self.replace_abbreviations()`
			`self.replace_numbers()`
			`self.replace_continuous_punctuation()`
			`self.replace_periods_before_numeric_references()`
			`self.text = Text(self.text).apply(`
			`self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,`
			`self.lang.GeoLocationRule, self.lang.FileFormatRule)`
			`postprocessed_sents = self.split_into_segments()`
			`return postprocessed_sents`

			`def rm_none_flatten(self, sents):`
			`"""Remove None values and unpack list of list sents`

			`Parameters`
			`----------`
			`sents : list`
			`list of sentences`

			`Returns`
			`-------`
			`list`
			`unpacked and None removed list of sents`
			`"""`
			`sents = list(filter(None, sents))`
			`if not any(isinstance(s, list) for s in sents):`
			`return sents`
			`new_sents = []`
			`for sent in sents:`
			`if isinstance(sent, list):`
			`for s in sent:`
			`new_sents.append(s)`
			`else:`
			`new_sents.append(sent)`
			`return new_sents`

			`def split_into_segments(self):`
			`self.check_for_parens_between_quotes()`
			`sents = self.text.split('\r')`
			`# remove empty and none values`
			`sents = self.rm_none_flatten(sents)`
			`sents = [`
			`Text(s).apply(self.lang.SingleNewLineRule, *self.lang.EllipsisRules.All)`
			`for s in sents`
			`]`
			`sents = [self.check_for_punctuation(s) for s in sents]`
			`# flatten list of list of sentences`
			`sents = self.rm_none_flatten(sents)`
			`postprocessed_sents = []`
			`for sent in sents:`
			`sent = Text(sent).apply(*self.lang.SubSymbolsRules.All)`
			`post_process_sent = self.post_process_segments(sent)`
			`if post_process_sent and isinstance(post_process_sent, str):`
			`postprocessed_sents.append(post_process_sent)`
			`elif isinstance(post_process_sent, list):`
			`for pps in post_process_sent:`
			`postprocessed_sents.append(pps)`
			`postprocessed_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule)`
			`for ns in postprocessed_sents]`
			`return postprocessed_sents`

			`def post_process_segments(self, txt):`
			`if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):`
			`return txt`

			`# below condition present in pragmatic segmenter`
			`# dont know significance of it yet.`
			`# if self.consecutive_underscore(txt) or len(txt) < 2:`
			`# return txt`

			`if re.match(r'\t', txt):`
			`pass`

			`# TODO:`
			`# Decide on keeping or removing Standard.ExtraWhiteSpaceRule`
			`# removed to retain original text spans`
			`# txt = Text(txt).apply(*ReinsertEllipsisRules.All,`
			`# Standard.ExtraWhiteSpaceRule)`
			`txt = Text(txt).apply(*self.lang.ReinsertEllipsisRules.All)`
			`if re.search(self.lang.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt):`
			`txt = re.split(`
			`self.lang.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt)`
			`return txt`
			`else:`
			`txt = txt.replace('\n', '')`
			`return txt.strip()`

			`def check_for_parens_between_quotes(self):`
			`def paren_replace(match):`
			`match = match.group()`
			`sub1 = re.sub(r'\s(?=\()', '\r', match)`
			`sub2 = re.sub(r'(?<=\))\s', '\r', sub1)`
			`return sub2`
			`self.text = re.sub(self.lang.PARENS_BETWEEN_DOUBLE_QUOTES_REGEX,`
			`paren_replace, self.text)`

			`def replace_continuous_punctuation(self):`
			`def continuous_puncs_replace(match):`
			`match = match.group()`
			`sub1 = re.sub(re.escape('!'), '&ᓴ&', match)`
			`sub2 = re.sub(re.escape('?'), '&ᓷ&', sub1)`
			`return sub2`
			`self.text = re.sub(self.lang.CONTINUOUS_PUNCTUATION_REGEX,`
			`continuous_puncs_replace, self.text)`

			`def replace_periods_before_numeric_references(self):`
			`# https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703`
			`self.text = re.sub(self.lang.NUMBERED_REFERENCE_REGEX,`
			`r"∯\2\r\7", self.text)`

			`def consecutive_underscore(self, txt):`
			`# Rubular: http://rubular.com/r/fTF2Ff3WBL`
			`txt = re.sub(r'_{3,}', '', txt)`
			`return len(txt) == 0`

			`def check_for_punctuation(self, txt):`
			`if any(p in txt for p in self.lang.Punctuations):`
			`sents = self.process_text(txt)`
			`return sents`
			`else:`
			`# NOTE: next steps of check_for_punctuation will unpack this list`
			`return [txt]`

			`def process_text(self, txt):`
			`if txt[-1] not in self.lang.Punctuations:`
			`txt += 'ȸ'`
			`txt = ExclamationWords.apply_rules(txt)`
			`txt = self.between_punctuation(txt)`
			`# handle text having only doublepunctuations`
			`if not re.match(self.lang.DoublePunctuationRules.DoublePunctuation, txt):`
			`txt = Text(txt).apply(*self.lang.DoublePunctuationRules.All)`
			`txt = Text(txt).apply(self.lang.QuestionMarkInQuotationRule,`
			`*self.lang.ExclamationPointRules.All)`
			`txt = ListItemReplacer(txt).replace_parens()`
			`txt = self.sentence_boundary_punctuation(txt)`
			`return txt`

			`def replace_numbers(self):`
			`self.text = Text(self.text).apply(*self.lang.Numbers.All)`

			`def abbreviations_replacer(self):`
			`if hasattr(self.lang, "AbbreviationReplacer"):`
			`return self.lang.AbbreviationReplacer(self.text, self.lang)`
			`else:`
			`return AbbreviationReplacer(self.text, self.lang)`

			`def replace_abbreviations(self):`
			`self.text = self.abbreviations_replacer().replace()`

			`def between_punctuation_processor(self, txt):`
			`if hasattr(self.lang, "BetweenPunctuation"):`
			`return self.lang.BetweenPunctuation(txt)`
			`else:`
			`return BetweenPunctuation(txt)`

			`def between_punctuation(self, txt):`
			`txt = self.between_punctuation_processor(txt).replace()`
			`return txt`

			`def sentence_boundary_punctuation(self, txt):`
			`if hasattr(self.lang, 'ReplaceColonBetweenNumbersRule'):`
			`txt = Text(txt).apply(`
			`self.lang.ReplaceColonBetweenNumbersRule)`
			`if hasattr(self.lang, 'ReplaceNonSentenceBoundaryCommaRule'):`
			`txt = Text(txt).apply(`
			`self.lang.ReplaceNonSentenceBoundaryCommaRule)`
			`# retain exclamation mark if it is an ending character of a given text`
			`txt = re.sub(r'&ᓴ&$', '!', txt)`
			`txt = [`
			`m.group() for m in re.finditer(self.lang.SENTENCE_BOUNDARY_REGEX, txt)`
			`]`
			`return txt`