ai-content-maker/.venv/Lib/site-packages/pysbd/cleaner.py

# -*- coding: utf-8 -*-
import re
from pysbd.utils import Text
from pysbd.clean.rules import PDF, HTML, CleanRules as cr


class Cleaner(object):
    """Run a fixed sequence of clean-up passes over a piece of text.

    Every pass rewrites ``self.text`` in place, mostly by applying rule
    objects from ``pysbd.clean.rules`` through ``Text(...).apply``.

    Args:
        text: the text to clean. A falsy value (``None`` or ``''``) is
            returned unchanged by :meth:`clean`.
        lang: language argument; stored on the instance but not read by
            any method of this class.
        doc_type: optional document type. The value ``'pdf'`` selects the
            PDF line-break rules in :meth:`replace_newlines`.
    """

    def __init__(self, text, lang, doc_type=None):
        self.text = text
        self.lang = lang
        self.doc_type = doc_type

    def clean(self):
        """Apply every cleaning pass in order and return the result.

        Returns:
            The cleaned text, or the original value untouched when it is
            falsy.
        """
        if not self.text:
            return self.text
        # The passes are order-dependent: each one sees the output of the
        # previous one, so the sequence below must not be rearranged.
        self.remove_all_newlines()
        self.replace_double_newlines()
        self.replace_newlines()
        self.replace_escaped_newlines()
        self.text = Text(self.text).apply(*HTML.All)
        self.replace_punctuation_in_brackets()
        self.text = Text(self.text).apply(cr.InlineFormattingRule)
        self.clean_quotations()
        self.clean_table_of_contents()
        self.check_for_no_space_in_between_sentences()
        self.clean_consecutive_characters()
        return self.text

    def remove_all_newlines(self):
        """Run the mid-sentence pass, then the mid-word newline pass."""
        self.remove_newline_in_middle_of_sentence()
        self.remove_newline_in_middle_of_word()

    def remove_newline_in_middle_of_sentence(self):
        """Delete ``cr.NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX`` matches.

        The text is processed one period-free run at a time, so the
        deletion regex is never evaluated across a ``.`` character.
        """
        def strip_newlines(match):
            return re.sub(cr.NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '',
                          match.group())
        self.text = re.sub(r'(?:[^\.])*', strip_newlines, self.text)

    def remove_newline_in_middle_of_word(self):
        """Apply ``cr.NewLineInMiddleOfWordRule`` to the text."""
        self.text = Text(self.text).apply(cr.NewLineInMiddleOfWordRule)

    def replace_double_newlines(self):
        """Apply the two double-newline rules, with-space variant first."""
        self.text = Text(self.text).apply(cr.DoubleNewLineWithSpaceRule,
                                          cr.DoubleNewLineRule)

    def remove_pdf_line_breaks(self):
        """Apply the bullet rule followed by the PDF line-break rules."""
        self.text = Text(self.text).apply(
            cr.NewLineFollowedByBulletRule,
            PDF.NewLineInMiddleOfSentenceRule,
            PDF.NewLineInMiddleOfSentenceNoSpacesRule)

    def replace_newlines(self):
        """Handle remaining newlines according to ``self.doc_type``.

        ``'pdf'`` documents go through :meth:`remove_pdf_line_breaks`;
        every other document type gets the generic newline rules.
        """
        if self.doc_type == 'pdf':
            self.remove_pdf_line_breaks()
        else:
            self.text = Text(self.text).apply(
                cr.NewLineFollowedByPeriodRule,
                cr.ReplaceNewlineWithCarriageReturnRule)

    def replace_escaped_newlines(self):
        """Apply the escaped newline / carriage-return rules."""
        self.text = Text(self.text).apply(
            cr.EscapedNewLineRule,
            cr.EscapedCarriageReturnRule,
            cr.TypoEscapedNewLineRule,
            cr.TypoEscapedCarriageReturnRule)

    def replace_punctuation_in_brackets(self):
        """Replace each ``?`` inside ``[...]`` with the ``&ᓷ&`` marker.

        Only spans that start with ``[`` and run to the next ``]`` are
        touched; question marks elsewhere are left as they are.
        """
        def mask_question_marks(match):
            # Literal substitution; a span without '?' comes back as-is.
            return match.group().replace('?', '&ᓷ&')
        self.text = re.sub(r'\[(?:[^\]])*\]', mask_question_marks,
                           self.text)

    def clean_quotations(self):
        """Turn backticks into apostrophes, then apply the quotation rules."""
        # method added explicitly
        # pragmatic-segmenter applies this method
        # at different location
        self.text = self.text.replace('`', "'")
        self.text = Text(self.text).apply(cr.QuotationsFirstRule,
                                          cr.QuotationsSecondRule)

    def clean_table_of_contents(self):
        """Apply the table-of-contents rule and the consecutive-char rules."""
        self.text = Text(self.text).apply(cr.TableOfContentsRule,
                                          cr.ConsecutivePeriodsRule,
                                          cr.ConsecutiveForwardSlashRule)

    def search_for_connected_sentences(self, word, txt, regex, rule):
        """Rewrite ``word`` inside ``txt`` when it matches ``regex``.

        Args:
            word: a single token taken from the text.
            txt: the full text in which ``word`` is replaced.
            regex: pattern that ``word`` must match to be rewritten.
            rule: rule applied to ``word`` to build its replacement.

        Returns:
            ``txt`` with every occurrence of ``word`` replaced by the
            rule's output, or ``txt`` unchanged when ``word`` does not
            match ``regex`` or contains one of
            ``cr.URL_EMAIL_KEYWORDS``.
        """
        if not re.search(regex, word):
            return txt
        if any(k in word for k in cr.URL_EMAIL_KEYWORDS):
            return txt
        new_word = Text(word).apply(rule)
        # Plain string replacement: passing new_word to re.sub as the
        # replacement would treat any backslash in it as a regex template
        # escape and raise re.error (or corrupt the text).
        return txt.replace(word, new_word)

    def check_for_no_space_in_between_sentences(self):
        """Run both connected-sentence checks on each space-separated word.

        The word list is taken once, before any rewriting; each check
        then operates on the progressively updated ``self.text``.
        """
        checks = (
            (cr.NO_SPACE_BETWEEN_SENTENCES_REGEX,
             cr.NoSpaceBetweenSentencesRule),
            (cr.NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX,
             cr.NoSpaceBetweenSentencesDigitRule),
        )
        for word in self.text.split(' '):
            for regex, rule in checks:
                self.text = self.search_for_connected_sentences(
                    word, self.text, regex, rule)

    def clean_consecutive_characters(self):
        """Apply the consecutive-periods and consecutive-slash rules."""
        self.text = Text(self.text).apply(cr.ConsecutivePeriodsRule,
                                          cr.ConsecutiveForwardSlashRule)
first commit 2024-05-03 04:18:51 +03:00			`# -- coding: utf-8 --`
			`import re`
			`from pysbd.utils import Text`
			`from pysbd.clean.rules import PDF, HTML, CleanRules as cr`


			`class Cleaner(object):`

			`def __init__(self, text, lang, doc_type=None):`
			`self.text = text`
			`self.lang = lang`
			`self.doc_type = doc_type`

			`def clean(self):`
			`if not self.text:`
			`return self.text`
			`self.remove_all_newlines()`
			`self.replace_double_newlines()`
			`self.replace_newlines()`
			`self.replace_escaped_newlines()`
			`self.text = Text(self.text).apply(*HTML.All)`
			`self.replace_punctuation_in_brackets()`
			`self.text = Text(self.text).apply(cr.InlineFormattingRule)`
			`self.clean_quotations()`
			`self.clean_table_of_contents()`
			`self.check_for_no_space_in_between_sentences()`
			`self.clean_consecutive_characters()`
			`return self.text`

			`def remove_all_newlines(self):`
			`self.remove_newline_in_middle_of_sentence()`
			`self.remove_newline_in_middle_of_word()`

			`def remove_newline_in_middle_of_sentence(self):`
			`def replace_w_blank(match):`
			`match = match.group()`
			`sub = re.sub(cr.NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '', match)`
			`return sub`
			`self.text = re.sub(r'(?:[^\.])*', replace_w_blank, self.text)`

			`def remove_newline_in_middle_of_word(self):`
			`self.text = Text(self.text).apply(cr.NewLineInMiddleOfWordRule)`

			`def replace_double_newlines(self):`
			`self.text = Text(self.text).apply(cr.DoubleNewLineWithSpaceRule,`
			`cr.DoubleNewLineRule)`

			`def remove_pdf_line_breaks(self):`
			`self.text = Text(`
			`self.text).apply(cr.NewLineFollowedByBulletRule,`
			`PDF.NewLineInMiddleOfSentenceRule,`
			`PDF.NewLineInMiddleOfSentenceNoSpacesRule)`

			`def replace_newlines(self):`
			`if self.doc_type == 'pdf':`
			`self.remove_pdf_line_breaks()`
			`else:`
			`self.text = Text(`
			`self.text).apply(cr.NewLineFollowedByPeriodRule,`
			`cr.ReplaceNewlineWithCarriageReturnRule)`

			`def replace_escaped_newlines(self):`
			`self.text = Text(`
			`self.text).apply(cr.EscapedNewLineRule,`
			`cr.EscapedCarriageReturnRule,`
			`cr.TypoEscapedNewLineRule,`
			`cr.TypoEscapedCarriageReturnRule)`

			`def replace_punctuation_in_brackets(self):`
			`def replace_punct(match):`
			`match = match.group()`
			`if '?' in match:`
			`sub = re.sub(re.escape('?'), '&ᓷ&', match)`
			`return sub`
			`return match`
			`self.text = re.sub(r'\[(?:[^\]])*\]', replace_punct, self.text)`

			`def clean_quotations(self):`
			`# method added explicitly`
			`# pragmatic-segmenter applies this method`
			`# at different location`
			self.text = re.sub('`', "'", self.text)
			`self.text = Text(self.text).apply(`
			`cr.QuotationsFirstRule,`
			`cr.QuotationsSecondRule)`

			`def clean_table_of_contents(self):`
			`self.text = Text(self.text).apply(`
			`cr.TableOfContentsRule,`
			`cr.ConsecutivePeriodsRule,`
			`cr.ConsecutiveForwardSlashRule)`

			`def search_for_connected_sentences(self, word, txt, regex, rule):`
			`if not re.search(regex, word):`
			`return txt`
			`if any(k in word for k in cr.URL_EMAIL_KEYWORDS):`
			`return txt`
			`new_word = Text(word).apply(rule)`
			`txt = re.sub(re.escape(word), new_word, txt)`
			`return txt`

			`def check_for_no_space_in_between_sentences(self):`
			`words = self.text.split(' ')`
			`for word in words:`
			`self.text = self.search_for_connected_sentences(word, self.text, cr.NO_SPACE_BETWEEN_SENTENCES_REGEX, cr.NoSpaceBetweenSentencesRule)`
			`self.text = self.search_for_connected_sentences(word, self.text, cr.NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, cr.NoSpaceBetweenSentencesDigitRule)`

			`def clean_consecutive_characters(self):`
			`self.text = Text(self.text).apply(`
			`cr.ConsecutivePeriodsRule,`
			`cr.ConsecutiveForwardSlashRule)`