ai-content-maker/.venv/Lib/site-packages/pysbd/abbreviation_replacer.py

# -*- coding: utf-8 -*-
import re
from pysbd.utils import Text


def replace_pre_number_abbr(txt, abbr):
    """Protect the period that follows a number-introducing abbreviation.

    The period directly after ``abbr`` is replaced by the placeholder "∯"
    when it is followed by whitespace and a digit, or by whitespace and an
    opening parenthesis. The abbreviation must itself be preceded by
    whitespace or sit at the start of ``txt``.

    Args:
        txt: Text to process.
        abbr: Abbreviation as it appears in the text. Surrounding whitespace
            is stripped and the remainder is matched literally.

    Returns:
        ``txt`` with every matching period replaced by "∯".
    """
    # Escape the abbreviation so regex metacharacters inside it (for example
    # the dot in "a.b") match literally instead of acting as pattern syntax.
    pattern = r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(
        abbr=re.escape(abbr.strip())
    )
    # Prepend a space so an abbreviation at the very start of the string is
    # also preceded by whitespace; the extra space is dropped again afterwards.
    return re.sub(pattern, "∯", " " + txt)[1:]


def replace_prepositive_abbr(txt, abbr):
    """Protect the period that follows a prepositive abbreviation.

    The period directly after ``abbr`` is replaced by the placeholder "∯"
    when it is followed by a whitespace character, or by a colon and at
    least one digit. The abbreviation must itself be preceded by whitespace
    or sit at the start of ``txt``.

    Args:
        txt: Text to process.
        abbr: Abbreviation as it appears in the text. Surrounding whitespace
            is stripped and the remainder is matched literally.

    Returns:
        ``txt`` with every matching period replaced by "∯".
    """
    # Escape the abbreviation so regex metacharacters inside it (for example
    # the dot in "a.b") match literally instead of acting as pattern syntax.
    pattern = r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(
        abbr=re.escape(abbr.strip())
    )
    # Prepend a space so an abbreviation at the very start of the string is
    # also preceded by whitespace; the extra space is dropped again afterwards.
    return re.sub(pattern, "∯", " " + txt)[1:]


class AbbreviationReplacer(object):
    """Replace periods that belong to abbreviations with the placeholder "∯".

    The language object passed as ``lang`` supplies the rule sets and the
    abbreviation lists used by the individual steps.

    NOTE: ``replace_abbreviation_as_sentence_boundary`` reads
    ``self.SENTENCE_STARTERS``, which this class does not define. It has to be
    provided elsewhere (presumably by a language-specific subclass - confirm
    against the language modules).
    """

    def __init__(self, text, lang):
        """Store the text to process and the language definition.

        Args:
            text: Text whose abbreviation periods should be protected.
            lang: Object exposing the rule and abbreviation attributes read
                by the methods of this class.
        """
        self.text = text
        self.lang = lang

    def replace(self):
        """Run every abbreviation step in order and return the result.

        Returns:
            The processed text, which is also stored in ``self.text``.
        """
        self.text = Text(self.text).apply(
            self.lang.PossessiveAbbreviationRule,
            self.lang.KommanditgesellschaftRule,
            *self.lang.SingleLetterAbbreviationRules.All
        )
        # Handle each line separately; keepends=True preserves the original
        # line terminators so joining the pieces restores the layout.
        self.text = "".join(
            self.search_for_abbreviations_in_string(line)
            for line in self.text.splitlines(True)
        )
        self.replace_multi_period_abbreviations()
        self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
        self.text = self.replace_abbreviation_as_sentence_boundary()
        return self.text

    def replace_abbreviation_as_sentence_boundary(self):
        """Turn "∯" back into "." after selected abbreviations.

        The placeholder is restored when it follows one of the hard-coded
        abbreviations in the pattern below and is followed by one of
        ``self.SENTENCE_STARTERS`` surrounded by whitespace.

        Returns:
            The processed text, which is also stored in ``self.text``.
        """
        sent_starters = "|".join(
            r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS
        )
        regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters)
        self.text = re.sub(regex, '\\1.', self.text)
        return self.text

    def replace_multi_period_abbreviations(self):
        """Replace every "." inside multi-period abbreviations with "∯".

        Matches of ``self.lang.MULTI_PERIOD_ABBREVIATION_REGEX`` are located
        case-insensitively; ``self.text`` is updated in place.
        """
        def mpa_replace(match):
            # Plain string replacement: only literal dots are swapped.
            return match.group().replace(".", "∯")

        self.text = re.sub(
            self.lang.MULTI_PERIOD_ABBREVIATION_REGEX,
            mpa_replace,
            self.text,
            flags=re.IGNORECASE
        )

    def replace_period_of_abbr(self, txt, abbr):
        """Protect the period after a general abbreviation.

        The period directly after ``abbr`` is replaced by "∯" when it is
        followed by one of ``. : - ? ,`` or by whitespace plus a lowercase
        ASCII letter, "I" followed by whitespace, "I'm", "I'll", a digit or
        an opening parenthesis.

        Args:
            txt: Text to process.
            abbr: Abbreviation as it appears in the text. Surrounding
                whitespace is stripped and the remainder is matched literally.

        Returns:
            ``txt`` with every matching period replaced by "∯".
        """
        pattern = r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format(
            abbr=re.escape(abbr.strip())
        )
        # Prepend a space so an abbreviation at the very start of the string
        # is also preceded by whitespace; drop that space again afterwards.
        return re.sub(pattern, "∯", " " + txt)[1:]

    def search_for_abbreviations_in_string(self, text):
        """Protect the periods of all known abbreviations found in ``text``.

        Args:
            text: A piece of text (``replace`` passes one line at a time).

        Returns:
            ``text`` with abbreviation periods replaced by "∯".
        """
        lowered = text.lower()
        for abbr in self.lang.Abbreviation.ABBREVIATIONS:
            stripped = abbr.strip()
            # Cheap substring test before running any regex.
            if stripped not in lowered:
                continue
            # Escape the abbreviation so a dot inside it (e.g. "a.m") matches
            # only a literal dot and does not pick up unrelated words.
            escaped = re.escape(stripped)
            abbrev_match = re.findall(
                r"(?:^|\s|\r|\n){}".format(escaped), text, flags=re.IGNORECASE
            )
            if not abbrev_match:
                continue
            # NOTE(review): the braces below are literal regex characters, so
            # this only matches text preceded by "{<abbr>} " and the result
            # is normally empty. Left unchanged on purpose: altering it would
            # change which periods get protected.
            next_word_start = r"(?<={" + escaped + "} ).{1}"
            char_array = re.findall(next_word_start, text)
            for ind, match in enumerate(abbrev_match):
                text = self.scan_for_replacements(
                    text, match, ind, char_array
                )
        return text

    def scan_for_replacements(self, txt, am, ind, char_array):
        """Apply the replacement rule that fits the abbreviation match ``am``.

        Args:
            txt: Text to process.
            am: Abbreviation match as found in the text (may carry leading
                whitespace).
            ind: Index of the match, used to look up ``char_array``.
            char_array: Characters collected by the caller; a missing entry
                is treated as an empty string.

        Returns:
            ``txt`` after the selected replacement, or ``txt`` unchanged when
            the looked-up character is uppercase and the abbreviation is not
            prepositive.
        """
        try:
            char = char_array[ind]
        except IndexError:
            char = ""
        key = am.strip().lower()
        # Prepositive abbreviations are always handled, whatever follows.
        if key in self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS:
            return replace_prepositive_abbr(txt, am)
        # An uppercase follower leaves the period untouched.
        if str(char).isupper():
            return txt
        if key in self.lang.Abbreviation.NUMBER_ABBREVIATIONS:
            return replace_pre_number_abbr(txt, am)
        return self.replace_period_of_abbr(txt, am)
first commit 2024-05-03 04:18:51 +03:00			`# -- coding: utf-8 --`
			`import re`
			`from pysbd.utils import Text`


			`def replace_pre_number_abbr(txt, abbr):`
			`# prepend a space to avoid needing another regex for start of string`
			`txt = " " + txt`
			`txt = re.sub(r"(?<=\s{abbr})\.(?=(\s\d\|\s+\())".format(abbr=abbr.strip()), "∯", txt)`
			`# remove the prepended space`
			`txt = txt[1:]`
			`return txt`


			`def replace_prepositive_abbr(txt, abbr):`
			`# prepend a space to avoid needing another regex for start of string`
			`txt = " " + txt`
			`txt = re.sub(r"(?<=\s{abbr})\.(?=(\s\|:\d+))".format(abbr=abbr.strip()), "∯", txt)`
			`# remove the prepended space`
			`txt = txt[1:]`
			`return txt`


			`class AbbreviationReplacer(object):`
			`def __init__(self, text, lang):`
			`self.text = text`
			`self.lang = lang`

			`def replace(self):`
			`self.text = Text(self.text).apply(`
			`self.lang.PossessiveAbbreviationRule,`
			`self.lang.KommanditgesellschaftRule,`
			`*self.lang.SingleLetterAbbreviationRules.All`
			`)`
			`abbr_handled_text = ""`
			`for line in self.text.splitlines(True):`
			`abbr_handled_text += self.search_for_abbreviations_in_string(line)`
			`self.text = abbr_handled_text`
			`self.replace_multi_period_abbreviations()`
			`self.text = Text(self.text).apply(*self.lang.AmPmRules.All)`
			`self.text = self.replace_abbreviation_as_sentence_boundary()`
			`return self.text`

			`def replace_abbreviation_as_sentence_boundary(self):`
			`sent_starters = "\|".join((r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS))`
			`regex = r"(U∯S\|U\.S\|U∯K\|E∯U\|E\.U\|U∯S∯A\|U\.S\.A\|I\|i.v\|I.V)∯({})".format(sent_starters)`
			`self.text = re.sub(regex, '\\1.', self.text)`
			`return self.text`

			`def replace_multi_period_abbreviations(self):`
			`def mpa_replace(match):`
			`match = match.group()`
			`match = re.sub(re.escape(r"."), "∯", match)`
			`return match`

			`self.text = re.sub(`
			`self.lang.MULTI_PERIOD_ABBREVIATION_REGEX,`
			`mpa_replace,`
			`self.text,`
			`flags=re.IGNORECASE`
			`)`

			`def replace_period_of_abbr(self, txt, abbr):`
			`# prepend a space to avoid needing another regex for start of string`
			`txt = " " + txt`
			`txt = re.sub(`
			`r"(?<=\s{abbr})\.(?=((\.\|\:\|-\|\?\|,)\|(\s([a-z]\|I\s\|I'm\|I'll\|\d\|\())))".format(`
			`abbr=re.escape(abbr.strip())`
			`),`
			`"∯",`
			`txt,`
			`)`
			`# remove the prepended space`
			`txt = txt[1:]`
			`return txt`


			`def search_for_abbreviations_in_string(self, text):`
			`lowered = text.lower()`
			`for abbr in self.lang.Abbreviation.ABBREVIATIONS:`
			`stripped = abbr.strip()`
			`if stripped not in lowered:`
			`continue`
			`abbrev_match = re.findall(`
			`r"(?:^\|\s\|\r\|\n){}".format(stripped), text, flags=re.IGNORECASE`
			`)`
			`if not abbrev_match:`
			`continue`
			`next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"`
			`char_array = re.findall(next_word_start, text)`
			`for ind, match in enumerate(abbrev_match):`
			`text = self.scan_for_replacements(`
			`text, match, ind, char_array`
			`)`
			`return text`

			`def scan_for_replacements(self, txt, am, ind, char_array):`
			`try:`
			`char = char_array[ind]`
			`except IndexError:`
			`char = ""`
			`prepositive = self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS`
			`number_abbr = self.lang.Abbreviation.NUMBER_ABBREVIATIONS`
			`upper = str(char).isupper()`
			`if not upper or am.strip().lower() in prepositive:`
			`if am.strip().lower() in prepositive:`
			`txt = replace_prepositive_abbr(txt, am)`
			`elif am.strip().lower() in number_abbr:`
			`txt = replace_pre_number_abbr(txt, am)`
			`else:`
			`txt = self.replace_period_of_abbr(txt, am)`
			`return txt`