# -*- coding: utf-8 -*-
"""Mask the periods that follow known abbreviations.

Periods that belong to an abbreviation are swapped for the placeholder
string "∯". The placeholder is restored to "." by
``replace_abbreviation_as_sentence_boundary`` for a small set of tokens
when a sentence-starter word follows.

NOTE(review): the placeholder "∯" looks like a mis-decoded "∯" (U+222F).
It is kept exactly as found here; confirm it matches the marker used by
the rest of the package before changing it.
"""
import re

from pysbd.utils import Text

# Look-ahead bodies: what must follow the period for it to be masked.
_PRE_NUMBER_LOOKAHEAD = r"(\s\d|\s+\()"
_PREPOSITIVE_LOOKAHEAD = r"(\s|:\d+)"
_GENERIC_LOOKAHEAD = r"((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\()))"


def _mask_abbr_period(txt, abbr, lookahead):
    """Replace the "." directly after ``abbr`` with "∯".

    The period is only replaced when ``abbr`` is preceded by whitespace (or
    sits at the start of ``txt``) and the regex fragment ``lookahead``
    matches right after the period.

    :param txt: text to process.
    :param abbr: abbreviation as found in the text; surrounding whitespace
        is stripped and the rest is matched literally.
    :param lookahead: regex fragment placed inside a ``(?=...)`` look-ahead.
    :return: ``txt`` with the matching periods replaced.
    """
    pattern = r"(?<=\s{abbr})\.(?={ahead})".format(
        # Escaping keeps "." and other metacharacters in the abbreviation
        # literal and keeps the look-behind fixed-width.
        abbr=re.escape(abbr.strip()),
        ahead=lookahead,
    )
    # Prepend a space so the look-behind's \s also covers an abbreviation at
    # the very start of the string, then drop that space again.
    padded = re.sub(pattern, "∯", " " + txt)
    return padded[1:]


def replace_pre_number_abbr(txt, abbr):
    """Mask the period after ``abbr`` when a number or "(" follows.

    The period must be followed by whitespace and a digit, or by whitespace
    and an opening parenthesis.

    :param txt: text to process.
    :param abbr: abbreviation whose trailing period should be masked.
    :return: ``txt`` with the matching periods replaced by "∯".
    """
    return _mask_abbr_period(txt, abbr, _PRE_NUMBER_LOOKAHEAD)


def replace_prepositive_abbr(txt, abbr):
    """Mask the period after ``abbr`` when whitespace or ":<digits>" follows.

    :param txt: text to process.
    :param abbr: abbreviation whose trailing period should be masked.
    :return: ``txt`` with the matching periods replaced by "∯".
    """
    return _mask_abbr_period(txt, abbr, _PREPOSITIVE_LOOKAHEAD)


class AbbreviationReplacer(object):
    """Replace abbreviation periods in ``text`` using the data on ``lang``.

    ``lang`` must provide the attributes read below: the rule objects used
    in ``replace``, ``MULTI_PERIOD_ABBREVIATION_REGEX`` and the
    ``Abbreviation`` lists. ``SENTENCE_STARTERS`` is read from ``self`` but
    is not defined in this class; presumably a subclass supplies it -
    verify against the language modules.
    """

    def __init__(self, text, lang):
        self.text = text
        self.lang = lang

    def replace(self):
        """Run the full abbreviation pass and return the processed text.

        Steps, in order: apply the possessive / Kommanditgesellschaft /
        single-letter rules, mask abbreviation periods line by line, mask
        multi-period abbreviations, apply the AM/PM rules, and finally
        restore periods that precede a sentence starter.

        :return: the processed text (also stored in ``self.text``).
        """
        self.text = Text(self.text).apply(
            self.lang.PossessiveAbbreviationRule,
            self.lang.KommanditgesellschaftRule,
            *self.lang.SingleLetterAbbreviationRules.All
        )
        # splitlines(True) keeps the line endings, so joining the processed
        # lines rebuilds the text without losing any newline characters.
        self.text = "".join(
            self.search_for_abbreviations_in_string(line)
            for line in self.text.splitlines(True)
        )
        self.replace_multi_period_abbreviations()
        self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
        self.text = self.replace_abbreviation_as_sentence_boundary()
        return self.text

    def replace_abbreviation_as_sentence_boundary(self):
        """Turn "∯" back into "." after selected tokens before a starter.

        For the tokens listed in the regex below, a placeholder that is
        followed by whitespace, one of ``self.SENTENCE_STARTERS`` and
        whitespace is replaced by a real period again.

        :return: the updated text (also stored in ``self.text``).
        """
        sent_starters = "|".join(
            r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS
        )
        regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters)
        self.text = re.sub(regex, '\\1.', self.text)
        return self.text

    def replace_multi_period_abbreviations(self):
        """Mask every period inside multi-period abbreviations.

        Each match of ``lang.MULTI_PERIOD_ABBREVIATION_REGEX`` (matched
        case-insensitively) has all of its "." characters replaced by "∯".
        Updates ``self.text`` in place and returns nothing.
        """
        def mpa_replace(match):
            return match.group().replace(".", "∯")

        self.text = re.sub(
            self.lang.MULTI_PERIOD_ABBREVIATION_REGEX,
            mpa_replace,
            self.text,
            flags=re.IGNORECASE
        )

    def replace_period_of_abbr(self, txt, abbr):
        """Mask the period after ``abbr`` in the general (non-special) case.

        The period is replaced when it is followed by one of ``. : - ? ,``
        or by whitespace and then a lowercase ASCII letter, "I" plus
        whitespace, "I'm", "I'll", a digit, or "(".

        :param txt: text to process.
        :param abbr: abbreviation whose trailing period should be masked.
        :return: ``txt`` with the matching periods replaced by "∯".
        """
        return _mask_abbr_period(txt, abbr, _GENERIC_LOOKAHEAD)

    def search_for_abbreviations_in_string(self, text):
        """Mask abbreviation periods for every known abbreviation in ``text``.

        :param text: a piece of text (``replace`` passes one line at a time).
        :return: ``text`` with abbreviation periods replaced where the
            replacement rules apply.
        """
        lowered = text.lower()
        for abbr in self.lang.Abbreviation.ABBREVIATIONS:
            stripped = abbr.strip()
            # Cheap literal pre-check before running any regex.
            if stripped not in lowered:
                continue
            escaped = re.escape(stripped)
            # The abbreviation is matched literally (escaped), in line with
            # the literal pre-check above; otherwise "a.m" would also match
            # e.g. "arm" and mask unrelated periods.
            abbrev_match = re.findall(
                r"(?:^|\s|\r|\n){}".format(escaped), text, flags=re.IGNORECASE
            )
            if not abbrev_match:
                continue
            # NOTE(review): the "{" and "}" in this pattern are literal
            # characters, so it only matches text that contains
            # "{<abbr>} " verbatim. In practice char_array is therefore
            # empty and scan_for_replacements falls back to char == "".
            # Left unchanged because correcting it would alter
            # segmentation results; confirm the intended behaviour.
            next_word_start = r"(?<={" + str(escaped) + "} ).{1}"
            char_array = re.findall(next_word_start, text)
            for ind, match in enumerate(abbrev_match):
                text = self.scan_for_replacements(
                    text, match, ind, char_array
                )
        return text

    def scan_for_replacements(self, txt, am, ind, char_array):
        """Pick and apply the replacement rule for one abbreviation match.

        :param txt: text to process.
        :param am: the matched abbreviation text (may carry leading
            whitespace from the match).
        :param ind: index of this match; used to look up ``char_array``.
        :param char_array: characters following each occurrence; a missing
            entry is treated as the empty string.
        :return: ``txt`` after the chosen replacement, or unchanged when the
            following character is uppercase and the abbreviation is not
            prepositive.
        """
        try:
            char = char_array[ind]
        except IndexError:
            char = ""
        key = am.strip().lower()
        # Prepositive abbreviations are handled regardless of what follows.
        if key in self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS:
            return replace_prepositive_abbr(txt, am)
        # An uppercase follower leaves the period untouched.
        if str(char).isupper():
            return txt
        if key in self.lang.Abbreviation.NUMBER_ABBREVIATIONS:
            return replace_pre_number_abbr(txt, am)
        return self.replace_period_of_abbr(txt, am)