113 lines
3.9 KiB
Python
113 lines
3.9 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
import re
|
||
|
from pysbd.utils import Text
|
||
|
|
||
|
|
||
|
def replace_pre_number_abbr(txt, abbr):
    """Replace the period after a number abbreviation with ``∯``.

    A ``.`` is replaced when it directly follows ``abbr`` (which must itself
    be preceded by whitespace or sit at the start of ``txt``) and is followed
    either by one whitespace character and a digit, or by whitespace and an
    opening parenthesis.

    Args:
        txt: Text to process.
        abbr: Abbreviation as it appears in ``txt``. Surrounding whitespace
            is stripped; the remainder is matched literally and
            case-sensitively.

    Returns:
        ``txt`` with every matching period replaced by ``∯``.
    """
    # Escape the abbreviation so that characters such as "." inside it
    # (e.g. "a.m") are matched literally rather than as regex wildcards,
    # and so the look-behind stays fixed-width.
    pattern = r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(
        abbr=re.escape(abbr.strip())
    )
    # Prepending a space lets the \s look-behind also cover an abbreviation
    # at the very start of the text; the space is sliced off again below.
    return re.sub(pattern, "∯", " " + txt)[1:]
|
||
|
|
||
|
|
||
|
def replace_prepositive_abbr(txt, abbr):
    """Replace the period after a prepositive abbreviation with ``∯``.

    A ``.`` is replaced when it directly follows ``abbr`` (which must itself
    be preceded by whitespace or sit at the start of ``txt``) and is followed
    either by a whitespace character or by ``:`` and at least one digit.

    Args:
        txt: Text to process.
        abbr: Abbreviation as it appears in ``txt``. Surrounding whitespace
            is stripped; the remainder is matched literally and
            case-sensitively.

    Returns:
        ``txt`` with every matching period replaced by ``∯``.
    """
    # Escape the abbreviation so that characters such as "." inside it are
    # matched literally rather than as regex wildcards, and so the
    # look-behind stays fixed-width.
    pattern = r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(
        abbr=re.escape(abbr.strip())
    )
    # Prepending a space lets the \s look-behind also cover an abbreviation
    # at the very start of the text; the space is sliced off again below.
    return re.sub(pattern, "∯", " " + txt)[1:]
|
||
|
|
||
|
|
||
|
class AbbreviationReplacer(object):
    """Replace the periods that belong to abbreviations with ``∯``.

    The language object passed as ``lang`` must provide the attributes this
    class reads: ``PossessiveAbbreviationRule``, ``KommanditgesellschaftRule``,
    ``SingleLetterAbbreviationRules.All``, ``AmPmRules.All``,
    ``MULTI_PERIOD_ABBREVIATION_REGEX`` and ``Abbreviation`` (with
    ``ABBREVIATIONS``, ``PREPOSITIVE_ABBREVIATIONS`` and
    ``NUMBER_ABBREVIATIONS``).

    ``replace_abbreviation_as_sentence_boundary`` additionally reads
    ``self.SENTENCE_STARTERS``, which is not defined in this class
    (presumably supplied by a subclass -- confirm against callers).
    """

    def __init__(self, text, lang):
        """Store the text to process and the language definition."""
        self.text = text
        self.lang = lang

    def replace(self):
        """Run all abbreviation replacements and return the resulting text.

        The steps run in a fixed order: language rules for possessive,
        Kommanditgesellschaft and single-letter abbreviations; a per-line
        abbreviation search; multi-period abbreviations; AM/PM rules; and
        finally restoring the period where an abbreviation is followed by a
        sentence starter. ``self.text`` is updated along the way.
        """
        self.text = Text(self.text).apply(
            self.lang.PossessiveAbbreviationRule,
            self.lang.KommanditgesellschaftRule,
            *self.lang.SingleLetterAbbreviationRules.All
        )
        # keepends=True keeps the line terminators, so joining the processed
        # lines rebuilds the text with its original line breaks.
        self.text = "".join(
            self.search_for_abbreviations_in_string(line)
            for line in self.text.splitlines(True)
        )
        self.replace_multi_period_abbreviations()
        self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
        self.text = self.replace_abbreviation_as_sentence_boundary()
        return self.text

    def replace_abbreviation_as_sentence_boundary(self):
        """Turn ``∯`` back into ``.`` before a sentence starter.

        Applies to the fixed set of abbreviations in the pattern below when
        the ``∯`` that follows them is itself followed by whitespace, one of
        ``self.SENTENCE_STARTERS`` and whitespace. Updates and returns
        ``self.text``.
        """
        sent_starters = "|".join(
            r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS
        )
        # The "." in ``i.v`` / ``I.V`` is an unescaped wildcard, so those
        # alternatives accept any single character between the two letters.
        regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters)
        self.text = re.sub(regex, '\\1.', self.text)
        return self.text

    def replace_multi_period_abbreviations(self):
        """Replace every ``.`` inside multi-period abbreviations with ``∯``.

        Matches of ``self.lang.MULTI_PERIOD_ABBREVIATION_REGEX`` are found
        case-insensitively in ``self.text``, which is updated in place.
        Returns ``None``.
        """
        def mpa_replace(match):
            # Plain string replacement: every literal period in the match.
            return match.group().replace(".", "∯")

        self.text = re.sub(
            self.lang.MULTI_PERIOD_ABBREVIATION_REGEX,
            mpa_replace,
            self.text,
            flags=re.IGNORECASE,
        )

    def replace_period_of_abbr(self, txt, abbr):
        """Replace the period after ``abbr`` when the sentence continues.

        The period directly following ``abbr`` (preceded by whitespace or the
        start of ``txt``) becomes ``∯`` when it is followed by one of
        ``. : - ? ,`` or by a whitespace character and then a lowercase ASCII
        letter, ``I`` plus whitespace, ``I'm``, ``I'll``, a digit or ``(``.

        Args:
            txt: Text to process.
            abbr: Abbreviation as it appears in ``txt``; stripped and matched
                literally.

        Returns:
            ``txt`` with the matching periods replaced.
        """
        pattern = r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format(
            abbr=re.escape(abbr.strip())
        )
        # Prepending a space lets the \s look-behind also cover an
        # abbreviation at the very start; it is sliced off again below.
        return re.sub(pattern, "∯", " " + txt)[1:]

    def search_for_abbreviations_in_string(self, text):
        """Apply abbreviation replacements for each known abbreviation.

        For each entry of ``self.lang.Abbreviation.ABBREVIATIONS`` that occurs
        in the lowercased ``text``, every case-insensitive occurrence that
        starts the string or follows whitespace is handed to
        ``scan_for_replacements``.

        Args:
            text: Text to process (``replace`` passes one line at a time).

        Returns:
            The text after all replacements.
        """
        lowered = text.lower()
        for abbr in self.lang.Abbreviation.ABBREVIATIONS:
            stripped = abbr.strip()
            # Cheap substring test before running any regex.
            if stripped not in lowered:
                continue
            # Escape the abbreviation so a "." inside it (e.g. "a.d") only
            # matches a literal period and not unrelated words like "and".
            abbrev_match = re.findall(
                r"(?:^|\s|\r|\n){}".format(re.escape(stripped)),
                text,
                flags=re.IGNORECASE,
            )
            if not abbrev_match:
                continue
            # NOTE(review): the braces here are literal regex characters, so
            # this looks behind for "{<abbr>} " rather than "<abbr> ". It
            # looks like a leftover from string interpolation -- confirm
            # before changing, since it feeds the capitalisation check in
            # scan_for_replacements.
            next_word_start = r"(?<={" + re.escape(stripped) + "} ).{1}"
            char_array = re.findall(next_word_start, text)
            for ind, match in enumerate(abbrev_match):
                text = self.scan_for_replacements(text, match, ind, char_array)
        return text

    def scan_for_replacements(self, txt, am, ind, char_array):
        """Choose and apply the replacement for one abbreviation match.

        Args:
            txt: Text to process.
            am: The matched abbreviation text (may carry leading whitespace).
            ind: Index of this match, used to look up ``char_array``.
            char_array: Characters found after the abbreviation; a missing
                entry is treated as an empty string.

        Returns:
            ``txt`` after the prepositive, number or generic replacement, or
            unchanged when the looked-up character is uppercase and the
            abbreviation is not prepositive.
        """
        try:
            char = char_array[ind]
        except IndexError:
            char = ""
        prepositive = self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS
        number_abbr = self.lang.Abbreviation.NUMBER_ABBREVIATIONS
        key = am.strip().lower()

        # Prepositive abbreviations are always handled, whatever follows.
        if key in prepositive:
            return replace_prepositive_abbr(txt, am)
        # An uppercase character after the abbreviation leaves the period.
        if str(char).isupper():
            return txt
        if key in number_abbr:
            return replace_pre_number_abbr(txt, am)
        return self.replace_period_of_abbr(txt, am)
|