# Source: ai-content-maker/.venv/Lib/site-packages/pysbd/lang/deutsch.py
# (98 lines, 5.1 KiB, Python; revision dated 2024-05-03 04:18:51 +03:00)
# -*- coding: utf-8 -*-
import re
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.lang.common import Common, Standard
from pysbd.punctuation_replacer import replace_punctuation
from pysbd.processor import Processor
from pysbd.utils import Text, Rule
class Deutsch(Common, Standard):
    """German ('de') language definition for sentence boundary detection.

    Bundles the German-specific pieces on top of ``Common`` and ``Standard``:
    number rules, a processor that also handles dates, the abbreviation
    lists, an abbreviation replacer and quote handling for ``„…“`` / ``,,…“``.

    NOTE(review): every substitution in this class replaces the matched
    period with the empty string, i.e. deletes it. If this copy of the file
    lost a non-ASCII placeholder character during extraction, those
    replacement strings need to be restored -- confirm against the
    installed package before relying on them.
    """

    iso_code = 'de'

    class Numbers(Common.Numbers):
        """Common number rules extended with two German period rules."""

        # Period that follows whitespace + one or two digits and precedes
        # whitespace (the "." in " 3. " or " 12. ").
        # Rubular: http://rubular.com/r/hZxoyQwKT1
        NumberPeriodSpaceRule = Rule(r'(?<=\s\d)\.(?=\s)|(?<=\s\d\d)\.(?=\s)', '')

        # Same as above, but the one or two digits follow a "-" sign.
        # Rubular: http://rubular.com/r/ityNMwdghj
        NegativeNumberPeriodSpaceRule = Rule(r'(?<=-\d)\.(?=\s)|(?<=-\d\d)\.(?=\s)', '')

        All = Common.Numbers.All + [NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule]

    class Processor(Processor):
        """Processor variant that additionally handles periods in German dates."""

        def __init__(self, text, lang, char_span=False):
            super().__init__(text, lang, char_span)

        def replace_numbers(self):
            """Apply all number rules, then the date handling; return the text.

            Updates ``self.text`` in place and returns the updated value.
            """
            self.text = Text(self.text).apply(*self.lang.Numbers.All)
            self.replace_period_in_deutsch_dates()
            return self.text

        def replace_period_in_deutsch_dates(self):
            """Remove the period between a digit and a German month name.

            Matches a "." that directly follows a digit and is followed by
            optional whitespace and one of the twelve month names
            (e.g. the "." in "3. Mai"). Updates ``self.text`` in place.
            """
            MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August',
                      'September', 'Oktober', 'November', 'Dezember']
            for month in MONTHS:
                # Rubular: http://rubular.com/r/zlqgj7G5dA
                pattern = r'(?<=\d)\.(?=\s*{month})'.format(month=month)
                self.text = re.sub(pattern, '', self.text)

    class Abbreviation(Standard.Abbreviation):
        """Abbreviation lists for German.

        Several entries carry a trailing space ('dr ', 'ev ', 'kath ',
        'str ', 'u.a '); they are kept exactly as they are.
        """

        ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt', 'univ.-prof', 'o.univ.-prof', 'ao.univ.prof', 'ass.prof', 'hon.prof', 'univ.-doz', 'univ.ass', 'stud.ass', 'projektass', 'ass', 'di', 'dipl.-ing', 'mag']
        PREPOSITIVE_ABBREVIATIONS = []
        NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer with German sentence starters and rules."""

        SENTENCE_STARTERS = ("Am Auch Auf Bei Da Das Der Die Ein Eine Es Für Heute Ich Im In "
                             "Ist Jetzt Mein Mit Nach So Und Warum Was Wenn Wer Wie Wir").split(' ')

        def __init__(self, text, lang):
            super().__init__(text, lang)

        def replace(self):
            """Run the abbreviation pipeline on ``self.text`` and return it.

            Steps, in order: possessive / single-letter rules plus the two
            lower-case single-letter rules defined below, abbreviation
            search, multi-period abbreviations, AM/PM rules, and finally the
            abbreviation-as-sentence-boundary pass. The helper methods used
            here are not defined in this class (presumably inherited from
            the base replacer).
            """
            # Period after a single lower-case letter that follows whitespace.
            # Rubular: http://rubular.com/r/B4X33QKIL8
            SingleLowerCaseLetterRule = Rule(r'(?<=\s[a-z])\.(?=\s)', '')

            # Period after a single lower-case letter at the start of the text.
            # Rubular: http://rubular.com/r/iUNSkCuso0
            SingleLowerCaseLetterAtStartOfLineRule = Rule(r'(?<=^[a-z])\.(?=\s)', '')

            self.text = Text(self.text).apply(
                self.lang.PossessiveAbbreviationRule,
                *self.lang.SingleLetterAbbreviationRules.All,
                SingleLowerCaseLetterRule,
                SingleLowerCaseLetterAtStartOfLineRule)

            self.text = self.search_for_abbreviations_in_string(self.text)
            self.replace_multi_period_abbreviations()
            self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
            self.text = self.replace_abbreviation_as_sentence_boundary()
            return self.text

        def scan_for_replacements(self, txt, am, index, character_array):
            """Remove the period that follows ``am`` when whitespace comes next.

            Args:
                txt: Text to process.
                am: Literal text (presumably a matched abbreviation) whose
                    trailing period should be removed.
                index: Unused here; kept so the signature stays unchanged.
                character_array: Unused here; kept for the same reason.

            Returns:
                ``txt`` with every "." that directly follows ``am`` and
                precedes whitespace removed.
            """
            # Escape ``am`` so it is matched literally: abbreviations such as
            # 'z.b' contain "." (a regex wildcard), and any metacharacter in
            # the matched text would otherwise alter or break the pattern.
            pattern = r'(?<={am})\.(?=\s)'.format(am=re.escape(am))
            return re.sub(pattern, '', txt)

    class BetweenPunctuation(BetweenPunctuation):
        """Punctuation handling for German-style double quotes."""

        def __init__(self, text):
            super().__init__(text)

        def sub_punctuation_between_double_quotes(self, txt):
            """Apply ``replace_punctuation`` to text inside German double quotes.

            Handles ``„…“`` first; if no ``„`` is present, falls back to the
            unconventional ``,,…“`` form. Text containing neither opener is
            returned unchanged.
            """
            # Rubular: http://rubular.com/r/OdcXBsub0w
            BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = r',,(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“'

            # Rubular: http://rubular.com/r/2UskIupGgP
            # SPLIT_DOUBLE_QUOTES_DE_REGEX = r'\A„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“'

            # Rubular: http://rubular.com/r/TkZomF9tTM
            BETWEEN_DOUBLE_QUOTES_DE_REGEX = r'„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“'

            # Test for the actual opening quote. The previous guard checked
            # for the empty string, which is contained in every string, so
            # the ',,' branch below could never run.
            if '„' in txt:
                return re.sub(BETWEEN_DOUBLE_QUOTES_DE_REGEX, replace_punctuation, txt)
            if ',,' in txt:
                return re.sub(BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX,
                              replace_punctuation, txt)
            return txt