ai-content-maker/.venv/Lib/site-packages/pysbd/lang/japanese.py

52 lines
1.8 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
# -*- coding: utf-8 -*-
import re
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.lang.common import Common, Standard
from pysbd.punctuation_replacer import replace_punctuation
from pysbd.cleaner import Cleaner
from pysbd.utils import Text, Rule
class Japanese(Common, Standard):
    """Japanese ('ja') language definition built on ``Common`` and ``Standard``.

    Overrides three nested components: ``Cleaner``, ``AbbreviationReplacer``
    and ``BetweenPunctuation``.
    """

    iso_code = 'ja'

    class Cleaner(Cleaner):
        """Cleaner whose single clean-up step removes a newline after 'の'."""

        def __init__(self, text, lang, doc_type=None):
            """Initialise the base cleaner with ``text`` and ``lang``.

            ``doc_type`` is accepted but is not forwarded to the base class.
            """
            super().__init__(text, lang)

        def clean(self):
            """Apply the newline rule to ``self.text`` and return the result."""
            self.remove_newline_in_middle_of_word()
            return self.text

        def remove_newline_in_middle_of_word(self):
            """Apply a rule that drops a newline splitting a word after 'の'.

            The rule targets a newline that is immediately preceded by 'の'
            and immediately followed by a non-whitespace character, and
            replaces it with the empty string. The result is stored back on
            ``self.text``.
            """
            NewLineInMiddleOfWordRule = Rule(r'(?<=の)\n(?=\S)', '')
            self.text = Text(self.text).apply(NewLineInMiddleOfWordRule)

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer configured with no sentence starters."""

        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Handles punctuation inside full-width parentheses and corner quotes."""

        def __init__(self, text):
            """Initialise the base class with ``text``."""
            super().__init__(text)

        def replace(self):
            """Run both bracket substitutions and return the updated text."""
            self.sub_punctuation_between_quotes_and_parens()
            return self.text

        def sub_punctuation_between_parens_ja(self):
            """Pass each full-width parenthesised span to ``replace_punctuation``.

            ``replace_punctuation`` is used as the ``re.sub`` replacement
            callable, so it receives the match object for every span and its
            return value is substituted into ``self.text``.
            """
            # The delimiters are FULLWIDTH LEFT PARENTHESIS (U+FF08) and
            # FULLWIDTH RIGHT PARENTHESIS (U+FF09). They are spelled as \uXXXX
            # escapes, which the re module resolves itself in a raw str
            # pattern, so the pattern survives encoding mishaps. Without the
            # delimiters the character class is unterminated and re.sub
            # raises re.error.
            BETWEEN_PARENS_JA_REGEX = (
                r'\uFF08(?=(?P<tmp>[^\uFF08\uFF09]+|\\{2}|\\.)*)(?P=tmp)\uFF09'
            )
            self.text = re.sub(BETWEEN_PARENS_JA_REGEX, replace_punctuation,
                               self.text)

        def sub_punctuation_between_quotes_ja(self):
            """Pass each corner-bracket quoted span to ``replace_punctuation``.

            ``replace_punctuation`` is used as the ``re.sub`` replacement
            callable, so it receives the match object for every span and its
            return value is substituted into ``self.text``.
            """
            BETWEEN_QUOTE_JA_REGEX = r'「(?=(?P<tmp>[^「」]+|\\{2}|\\.)*)(?P=tmp)」'
            self.text = re.sub(BETWEEN_QUOTE_JA_REGEX, replace_punctuation,
                               self.text)

        def sub_punctuation_between_quotes_and_parens(self):
            """Process parenthesised spans first, then quoted spans."""
            self.sub_punctuation_between_parens_ja()
            self.sub_punctuation_between_quotes_ja()