ai-content-maker/.venv/Lib/site-packages/pysbd/lang/japanese.py

52 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import re
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.lang.common import Common, Standard
from pysbd.punctuation_replacer import replace_punctuation
from pysbd.cleaner import Cleaner
from pysbd.utils import Text, Rule
class Japanese(Common, Standard):
    """Japanese ('ja') language profile.

    Inherits from ``Common`` and ``Standard`` and supplies Japanese-specific
    nested ``Cleaner``, ``AbbreviationReplacer`` and ``BetweenPunctuation``
    classes, each subclassing the imported class of the same name.
    """

    iso_code = 'ja'

    class Cleaner(Cleaner):
        """Cleaner whose ``clean`` only removes a newline that follows 'の'."""

        def __init__(self, text, lang, doc_type=None):
            """Initialise the base cleaner with ``text`` and ``lang``.

            ``doc_type`` is accepted but is not forwarded to the base
            initialiser; only ``text`` and ``lang`` are passed on.
            """
            super().__init__(text, lang)

        def clean(self):
            """Apply the Japanese newline rule and return the cleaned text."""
            self.remove_newline_in_middle_of_word()
            return self.text

        def remove_newline_in_middle_of_word(self):
            """Delete a newline preceded by 'の' and followed by non-whitespace.

            The result is stored back on ``self.text``.
            """
            # Lookbehind/lookahead keep the surrounding characters intact;
            # only the newline itself is replaced with the empty string.
            NewLineInMiddleOfWordRule = Rule(r'(?<=の)\n(?=\S)', '')
            self.text = Text(self.text).apply(NewLineInMiddleOfWordRule)

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer configured with no sentence starters."""

        # Empty list: no sentence-starter words are defined for Japanese here.
        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Handles punctuation enclosed in Japanese brackets and quotes.

        NOTE(review): ``replace_punctuation`` is an imported helper; it
        presumably masks punctuation inside each match so it is not treated
        as a sentence boundary -- confirm against pysbd.punctuation_replacer.
        """

        def __init__(self, text):
            """Initialise the base class with ``text``."""
            super().__init__(text)

        def replace(self):
            """Process parenthesised and quoted spans; return the new text."""
            self.sub_punctuation_between_quotes_and_parens()
            return self.text

        def sub_punctuation_between_parens_ja(self):
            """Run ``replace_punctuation`` over spans inside full-width parens.

            Updates ``self.text`` in place.
            """
            # The delimiters are FULLWIDTH parentheses (U+FF08 / U+FF09), not
            # ASCII '(' and ')'. They must be present both as the outer
            # delimiters and inside the negated set: without them the set
            # becomes '[^]+...' which never closes and raises re.error.
            # The lookahead + named backreference mirrors the quote regex in
            # sub_punctuation_between_quotes_ja (looks like the usual idiom
            # for emulating an atomic group).
            BETWEEN_PARENS_JA_REGEX = r'（(?=(?P<tmp>[^（）]+|\\{2}|\\.)*)(?P=tmp)）'
            self.text = re.sub(BETWEEN_PARENS_JA_REGEX, replace_punctuation,
                               self.text)

        def sub_punctuation_between_quotes_ja(self):
            """Run ``replace_punctuation`` over spans inside 「…」 quotes.

            Updates ``self.text`` in place.
            """
            BETWEEN_QUOTE_JA_REGEX = r'「(?=(?P<tmp>[^「」]+|\\{2}|\\.)*)(?P=tmp)」'
            self.text = re.sub(BETWEEN_QUOTE_JA_REGEX, replace_punctuation,
                               self.text)

        def sub_punctuation_between_quotes_and_parens(self):
            """Process parenthesised spans first, then quoted spans."""
            self.sub_punctuation_between_parens_ja()
            self.sub_punctuation_between_quotes_ja()