52 lines
1.8 KiB
Python
52 lines
1.8 KiB
Python
# -*- coding: utf-8 -*-
|
||
import re
|
||
from pysbd.abbreviation_replacer import AbbreviationReplacer
|
||
from pysbd.between_punctuation import BetweenPunctuation
|
||
from pysbd.lang.common import Common, Standard
|
||
from pysbd.punctuation_replacer import replace_punctuation
|
||
from pysbd.cleaner import Cleaner
|
||
from pysbd.utils import Text, Rule
|
||
|
||
class Japanese(Common, Standard):
    """Japanese (``'ja'``) language profile.

    Inherits from ``Common`` and ``Standard`` and overrides three nested
    helper classes: ``Cleaner``, ``AbbreviationReplacer`` and
    ``BetweenPunctuation``.
    """

    iso_code = 'ja'

    class Cleaner(Cleaner):
        """Cleaner whose only step removes a newline that follows 'の'."""

        def __init__(self, text, lang, doc_type=None):
            """Initialise the base cleaner with ``text`` and ``lang``.

            ``doc_type`` is accepted but not forwarded to the base class.
            NOTE(review): presumably kept for signature compatibility with
            the base ``Cleaner`` -- confirm against pysbd.cleaner.
            """
            super().__init__(text, lang)

        def clean(self):
            """Run the Japanese cleaning step and return ``self.text``."""
            # assumes the base __init__ stored the input on self.text
            self.remove_newline_in_middle_of_word()
            return self.text

        def remove_newline_in_middle_of_word(self):
            """Delete a newline that directly follows 'の'.

            The newline is removed only when the next character is not
            whitespace, so blank lines and indented continuations are left
            alone.
            """
            NewLineInMiddleOfWordRule = Rule(r'(?<=の)\n(?=\S)', '')
            self.text = Text(self.text).apply(NewLineInMiddleOfWordRule)

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer with an empty sentence-starter list."""

        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Handles punctuation enclosed in Japanese brackets and quotes."""

        def __init__(self, text):
            """Initialise the base class with ``text``."""
            super().__init__(text)

        def replace(self):
            """Process parenthesised and quoted spans; return ``self.text``."""
            self.sub_punctuation_between_quotes_and_parens()
            return self.text

        def sub_punctuation_between_parens_ja(self):
            """Apply ``replace_punctuation`` to each full-width (...) span.

            The delimiters have to be the literal full-width parentheses
            U+FF08 / U+FF09. Bare ASCII ``(`` ``)`` would be parsed by the
            regex engine as a capturing group, making the pattern match
            every run of text instead of only bracketed spans.
            """
            # The lookahead captures the enclosed text and the
            # backreference then consumes exactly that capture.
            BETWEEN_PARENS_JA_REGEX = (
                r'（(?=(?P<tmp>[^（）]+|\\{2}|\\.)*)(?P=tmp)）'
            )
            # replace_punctuation receives each match object as the re.sub
            # callback. NOTE(review): presumably it masks sentence
            # punctuation inside the span -- confirm in
            # pysbd.punctuation_replacer.
            self.text = re.sub(BETWEEN_PARENS_JA_REGEX, replace_punctuation,
                               self.text)

        def sub_punctuation_between_quotes_ja(self):
            """Apply ``replace_punctuation`` to each 「...」 quoted span."""
            # Same lookahead + backreference construction as the
            # parenthesis pattern, delimited by corner brackets.
            BETWEEN_QUOTE_JA_REGEX = (
                r'「(?=(?P<tmp>[^「」]+|\\{2}|\\.)*)(?P=tmp)」'
            )
            self.text = re.sub(BETWEEN_QUOTE_JA_REGEX, replace_punctuation,
                               self.text)

        def sub_punctuation_between_quotes_and_parens(self):
            """Process parenthesised spans first, then quoted spans."""
            self.sub_punctuation_between_parens_ja()
            self.sub_punctuation_between_quotes_ja()