31 lines
897 B
Python
31 lines
897 B
Python
|
# -*- coding: utf-8 -*-
|
||
|
import re
|
||
|
|
||
|
from pysbd.abbreviation_replacer import AbbreviationReplacer
|
||
|
from pysbd.lang.common import Common, Standard
|
||
|
from pysbd.utils import Rule
|
||
|
|
||
|
class Persian(Common, Standard):
    """Language definition for Persian (``iso_code = 'fa'``).

    Combines ``Common`` and ``Standard`` and defines its own punctuation
    list, sentence boundary regex, two replacement rules and a nested
    ``AbbreviationReplacer``.
    """

    iso_code = 'fa'

    # Punctuation marks for this language: the ASCII marks plus the Arabic
    # question mark '؟'.
    Punctuations = ['?', '!', ':', '.', '؟']

    # Alternatives, tried in order: the shortest run ending in one of
    # : . ! ? ؟, otherwise the run up to the end of the string (\Z / $).
    SENTENCE_BOUNDARY_REGEX = r'.*?[:\.!\?؟]|.*?\Z|.*?$'

    # Rubular: http://rubular.com/r/RX5HpdDIyv
    # Pattern: a colon with a digit directly on both sides (e.g. "10:30");
    # replacement: '♭'.
    ReplaceColonBetweenNumbersRule = Rule(r'(?<=\d):(?=\d)', '♭')

    # Rubular: http://rubular.com/r/kPRgApNHUg
    # Pattern: an Arabic comma '،' that is followed by one whitespace
    # character, a run of non-whitespace and another '،'; replacement: '♬'.
    ReplaceNonSentenceBoundaryCommaRule = Rule(r'،(?=\s\S+،)', '♬')

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer with a simplified ``scan_for_replacements``.

        The base class is the module-level ``AbbreviationReplacer`` import;
        this nested class reuses the same name.
        """

        # No sentence starters are defined here.
        SENTENCE_STARTERS = []

        def __init__(self, text, lang):
            """Forward ``text`` and ``lang`` unchanged to the base class."""
            super().__init__(text, lang)

        def scan_for_replacements(self, txt, am, index, character_array):
            """Replace each period that directly follows ``am`` with '∯'.

            Args:
                txt: Text to rewrite.
                am: Abbreviation text; it is interpolated into a regex
                    look-behind, so the period must come immediately
                    after it.
                index: Unused by this implementation.
                character_array: Unused by this implementation.

            Returns:
                ``txt`` with every matching period replaced by '∯'.
            """
            # Raw string: a plain '\.' literal is an invalid escape sequence
            # and triggers a SyntaxWarning on Python 3.12+.
            # NOTE(review): `am` is inserted into the pattern unescaped, so
            # any regex metacharacters in it are interpreted as regex --
            # confirm whether callers rely on that before adding re.escape.
            pattern = r'(?<={0})\.'.format(am)
            return re.sub(pattern, '∯', txt)
|