ai-content-maker/.venv/Lib/site-packages/pysbd/lang/common/common.py

92 lines
3.8 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
# -*- coding: utf-8 -*-
import re
from pysbd.utils import Rule
class Common(object):
    """Regex patterns and ``Rule`` objects shared by the language definitions.

    Plain string attributes are regular-expression sources; ``Rule``
    attributes pair a pattern with a replacement string (``Rule`` comes from
    ``pysbd.utils``).  The ``∯`` character that appears in several patterns
    is presumably the placeholder pysbd substitutes for a protected period
    earlier in processing -- confirm against the processor before changing it.
    """

    # Sentence-boundary pattern, one alternative per line.
    #
    # Fullwidth characters are written as \uXXXX regex escapes so they cannot
    # be silently dropped or folded to ASCII by tooling:
    #   \uff08 / \uff09  fullwidth parentheses
    #   \uff0e           fullwidth full stop
    #   \uff01           fullwidth exclamation mark
    #   \uff1f           fullwidth question mark
    # Without the \uff08...\uff09 delimiters the first alternative degrades to
    # "[^]...]", i.e. a single negated character class, and stops matching
    # parenthesised text altogether.
    #
    # The "{2,}" alternative handles runs of intermittent dots, exclamation
    # marks, etc.; the final alternative handles input that consists of a
    # single terminator symbol.
    SENTENCE_BOUNDARY_REGEX = (
        # text in fullwidth parentheses, followed by optional space + capital
        r"\uff08(?:[^\uff09])*\uff09(?=\s?[A-Z])|"
        # text in corner brackets, followed by whitespace + capital
        r"「(?:[^」])*」(?=\s[A-Z])|"
        # at least two characters in ASCII parentheses
        r"\((?:[^\)]){2,}\)(?=\s[A-Z])|"
        # single-quoted text not ending in a comma
        r"\'(?:[^\'])*[^,]\'(?=\s[A-Z])|"
        # double-quoted text not ending in a comma
        r"\"(?:[^\"])*[^,]\"(?=\s[A-Z])|"
        # curly-double-quoted text not ending in a comma
        r"\“(?:[^\”])*[^,]\”(?=\s[A-Z])|"
        # two or more consecutive terminators / spaces
        r"[。\uff0e.\uff01!? ]{2,}|"
        # shortest run from a non-space character up to a terminator
        r"\S.*?[。\uff0e.\uff01!?\uff1fȸȹ☉☈☇☄]|"
        # a lone terminator
        r"[。\uff0e.\uff01!?]"
    )

    # Terminator, closing quote, exactly one whitespace, capital letter.
    # Rubular: http://rubular.com/r/NqCqv372Ix
    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]'

    # A parenthesised span sitting between two double-quote characters.
    # Rubular: http://rubular.com/r/6flGnUMEVl
    PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]'

    # Not ported (kept from the Ruby original for reference):
    # Rubular: http://rubular.com/r/TYzr4qOW1Q
    # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/

    # Zero-width variant of QUOTATION_AT_END_OF_SENTENCE_REGEX: matches only
    # the single whitespace between the closing quote and the capital letter.
    # Rubular: http://rubular.com/r/JMjlZHAT4g
    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])'

    # Three or more "!" / "?" directly after a non-space character and
    # followed by whitespace or end of input.
    # Rubular: http://rubular.com/r/mQ8Es9bxtk
    CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))'

    # A period (or ∯) followed by a bracketed or bare numeric reference and a
    # whitespace, just before a capital letter.
    # https://rubular.com/r/UkumQaILKbkeyc
    # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
    NUMBERED_REFERENCE_REGEX = r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])'

    # Period immediately followed by "'s" and then whitespace or end of text.
    # Rubular: http://rubular.com/r/yqa4Rit8EY
    PossessiveAbbreviationRule = Rule(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '')

    # Period between "Co" and " KG".
    # Rubular: http://rubular.com/r/NEv265G2X2
    KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '')

    # Lower-case letters separated by periods and ending in a period,
    # e.g. "a.b.c.".
    # Rubular: http://rubular.com/r/xDkpFZ0EgH
    MULTI_PERIOD_ABBREVIATION_REGEX = r"\b[a-z](?:\.[a-z])+[.]"

    class SingleLetterAbbreviationRules(object):
        """Searches for periods within an abbreviation and
        replaces the periods.
        """

        # Period after a single capital letter at the start of the text,
        # followed by whitespace.
        # Rubular: http://rubular.com/r/e3H6kwnr6H
        SingleUpperCaseLetterAtStartOfLineRule = Rule(r"(?<=^[A-Z])\.(?=\s)", '')

        # Period after a whitespace-preceded single capital letter, followed
        # by an optional comma and whitespace.
        # Rubular: http://rubular.com/r/gitvf0YWH4
        SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '')

        All = [
            SingleUpperCaseLetterAtStartOfLineRule,
            SingleUpperCaseLetterRule,
        ]

    class AmPmRules(object):
        """Rules matching the trailing ``∯`` of an A∯M / P∯M style marker
        that is followed by whitespace and a capital letter; each carries
        ``'.'`` as its replacement.
        """

        # Note: the lookbehind of this pattern starts with a literal space.
        # Rubular: http://rubular.com/r/Vnx3m4Spc8
        UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/AJMCotJVbW
        UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/13q7SnOhgA
        LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/DgUDq4mLz5
        LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.')

        All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]

    class Numbers(object):
        """Rules matching periods that are adjacent to digits; each carries
        the empty string as its replacement.
        """

        # Period directly before a digit.
        # Rubular: http://rubular.com/r/oNyxBOqbyy
        PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '')

        # Period between a digit and any non-space character.
        # Rubular: http://rubular.com/r/EMk5MpiUzt
        NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '')

        # Period after "\r" + one digit, followed by whitespace and a
        # non-space character, or by ")".
        # Rubular: http://rubular.com/r/rf4l1HjtjG
        NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '')

        # Same as above, but the single digit is at the start of the text.
        # Rubular: http://rubular.com/r/HPa4sdc6b9
        StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '')

        # Same as above, for a two-digit number at the start of the text.
        # Rubular: http://rubular.com/r/NuvWnKleFl
        StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '')

        All = [
            PeriodBeforeNumberRule,
            NumberAfterPeriodBeforeLetterRule,
            NewLineNumberPeriodSpaceLetterRule,
            StartLineNumberPeriodRule,
            StartLineTwoDigitNumberPeriodRule,
        ]