92 lines
3.8 KiB
Python
92 lines
3.8 KiB
Python
# -*- coding: utf-8 -*-
|
||
import re
|
||
from pysbd.utils import Rule
|
||
|
||
class Common(object):
    """Shared regex patterns and period-masking rules.

    Plain string attributes hold regular-expression source text.  Attributes
    built with ``Rule(pattern, replacement)`` pair a pattern with the text
    that replaces each match (``Rule`` comes from ``pysbd.utils``; its exact
    behaviour is not visible here -- presumably a simple pattern/replacement
    holder applied by the caller).

    Most rules swap a literal ``.`` for the placeholder ``∯`` so that the
    period is not picked up as a terminator by ``SENTENCE_BOUNDARY_REGEX``;
    ``AmPmRules`` swaps ``∯`` back to ``.`` in selected contexts.
    """

    # NOTE: this pattern deliberately contains FULL-WIDTH characters:
    #   （ U+FF08   ） U+FF09   ． U+FF0E   ！ U+FF01   ？ U+FF1F
    # They must never be normalised to their ASCII look-alikes.  With ASCII
    # parentheses the first alternative degrades to "((?:[^)])*)", a capture
    # group that matches any run of non-")" characters (even the empty run)
    # and therefore shadows every alternative after it; the terminator
    # classes would also stop recognising full-width punctuation.
    #
    # Special case r"[。．.！!?？ ]{2,}" handles intermittent runs of dots,
    # exclamation marks, etc.; the trailing r"[。．.！!?？]" handles a single
    # occurrence of one of these symbols as the whole input.
    #
    # Alternatives are tried left to right, so their order is significant.
    SENTENCE_BOUNDARY_REGEX = (
        # full-width parenthesised span, then optional space + capital
        r"（(?:[^）])*）(?=\s?[A-Z])"
        # corner-bracket 「...」 span, then space + capital
        r"|「(?:[^」])*」(?=\s[A-Z])"
        # ASCII parenthesised span of at least two characters
        r"|\((?:[^\)]){2,}\)(?=\s[A-Z])"
        # single-quoted span whose last inner character is not a comma
        r"|\'(?:[^\'])*[^,]\'(?=\s[A-Z])"
        # straight double-quoted span, same comma restriction
        r"|\"(?:[^\"])*[^,]\"(?=\s[A-Z])"
        # curly double-quoted span, same comma restriction
        r"|\“(?:[^\”])*[^,]\”(?=\s[A-Z])"
        # run of two or more terminators and/or spaces
        r"|[。．.！!?？ ]{2,}"
        # shortest stretch from a non-space character up to a terminator
        # (ȸȹ☉☈☇☄ are presumably stand-in symbols produced by other rules
        # -- TODO confirm against the callers)
        r"|\S.*?[。．.！!?？ȸȹ☉☈☇☄]"
        # a lone terminator
        r"|[。．.！!?？]"
    )

    # Rubular: http://rubular.com/r/NqCqv372Ix
    # Terminator (or hyphen) + closing quote + exactly one whitespace + capital.
    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]'

    # Rubular: http://rubular.com/r/6flGnUMEVl
    # A parenthesised span sitting between two double-quote characters.
    PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]'

    # Rubular: http://rubular.com/r/TYzr4qOW1Q
    # Not defined in this class; kept in its /.../ literal form for reference:
    # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/

    # Rubular: http://rubular.com/r/JMjlZHAT4g
    # Zero-width variant of QUOTATION_AT_END_OF_SENTENCE_REGEX: matches only
    # the single whitespace between the closing quote and the capital.
    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])'

    # Rubular: http://rubular.com/r/mQ8Es9bxtk
    # Three or more "!" / "?" directly after a non-space character and
    # followed by whitespace or end of input.
    CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))'

    # https://rubular.com/r/UkumQaILKbkeyc
    # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
    # A period (or its ∯ placeholder) followed by a bracketed or bare
    # reference number, one whitespace, and then a capital letter.
    NUMBERED_REFERENCE_REGEX = r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])'

    # Rubular: http://rubular.com/r/yqa4Rit8EY
    # Period immediately before a possessive "'s".
    PossessiveAbbreviationRule = Rule(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '∯')

    # Rubular: http://rubular.com/r/NEv265G2X2
    # Period in "Co. KG".
    KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '∯')

    # Rubular: http://rubular.com/r/xDkpFZ0EgH
    # Lower-case single letters separated by periods and ending in a period,
    # e.g. "a.b.".
    MULTI_PERIOD_ABBREVIATION_REGEX = r"\b[a-z](?:\.[a-z])+[.]"

    class SingleLetterAbbreviationRules(object):
        """Replace the period after a single upper-case letter with ``∯``."""

        # Rubular: http://rubular.com/r/e3H6kwnr6H
        # Upper-case letter at the very start, followed by ". ".
        SingleUpperCaseLetterAtStartOfLineRule = Rule(r"(?<=^[A-Z])\.(?=\s)", '∯')

        # Rubular: http://rubular.com/r/gitvf0YWH4
        # Upper-case letter preceded by whitespace; an optional comma may sit
        # between the period and the following whitespace.
        SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯')

        All = [
            SingleUpperCaseLetterAtStartOfLineRule,
            SingleUpperCaseLetterRule,
        ]

    class AmPmRules(object):
        """Turn the final ``∯`` of a masked A.M./P.M. back into ``.``.

        Each rule fires only when the marker is followed by whitespace and an
        upper-case letter.
        """

        # Rubular: http://rubular.com/r/Vnx3m4Spc8
        # NOTE(review): only this rule requires a space before the "P" in its
        # lookbehind; the other three do not -- confirm this is intentional.
        UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/AJMCotJVbW
        UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/13q7SnOhgA
        LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/DgUDq4mLz5
        LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.')

        All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]

    class Numbers(object):
        """Replace periods that are attached to digits with ``∯``."""

        # Rubular: http://rubular.com/r/oNyxBOqbyy
        # Period directly before a digit.
        PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯')

        # Rubular: http://rubular.com/r/EMk5MpiUzt
        # Period between a digit and any non-space character.
        NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯')

        # Rubular: http://rubular.com/r/rf4l1HjtjG
        # Carriage return + one digit + period, followed by " x" or ")".
        NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '∯')

        # Rubular: http://rubular.com/r/HPa4sdc6b9
        # Same shape, but the single digit is at the very start.
        StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯')

        # Rubular: http://rubular.com/r/NuvWnKleFl
        # Same shape with a two-digit number at the very start.
        StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯')

        All = [
            PeriodBeforeNumberRule,
            NumberAfterPeriodBeforeLetterRule,
            NewLineNumberPeriodSpaceLetterRule,
            StartLineNumberPeriodRule,
            StartLineTwoDigitNumberPeriodRule,
        ]
|