# -*- coding: utf-8 -*-
import re
from pysbd.utils import Rule


class Common(object):
    """Regex patterns and ``Rule`` objects shared by the segmenter.

    Plain ``*_REGEX`` attributes are raw pattern strings. ``Rule`` attributes
    pair a pattern with a replacement string; how they are applied is defined
    by ``pysbd.utils.Rule`` and its callers (presumably a regex substitution
    -- confirm against ``Rule``). Most rules swap a literal ``.`` for the
    stand-in character ``∯``; the ``AmPmRules`` swap ``∯`` back to ``.``.
    """

    # Added special case: r"[。．.！!?？ ]{2,}" to handle intermittent dots,
    # exclamation marks, etc.
    # r"[。．.！!?？]" at the end handles single instances of these symbols.
    #
    # The pattern is written as adjacent raw-string pieces (joined by the
    # compiler into one string), one alternative per line. The fullwidth
    # characters （ ） ． ！ ？ are intentional: with ASCII parentheses the
    # first alternative degenerates into a bare capture group that matches
    # any text before a capital letter.
    SENTENCE_BOUNDARY_REGEX = (
        # text in fullwidth parentheses, before an (optionally spaced) capital
        r"（(?:[^）])*）(?=\s?[A-Z])"
        # text in corner brackets, before whitespace + capital
        r"|「(?:[^」])*」(?=\s[A-Z])"
        # two or more characters in ASCII parentheses
        r"|\((?:[^\)]){2,}\)(?=\s[A-Z])"
        # single-quoted text not ending in a comma
        r"|\'(?:[^\'])*[^,]\'(?=\s[A-Z])"
        # double-quoted text not ending in a comma
        r"|\"(?:[^\"])*[^,]\"(?=\s[A-Z])"
        # curly-quoted text not ending in a comma
        r"|\“(?:[^\”])*[^,]\”(?=\s[A-Z])"
        # runs of two or more terminators and/or spaces
        r"|[。．.！!?？ ]{2,}"
        # shortest run from a non-space up to a terminator; ȸȹ☉☈☇☄ are
        # presumably stand-in symbols inserted elsewhere -- confirm
        r"|\S.*?[。．.！!?？ȸȹ☉☈☇☄]"
        # a lone terminator
        r"|[。．.！!?？]"
    )

    # Rubular: http://rubular.com/r/NqCqv372Ix
    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]'

    # Rubular: http://rubular.com/r/6flGnUMEVl
    PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]'

    # Rubular: http://rubular.com/r/TYzr4qOW1Q
    # (inactive, kept for reference)
    # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/

    # Rubular: http://rubular.com/r/JMjlZHAT4g
    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])'

    # Rubular: http://rubular.com/r/mQ8Es9bxtk
    CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))'

    # https://rubular.com/r/UkumQaILKbkeyc
    # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
    NUMBERED_REFERENCE_REGEX = r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])'

    # Rubular: http://rubular.com/r/yqa4Rit8EY
    # Period directly before a possessive "'s".
    PossessiveAbbreviationRule = Rule(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '∯')

    # Rubular: http://rubular.com/r/NEv265G2X2
    # Period in "Co. KG".
    KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '∯')

    # Rubular: http://rubular.com/r/xDkpFZ0EgH
    # Lowercase letters separated by periods and ending in a period.
    MULTI_PERIOD_ABBREVIATION_REGEX = r"\b[a-z](?:\.[a-z])+[.]"

    class SingleLetterAbbreviationRules(object):
        """Searches for periods within an abbreviation and replaces the periods.
        """

        # Rubular: http://rubular.com/r/e3H6kwnr6H
        # Period after a single capital letter at the start of the text.
        SingleUpperCaseLetterAtStartOfLineRule = Rule(r"(?<=^[A-Z])\.(?=\s)", '∯')

        # Rubular: http://rubular.com/r/gitvf0YWH4
        # Period after a whitespace-preceded single capital letter.
        SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯')

        All = [
            SingleUpperCaseLetterAtStartOfLineRule,
            SingleUpperCaseLetterRule,
        ]

    class AmPmRules(object):
        """Rules turning the ``∯`` after A∯M / P∯M back into ``.``.

        Each rule only fires when whitespace and a capital letter follow.
        """

        # Rubular: http://rubular.com/r/Vnx3m4Spc8
        # NOTE(review): unlike the three sibling rules, this lookbehind
        # requires a space before "P" -- confirm this is intentional.
        UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/AJMCotJVbW
        UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/13q7SnOhgA
        LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/DgUDq4mLz5
        LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.')

        All = [
            UpperCasePmRule,
            UpperCaseAmRule,
            LowerCasePmRule,
            LowerCaseAmRule,
        ]

    class Numbers(object):
        """Rules replacing periods that sit next to digits with ``∯``."""

        # Rubular: http://rubular.com/r/oNyxBOqbyy
        # Period immediately followed by a digit.
        PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯')

        # Rubular: http://rubular.com/r/EMk5MpiUzt
        # Period between a digit and any non-whitespace character.
        NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯')

        # Rubular: http://rubular.com/r/rf4l1HjtjG
        # Period after "\r<digit>", before " <non-space>" or ")".
        NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '∯')

        # Rubular: http://rubular.com/r/HPa4sdc6b9
        # Period after a single leading digit, before " <non-space>" or ")".
        StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯')

        # Rubular: http://rubular.com/r/NuvWnKleFl
        # Period after two leading digits, before " <non-space>" or ")".
        StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯')

        All = [
            PeriodBeforeNumberRule,
            NumberAfterPeriodBeforeLetterRule,
            NewLineNumberPeriodSpaceLetterRule,
            StartLineNumberPeriodRule,
            StartLineTwoDigitNumberPeriodRule,
        ]