
114 lines
6.4 KiB
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
from pysbd.utils import Rule
from pysbd.abbreviation_replacer import AbbreviationReplacer
class Standard:
    """Punctuation marks and text-substitution rules.

    Every rule below is built as ``Rule(<regex pattern>, <replacement>)``.
    """

    # NOTE(review): indentation was lost in the copy under review, so it is
    # unclear whether the classes that follow this one in the file are meant
    # to be nested inside it -- confirm against the canonical file.

    # This class holds the punctuation marks.
    # NOTE(review): four entries are empty strings in this copy; presumably
    # non-ASCII punctuation characters were dropped in extraction -- confirm
    # and restore from the canonical file.
    Punctuations = ['', '', '.', '', '!', '?', '']

    # A period that directly follows "<ASCII letter>°" and precedes optional
    # whitespace plus digits.  The character class is [a-zA-Z]: the former
    # range "A-z" also admitted the characters [ \ ] ^ _ ` that sit between
    # 'Z' and 'a' in ASCII.
    # NOTE(review): the replacement is an empty string in this copy; it may
    # be a placeholder character lost in extraction -- confirm.
    GeoLocationRule = Rule(r'(?<=[a-zA-Z]°)\.(?=\s*\d+)', '')

    # A period preceded by whitespace and followed by one of the listed file
    # extensions and then whitespace (e.g. " .pdf ").
    # NOTE(review): empty replacement here as well -- confirm.
    FileFormatRule = Rule(
        r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|'
        r'odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|'
        r'rb|cpp|cs|js)\s)',
        '',
    )

    # Every newline becomes the placeholder 'ȹ'.
    SingleNewLineRule = Rule(r'\n', 'ȹ')

    # A question mark immediately followed by a single or double quote
    # becomes the placeholder '&ᓷ&'.
    QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&')

    # Runs of three or more whitespace characters collapse to one space.
    ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ')

    # The placeholder '&⎋&' is turned back into a single quote.
    SubSingleQuoteRule = Rule(r'&⎋&', "'")
class Abbreviation(object):
    """Abbreviation tables for a language (when available)."""

    # All abbreviations, lower-case and without the trailing period.
    ABBREVIATIONS = [
        'adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark',
        'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg',
        'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr',
        'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres',
        'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr',
        'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc',
        'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga',
        'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id',
        'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul',
        'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man',
        'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg',
        'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont',
        'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev',
        'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord',
        'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d',
        'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep',
        'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep',
        'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn',
        'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs',
        'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk', 'fig',
    ]

    # Presumably abbreviations that come before the word they qualify
    # (titles and the like) -- confirm against the code that reads this.
    PREPOSITIVE_ABBREVIATIONS = [
        'adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr',
        'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs',
        'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt',
        'st', 'supt', 'v', 'vs', 'fig',
    ]

    # Presumably abbreviations that are followed by a number -- confirm.
    NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']

    # A period sitting between two ASCII word characters (as in e-mail
    # addresses or multi-period tokens) is swapped for '∮', keeping the
    # characters on both sides.  An earlier form of this pattern used
    # (\w)(\.)(\w), but \w in Python also matches non-English (Unicode)
    # letters, so the classes are limited to English alphanumerics and '_'.
    WithMultiplePeriodsAndEmailRule = Rule(
        r'([a-zA-Z0-9_])(\.)([a-zA-Z0-9_])',
        '\\1∮\\3',
    )
class DoublePunctuationRules(object):
    """Rules that match the two-character runs '?!', '!?', '??' and '!!'."""

    # NOTE(review): all four replacements are empty strings in this copy;
    # they may be placeholder characters lost in extraction -- confirm
    # against the canonical file.
    FirstRule = Rule(r'\?!', '')    # "?!"
    SecondRule = Rule(r'!\?', '')   # "!?"
    ThirdRule = Rule(r'\?\?', '')   # "??"
    ForthRule = Rule(r'!!', '')     # "!!"

    # One alternation covering the same four sequences as the rules above.
    DoublePunctuation = r'\?!|!\?|\?\?|!!'

    All = [
        FirstRule,
        SecondRule,
        ThirdRule,
        ForthRule,
    ]
class ExclamationPointRules(object):
    """Rules that replace certain exclamation points with '&ᓴ&'."""

    # "!" immediately followed by a single or double quote.
    InQuotationRule = Rule(r'\!(?=(\'|\"))', '&ᓴ&')

    # "!" followed by a comma, one whitespace character and a lower-case
    # ASCII letter.
    BeforeCommaMidSentenceRule = Rule(r'\!(?=\,\s[a-z])', '&ᓴ&')

    # "!" followed by one whitespace character and a lower-case ASCII letter.
    MidSentenceRule = Rule(r'\!(?=\s[a-z])', '&ᓴ&')

    All = [
        InQuotationRule,
        BeforeCommaMidSentenceRule,
        MidSentenceRule,
    ]
class SubSymbolsRules(object):
    """Rules that turn placeholder symbols back into punctuation."""

    # NOTE(review): several patterns and replacements below are empty
    # strings in this copy, presumably because non-ASCII characters were
    # dropped in extraction.  An empty regex matches at every position, so
    # these must be restored from the canonical file before use.
    Period = Rule(r'', '.')
    ArabicComma = Rule(r'', '،')
    SemiColon = Rule(r'', ':')
    FullWidthPeriod = Rule(r'&ᓰ&', '')
    SpecialPeriod = Rule(r'&ᓱ&', '')
    FullWidthExclamation = Rule(r'&ᓳ&', '')
    ExclamationPoint = Rule(r'&ᓴ&', '!')
    QuestionMark = Rule(r'&ᓷ&', '?')
    FullWidthQuestionMark = Rule(r'&ᓸ&', '')
    MixedDoubleQE = Rule(r'', '?!')
    MixedDoubleQQ = Rule(r'', '??')
    MixedDoubleEQ = Rule(r'', '!?')
    MixedDoubleEE = Rule(r'', '!!')
    LeftParens = Rule(r'&✂&', '(')
    RightParens = Rule(r'&⌬&', ')')
    # 'ȸ' is removed outright (replaced by the empty string).
    TemporaryEndingPunctutation = Rule(r'ȸ', '')
    # 'ȹ' becomes a real newline again.
    Newline = Rule(r'ȹ', "\n")

    # Every rule defined above, in definition order.  The list was left
    # unterminated and lacked Newline, the only rule of this class that was
    # missing from it; it is now closed with Newline as the last entry.
    All = [Period, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod,
           FullWidthExclamation, ExclamationPoint, QuestionMark,
           FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ,
           MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation,
           Newline]
class EllipsisRules(object):
    """Rules that swap ellipsis sequences for placeholder symbols."""

    # These rules differ from the original pragmatic segmenter rules:
    # each match is replaced by symbols of the same length as the match.

    # "..." followed by whitespace and an upper-case ASCII letter.
    ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.')

    # The first three periods of "...." when the run follows a
    # non-whitespace character and the fourth period is followed by
    # whitespace and an upper-case ASCII letter.
    FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ')

    # Three "<whitespace>." pairs followed by one more whitespace character.
    ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟')

    # Three ".<whitespace>" pairs and a final period, after a lower-case
    # ASCII letter, at the end of the string.
    # NOTE(review): because the pattern is a raw string, the `\\n`
    # alternative matches a literal backslash followed by "n", not a
    # newline character -- confirm that this is intended.
    FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝')

    # Any other run of three periods.
    OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ')

    All = [
        ThreeSpaceRule,
        FourSpaceRule,
        FourConsecutiveRule,
        ThreeConsecutiveRule,
        OtherThreePeriodRule,
    ]
class ReinsertEllipsisRules(object):
    """Rules that turn ellipsis placeholder symbols back into periods."""

    # These rules differ from the original pragmatic segmenter rules:
    # each run of symbols is replaced by an ellipsis text of equal length.
    SubThreeConsecutivePeriod = Rule(r'ƪƪƪ', '...')
    SubThreeSpacePeriod = Rule(r'♟♟♟♟♟♟♟', ' . . . ')
    SubFourSpacePeriod = Rule(r'♝♝♝♝♝♝♝', '. . . .')
    SubTwoConsecutivePeriod = Rule(r'☏☏', '..')

    # NOTE(review): the pattern is an empty string in this copy, presumably
    # a placeholder character lost in extraction.  An empty regex matches at
    # every position -- restore it from the canonical file before use.
    SubOnePeriod = Rule(r'', '.')

    All = [
        SubThreeConsecutivePeriod,
        SubThreeSpacePeriod,
        SubFourSpacePeriod,
        SubTwoConsecutivePeriod,
        SubOnePeriod,
    ]
class AbbreviationReplacer(AbbreviationReplacer):
    """Subclass of the imported AbbreviationReplacer that sets SENTENCE_STARTERS."""

    # List of capitalised words, built by splitting one space-separated
    # string.  Presumably these are words that may start a new sentence
    # right after an abbreviation -- confirm against the base class.
    SENTENCE_STARTERS = (
        "A Being Did For He How However I In It Millions "
        "More She That The There They We What When Where Who Why"
    ).split(" ")