# -*- coding: utf-8 -*-
from pysbd.utils import Rule


class CleanRules(object):
    """Regex rules that normalise newlines, escape sequences and noise.

    Every rule is built as ``Rule(pattern, replacement)``: a regular
    expression paired with the string that replaces each match.
    """

    # NOTE: Caution: special characters might require an extra ``\\``.
    # If the regex is defined as a raw string (r'') then don't add the
    # extra ``\\`` for special characters.

    # A newline followed by a 1-2 letter fragment on its own line.
    # Rubular: http://rubular.com/r/V57WnM9Zut
    NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '')

    # Two newlines separated by a single space.
    # Rubular: http://rubular.com/r/dMxp5MixFS
    DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r")

    # Two consecutive newlines.
    # Rubular: http://rubular.com/r/H6HOJeA8bq
    DoubleNewLineRule = Rule(r'\n\n', "\r")

    # A newline directly before a period that is followed by whitespace.
    # Rubular: http://rubular.com/r/FseyMiiYFT
    NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '')

    ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r")

    # Literal two-character sequences "\n" / "\r" (backslash + letter)
    # are turned into the real control characters.
    EscapedNewLineRule = Rule(r'\\n', "\n")

    EscapedCarriageReturnRule = Rule(r'\\r', "\r")

    # Same as above but with a stray space after the backslash ("\ n").
    TypoEscapedNewLineRule = Rule(r'\\\ n', "\n")

    TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r")

    # Inline formatting markers such as "{b^>12<b^}", in both the
    # entity-escaped and the plain spelling.
    # NOTE(review): SOURCE had the entities decoded and the end of this
    # literal stripped; the pattern below is a reconstruction - verify.
    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    InlineFormattingRule = Rule(r'{b\^&gt;\d*&lt;b\^}|{b\^>\d*<b\^}', '')

    # ------------------------------------------------------------------
    # NOTE(review): everything from here down to (and including) the
    # first half of ``HTML.HTMLTagRule`` was missing from SOURCE (text
    # between a '<' and a later '>' had been stripped). The definitions
    # below are reconstructed from the upstream project and must be
    # checked against it before merging.
    # ------------------------------------------------------------------

    # Dot leaders followed by a page number / page range.
    TableOfContentsRule = Rule(r'\.{4,}\s*\d+-*\d*', "\r")

    # Five or more consecutive periods.
    ConsecutivePeriodsRule = Rule(r'\.{5,}', ' ')

    # Exactly three consecutive forward slashes.
    ConsecutiveForwardSlashRule = Rule(r'\/{3}', '')

    # A period squeezed between a lowercase letter and an uppercase one.
    NO_SPACE_BETWEEN_SENTENCES_REGEX = r'(?<=[a-z])\.(?=[A-Z])'
    NoSpaceBetweenSentencesRule = Rule(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')

    # A period squeezed between a digit and an uppercase letter.
    NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = r'(?<=\d)\.(?=[A-Z])'
    NoSpaceBetweenSentencesDigitRule = Rule(
        NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')

    URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']

    # A newline preceded by whitespace and followed by a lowercase
    # letter or an opening parenthesis.
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = r'(?<=\s)\n(?=([a-z]|\())'

    # NOTE(review): the trailing "'" inside the lookahead is how this
    # pattern is remembered from upstream; confirm it is intentional.
    NewLineFollowedByBulletRule = Rule(r"\n(?=•')", "\r")

    QuotationsFirstRule = Rule(r"''", '"')
    QuotationsSecondRule = Rule(r'``', '"')


class HTML(object):
    """Rules that strip HTML tags, both literal and entity-escaped.

    NOTE(review): the class header and the start of ``HTMLTagRule`` were
    missing from SOURCE; the name ``HTML`` and the head of the pattern
    are reconstructed - verify against upstream.
    """

    # A literal opening/closing/self-closing tag with optional attributes.
    HTMLTagRule = Rule(
        r"<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>",
        '')

    # An entity-escaped tag ("&lt;...gt;").
    # NOTE(review): SOURCE showed a literal '<' here, presumably a decoded
    # "&lt;"; restored to the escaped form - verify.
    # Rubular: http://rubular.com/r/XZVqMPJhea
    EscapedHTMLTagRule = Rule(r'&lt;\/?[^gt;]*gt;', '')

    All = [HTMLTagRule, EscapedHTMLTagRule]


class PDF(object):
    """Rules for newlines that fall in the middle of a sentence."""

    # A newline preceded by "<non-newline char><whitespace>" and followed
    # by a non-whitespace character is removed.
    # Rubular: http://rubular.com/r/UZAVcwqck8
    NewLineInMiddleOfSentenceRule = Rule(r'(?<=[^\n]\s)\n(?=\S)', '')

    # A newline directly followed by a lowercase letter becomes a space.
    # Rubular: http://rubular.com/r/eaNwGavmdo
    NewLineInMiddleOfSentenceNoSpacesRule = Rule(r"\n(?=[a-z])", ' ')