# -*- coding: utf-8 -*-
import re
from pysbd.utils import Text
from pysbd.clean.rules import PDF, HTML, CleanRules as cr


class Cleaner(object):
    """Run a fixed sequence of text clean-up passes over a string.

    Each pass rewrites ``self.text`` in place, mostly by applying rule
    objects from ``pysbd.clean.rules`` through ``Text(...).apply``.
    """

    def __init__(self, text, lang, doc_type=None):
        """Store the inputs for a later call to :meth:`clean`.

        Args:
            text: The string to clean.
            lang: Language object; stored on the instance but not read by
                any method of this class.
            doc_type: Optional document type. The value ``'pdf'`` selects
                the PDF line-break handling in :meth:`replace_newlines`.
        """
        self.text = text
        self.lang = lang
        self.doc_type = doc_type

    def clean(self):
        """Apply every clean-up pass in order and return the result.

        Returns:
            The cleaned text. A falsy ``self.text`` (``None`` or ``''``)
            is returned unchanged without running any pass.
        """
        if not self.text:
            return self.text
        self.remove_all_newlines()
        self.replace_double_newlines()
        self.replace_newlines()
        self.replace_escaped_newlines()
        self.text = Text(self.text).apply(*HTML.All)
        self.replace_punctuation_in_brackets()
        self.text = Text(self.text).apply(cr.InlineFormattingRule)
        self.clean_quotations()
        self.clean_table_of_contents()
        self.check_for_no_space_in_between_sentences()
        self.clean_consecutive_characters()
        return self.text

    def remove_all_newlines(self):
        """Run the mid-sentence and then the mid-word newline removal."""
        self.remove_newline_in_middle_of_sentence()
        self.remove_newline_in_middle_of_word()

    def remove_newline_in_middle_of_sentence(self):
        """Delete ``cr.NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX`` matches.

        The deletion is applied separately to each run of non-period
        characters, so a match can never span a ``.``.
        """
        def strip_newlines(match):
            return re.sub(cr.NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '',
                          match.group())

        self.text = re.sub(r'(?:[^\.])*', strip_newlines, self.text)

    def remove_newline_in_middle_of_word(self):
        """Apply ``cr.NewLineInMiddleOfWordRule`` to the text."""
        self.text = Text(self.text).apply(cr.NewLineInMiddleOfWordRule)

    def replace_double_newlines(self):
        """Apply the two double-newline rules, with-space variant first."""
        self.text = Text(self.text).apply(
            cr.DoubleNewLineWithSpaceRule,
            cr.DoubleNewLineRule)

    def remove_pdf_line_breaks(self):
        """Apply the bullet rule and the PDF-specific newline rules."""
        self.text = Text(self.text).apply(
            cr.NewLineFollowedByBulletRule,
            PDF.NewLineInMiddleOfSentenceRule,
            PDF.NewLineInMiddleOfSentenceNoSpacesRule)

    def replace_newlines(self):
        """Handle remaining newlines according to ``self.doc_type``.

        ``'pdf'`` documents go through :meth:`remove_pdf_line_breaks`;
        every other value gets the generic newline rules.
        """
        if self.doc_type == 'pdf':
            self.remove_pdf_line_breaks()
        else:
            self.text = Text(self.text).apply(
                cr.NewLineFollowedByPeriodRule,
                cr.ReplaceNewlineWithCarriageReturnRule)

    def replace_escaped_newlines(self):
        """Apply the escaped newline / carriage-return rules."""
        self.text = Text(self.text).apply(
            cr.EscapedNewLineRule,
            cr.EscapedCarriageReturnRule,
            cr.TypoEscapedNewLineRule,
            cr.TypoEscapedCarriageReturnRule)

    def replace_punctuation_in_brackets(self):
        """Replace every ``?`` inside ``[...]`` with the ``&ᓷ&`` marker.

        NOTE(review): the marker is presumably restored by a later stage
        outside this class -- confirm against the caller.
        """
        def replace_punct(match):
            # Literal replacement; a span without '?' comes back unchanged.
            return match.group().replace('?', '&ᓷ&')

        self.text = re.sub(r'\[(?:[^\]])*\]', replace_punct, self.text)

    def clean_quotations(self):
        """Turn backticks into ``'`` and apply the two quotation rules."""
        # Method added explicitly: pragmatic-segmenter applies this step
        # at a different location.
        self.text = re.sub('`', "'", self.text)
        self.text = Text(self.text).apply(
            cr.QuotationsFirstRule,
            cr.QuotationsSecondRule)

    def clean_table_of_contents(self):
        """Apply the table-of-contents and consecutive-character rules."""
        self.text = Text(self.text).apply(
            cr.TableOfContentsRule,
            cr.ConsecutivePeriodsRule,
            cr.ConsecutiveForwardSlashRule)

    def search_for_connected_sentences(self, word, txt, regex, rule):
        """Rewrite every occurrence of ``word`` in ``txt`` using ``rule``.

        Args:
            word: A single token taken from the text.
            txt: The full text in which ``word`` is replaced.
            regex: Pattern ``word`` must match for any rewrite to happen.
            rule: Rule applied to ``word`` to build its replacement.

        Returns:
            ``txt`` unchanged when ``word`` does not match ``regex`` or
            contains any of ``cr.URL_EMAIL_KEYWORDS``; otherwise ``txt``
            with every occurrence of ``word`` replaced.
        """
        if not re.search(regex, word):
            return txt
        if any(k in word for k in cr.URL_EMAIL_KEYWORDS):
            return txt
        new_word = Text(word).apply(rule)
        # Use a literal replace rather than re.sub: re.sub treats its
        # replacement argument as a template, so a backslash in new_word
        # would be processed as an escape and can raise re.error or change
        # the text.
        return txt.replace(word, new_word)

    def check_for_no_space_in_between_sentences(self):
        """Run both connected-sentence rewrites for each token.

        Tokens come from splitting the text on single spaces before any
        rewrite; each rewrite is applied to the whole current text.
        """
        words = self.text.split(' ')
        for word in words:
            self.text = self.search_for_connected_sentences(
                word, self.text,
                cr.NO_SPACE_BETWEEN_SENTENCES_REGEX,
                cr.NoSpaceBetweenSentencesRule)
            self.text = self.search_for_connected_sentences(
                word, self.text,
                cr.NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX,
                cr.NoSpaceBetweenSentencesDigitRule)

    def clean_consecutive_characters(self):
        """Apply the consecutive-period and consecutive-slash rules."""
        self.text = Text(self.text).apply(
            cr.ConsecutivePeriodsRule,
            cr.ConsecutiveForwardSlashRule)