# -*- coding: utf-8 -*- import re from pysbd.languages import Language from pysbd.processor import Processor from pysbd.cleaner import Cleaner from pysbd.utils import TextSpan class Segmenter(object): def __init__(self, language="en", clean=False, doc_type=None, char_span=False): """Segments a text into an list of sentences with or withour character offsets from original text Parameters ---------- language : str, required specify a language use its two character ISO 639-1 code, by default "en" clean : bool, optional cleans original text, by default False doc_type : [type], optional Normal text or OCRed text, by default None set to `pdf` for OCRed text char_span : bool, optional Get start & end character offsets of each sentences within original text, by default False """ self.language = language self.language_module = Language.get_language_code(language) self.clean = clean self.doc_type = doc_type self.char_span = char_span if self.clean and self.char_span: raise ValueError("char_span must be False if clean is True. " "Since `clean=True` will modify original text.") # when doctype is pdf then force user to clean the text # char_span func wont be provided with pdf doctype also elif self.doc_type == 'pdf' and not self.clean: raise ValueError("`doc_type='pdf'` should have `clean=True` & " "`char_span` should be False since original" "text will be modified.") def cleaner(self, text): if hasattr(self.language_module, "Cleaner"): return self.language_module.Cleaner(text, self.language_module, doc_type=self.doc_type) else: return Cleaner(text, self.language_module, doc_type=self.doc_type) def processor(self, text): if hasattr(self.language_module, "Processor"): return self.language_module.Processor(text, self.language_module, char_span=self.char_span) else: return Processor(text, self.language_module, char_span=self.char_span) def sentences_with_char_spans(self, sentences): # since SENTENCE_BOUNDARY_REGEX doesnt account # for trailing whitespaces \s* & is used as suffix # to keep non-destructive text after segments joins sent_spans = [] prior_end_char_idx = 0 for sent in sentences: for match in re.finditer('{0}\s*'.format(re.escape(sent)), self.original_text): match_str = match.group() match_start_idx, match_end_idx = match.span() if match_end_idx > prior_end_char_idx: # making sure if curren sentence and its span # is either first sentence along with its char spans # or current sent spans adjacent to prior sentence spans sent_spans.append( TextSpan(match_str, match_start_idx, match_end_idx)) prior_end_char_idx = match_end_idx break return sent_spans def segment(self, text): self.original_text = text if not text: return [] if self.clean or self.doc_type == 'pdf': text = self.cleaner(text).clean() postprocessed_sents = self.processor(text).process() sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents) if self.char_span: return sentence_w_char_spans elif self.clean: # clean and destructed sentences return postprocessed_sents else: # nondestructive with whitespaces return [textspan.sent for textspan in sentence_w_char_spans]