#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re

import pysbd
class Rule(object):
    """A single regex substitution: a pattern and the text that replaces it.

    Parameters
    ----------
    pattern : str
        regular expression to search for
    replacement : str
        text substituted wherever ``pattern`` matches
    """

    def __init__(self, pattern, replacement):
        self.pattern = pattern
        self.replacement = replacement

    def __repr__(self):  # pragma: no cover
        # Same rendered form as always; built via a named template for clarity.
        template = '<{} pattern="{}" and replacement="{}">'
        return template.format(
            type(self).__name__, self.pattern, self.replacement)
class Text(str):
    """Extending str functionality to apply regex rules

    https://stackoverflow.com/questions/4698493/can-i-add-custom-methods-attributes-to-built-in-python-types

    Parameters
    ----------
    str : str
        string content

    Returns
    -------
    str
        input as it is if rule pattern doesnt match
        else replacing found pattern with replacement chars
    """

    def apply(self, *rules):
        """Apply each rule's regex substitution, in order.

        Parameters
        ----------
        *rules : Rule
            objects exposing ``pattern`` and ``replacement`` attributes

        Returns
        -------
        Text
            the substituted text; always a ``Text`` so further ``apply``
            calls can be chained
        """
        text = self
        for rule in rules:
            # re.sub returns a plain str; re-wrap it so the result keeps the
            # Text API. (The original rebound `self` to that str, which broke
            # chained .apply() calls on the return value.)
            text = Text(re.sub(rule.pattern, rule.replacement, text))
        return text
class TextSpan(object):
    """Sentence text together with its character offsets in the original text."""

    def __init__(self, sent, start, end):
        """
        Sentence text and its start & end character offsets within original text

        Parameters
        ----------
        sent : str
            Sentence text
        start : int
            start character offset of a sentence in original text
        end : int
            end character offset of a sentence in original text
        """
        self.sent = sent
        self.start = start
        self.end = end

    def __repr__(self):  # pragma: no cover
        return "{0}(sent={1}, start={2}, end={3})".format(
            self.__class__.__name__, repr(self.sent), self.start, self.end)

    def __eq__(self, other):
        # Return NotImplemented (not an implicit None) for foreign types so
        # Python falls back to the reflected comparison; the original
        # returned None, making `(span == "x") is False` evaluate to False.
        if not isinstance(other, TextSpan):
            return NotImplemented
        return (self.sent == other.sent
                and self.start == other.start
                and self.end == other.end)

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None (unhashable); hash on
        # the same fields __eq__ compares so spans can live in sets/dicts.
        return hash((self.sent, self.start, self.end))
class PySBDFactory(object):
    """pysbd as a spacy component through entrypoints

    Marks each token's ``is_sent_start`` according to pysbd's sentence
    boundaries so spaCy's sentence iteration follows pysbd segmentation.
    """

    def __init__(self, nlp, language='en'):
        """
        Parameters
        ----------
        nlp : spacy Language
            pipeline object (stored for the spaCy component interface)
        language : str
            pysbd language code, default ``'en'``
        """
        self.nlp = nlp
        # clean=False keeps the original text intact so character offsets
        # line up with the Doc; char_span=True makes segment() return spans
        # carrying those offsets.
        self.seg = pysbd.Segmenter(language=language, clean=False,
                                   char_span=True)

    def __call__(self, doc):
        """Set sentence starts on ``doc`` from pysbd segmentation and return it."""
        sents_char_spans = self.seg.segment(doc.text_with_ws)
        # Set membership is O(1) per token; the original scanned a list for
        # every token (O(tokens * sentences)).
        start_char_offsets = {span.start for span in sents_char_spans}
        for token in doc:
            # A token begins a sentence iff its character offset matches a
            # pysbd sentence-start offset.
            token.is_sent_start = token.idx in start_char_offsets
        return doc
|