ai-content-maker/.venv/Lib/site-packages/benchmarks/genia_benchmark.py

101 lines
2.6 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
import blingfire
import nltk
import pysbd
import spacy
import stanza
from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter
from pathlib import Path
pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)
nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
#stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')
syntok_tokenizer = Tokenizer()
def blingfire_tokenize(text):
return blingfire.text_to_sentences(text).split('\n')
def nltk_tokenize(text):
return nltk.sent_tokenize(text)
def pysbd_tokenize(text):
segments = pysbd_segmenter.segment(text)
return [s.strip() for s in segments]
def spacy_tokenize(text):
return [sent.text.strip("\n") for sent in nlp(text).sents]
def spacy_dep_tokenize(text):
return [sent.text.strip("\n") for sent in nlp_dep(text).sents]
def stanza_tokenize(text):
return [e.text for e in stanza_nlp(text).sentences]
def make_sentences(segmented_tokens):
for sentence in segmented_tokens:
yield "".join(str(token) for token in sentence).strip()
def syntok_tokenize(text):
tokens = syntok_tokenizer.split(text)
result = syntok_segmenter.split(iter(tokens))
segments = [sent for sent in make_sentences(result)]
return segments
def load_genia_corpus(genia_raw_dir):
txtfiles = Path(genia_raw_dir).glob("**/*.txt")
txtfiles = list(txtfiles)
all_docs = []
for ind, txtfile in enumerate(txtfiles, start=1):
with open(txtfile) as f:
geniatext = f.read().strip()
expected = geniatext.split('\n')
all_docs.append((geniatext, expected))
return all_docs
def benchmark(docs, tokenize_func):
correct = 0
for (text, expected) in docs:
segments = tokenize_func(text)
if segments == expected:
correct +=1
return correct
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
'--genia',
help="Path to the directory containing genia data."
)
args = parser.parse_args()
libraries = (
blingfire_tokenize,
nltk_tokenize,
pysbd_tokenize,
spacy_tokenize,
spacy_dep_tokenize,
stanza_tokenize,
syntok_tokenize
)
docs = load_genia_corpus(args.genia)
total = len(docs)
for tokenize_func in libraries:
correct = benchmark(docs, tokenize_func)
percent_score = correct/total * 100
print()
print(tokenize_func.__name__)
print('GENIA abstract acc: {:0.2f}%'.format(percent_score))