65 lines
2.1 KiB
Python
65 lines
2.1 KiB
Python
import sys
|
|
from pathlib import Path
|
|
|
|
import pysbd
|
|
from benchmarks.benchmark_sbd_tools import (
|
|
blingfire_tokenize,
|
|
nltk_tokenize,
|
|
pysbd_tokenize,
|
|
spacy_tokenize,
|
|
spacy_dep_tokenize,
|
|
stanza_tokenize,
|
|
syntok_tokenize)
|
|
|
|
segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)
|
|
|
|
def run_full_genia_corpus(genia_raw_dir):
|
|
txtfiles = Path(genia_raw_dir).glob("**/*.txt")
|
|
txtfiles = list(txtfiles)
|
|
failed = []
|
|
passed = 0
|
|
for ind, txtfile in enumerate(txtfiles, start=1):
|
|
print(f'Processing {ind}: {txtfile}')
|
|
with open(txtfile) as f:
|
|
geniatext = f.read().strip()
|
|
expected = geniatext.split('\n')
|
|
segments = [s.strip() for s in segmenter.segment(geniatext)]
|
|
# segments = nltk_tokenize(geniatext)
|
|
try:
|
|
assert segments == expected
|
|
passed += 1
|
|
except AssertionError:
|
|
print("Failed")
|
|
failed.append(txtfile)
|
|
print(f'Total Files {len(txtfiles)} | Passed: {passed} | Failed: {len(failed)}')
|
|
return failed
|
|
|
|
def to_file(failed, outputpath):
|
|
with open(outputpath, 'w') as f:
|
|
for eachpath in failed:
|
|
f.write(f'{eachpath}\n')
|
|
|
|
def genia_failed_cases_inspector(filepath):
|
|
# /Users/nipunsadvilkar/projects/Personal/genia-dependency-trees/raw/future_use/7665588.txt
|
|
with open(filepath) as f:
|
|
geniatext = f.read().strip()
|
|
|
|
expected = geniatext.split('\n')
|
|
# segments = segmenter.segment(geniatext)
|
|
segments = nltk_tokenize(geniatext)
|
|
|
|
while len(expected) < len(segments):
|
|
expected.append("")
|
|
# print(len(segments), len(expected))
|
|
|
|
for seg, exp in zip(segments, expected):
|
|
if seg != exp:
|
|
print(f'{repr(exp)} >>>>>>> {repr(seg)}')
|
|
|
|
|
|
if __name__ == "__main__":
|
|
genia_raw_dir = "/Users/nipunsadvilkar/projects/Personal/genia-dependency-trees/raw/"
|
|
failed_files = run_full_genia_corpus(genia_raw_dir)
|
|
to_file(failed_files, 'benchmarks/pysbd_on_genia_failed_new.txt')
|
|
# genia_failed_cases_inspector(sys.argv[1])
|