"""Benchmark the pySBD sentence segmenter against GENIA raw-text files.

Every ``*.txt`` file found under the corpus directory is read, split on
newlines to obtain the expected sentences (the files are assumed to hold
one gold sentence per line), and compared with the segmenter's output.
Files whose segmentation differs are collected and written to a report.
"""
import sys
from itertools import zip_longest
from pathlib import Path

import pysbd

from benchmarks.benchmark_sbd_tools import (
    blingfire_tokenize,
    nltk_tokenize,
    pysbd_tokenize,
    spacy_tokenize,
    spacy_dep_tokenize,
    stanza_tokenize,
    syntok_tokenize,
)

# Shared segmenter instance used by the corpus run.
segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)


def run_full_genia_corpus(genia_raw_dir):
    """Segment every ``*.txt`` file under *genia_raw_dir* and report mismatches.

    Args:
        genia_raw_dir: Directory searched recursively for ``*.txt`` files.

    Returns:
        List of ``Path`` objects for the files whose segmentation did not
        equal the newline-split content of the file.
    """
    # Sorted so that the progress indices and the failed list are
    # reproducible; glob order is otherwise filesystem-dependent.
    txtfiles = sorted(Path(genia_raw_dir).glob("**/*.txt"))
    failed = []
    passed = 0
    for ind, txtfile in enumerate(txtfiles, start=1):
        print(f'Processing {ind}: {txtfile}')
        with open(txtfile, encoding="utf-8") as f:
            geniatext = f.read().strip()
        expected = geniatext.split('\n')
        segments = [s.strip() for s in segmenter.segment(geniatext)]
        # segments = nltk_tokenize(geniatext)
        # Plain comparison rather than assert/except: asserts are removed
        # under ``python -O``, which would count every file as passed.
        if segments == expected:
            passed += 1
        else:
            print("Failed")
            failed.append(txtfile)
    print(f'Total Files {len(txtfiles)} | Passed: {passed} | Failed: {len(failed)}')
    return failed


def to_file(failed, outputpath):
    """Write each entry of *failed* to *outputpath*, one per line.

    Args:
        failed: Iterable of paths (or any objects with a useful ``str``).
        outputpath: Destination file; overwritten if it already exists.
    """
    with open(outputpath, 'w', encoding="utf-8") as f:
        f.writelines(f'{eachpath}\n' for eachpath in failed)


def genia_failed_cases_inspector(filepath):
    """Print every position where the tokenizer output differs from the file.

    The file is split on newlines to get the expected sentences, then
    compared pairwise with the output of ``nltk_tokenize``. Each mismatch
    is printed as ``<expected repr> >>>>>>> <segment repr>``.

    Args:
        filepath: Path of a single text file to inspect.
    """
    # /Users/nipunsadvilkar/projects/Personal/genia-dependency-trees/raw/future_use/7665588.txt
    with open(filepath, encoding="utf-8") as f:
        geniatext = f.read().strip()
    expected = geniatext.split('\n')
    # segments = segmenter.segment(geniatext)
    segments = nltk_tokenize(geniatext)
    # print(len(segments), len(expected))
    # zip_longest pads whichever side is shorter with "", so surplus
    # segments *and* surplus expected sentences are both reported (plain
    # zip would silently drop trailing expected lines).
    for seg, exp in zip_longest(segments, expected, fillvalue=""):
        if seg != exp:
            print(f'{repr(exp)} >>>>>>> {repr(seg)}')


if __name__ == "__main__":
    # Corpus location is hard-coded to the original author's checkout.
    genia_raw_dir = "/Users/nipunsadvilkar/projects/Personal/genia-dependency-trees/raw/"
    failed_files = run_full_genia_corpus(genia_raw_dir)
    to_file(failed_files, 'benchmarks/pysbd_on_genia_failed_new.txt')
    # genia_failed_cases_inspector(sys.argv[1])