55 lines
2.1 KiB
Python
55 lines
2.1 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
from os import sendfile
|
||
|
import pysbd
|
||
|
import random
|
||
|
import pyconll
|
||
|
|
||
|
# random.seed(25)
|
||
|
|
||
|
def universal_gold_text_sentences(conllu_content):
    """Collect the gold sentence texts from parsed CoNLL-U content.

    Args:
        conllu_content: Iterable of sentence objects that expose a ``text``
            attribute (in this script, the value returned by
            ``pyconll.load_from_file``).

    Returns:
        A ``(text, sentences)`` tuple. ``sentences`` is the list of each
        sentence's text, in input order, with any trailing double-quote
        characters removed; ``text`` is those sentences joined by newlines.
    """
    # Only trailing '"' characters are stripped; leading quotes are kept.
    expected_sents = [sent.text.rstrip('"') for sent in conllu_content]
    # sentences_random_order
    # random.shuffle(expected_sents)
    return "\n".join(expected_sents), expected_sents
|
||
|
|
||
|
if __name__ == "__main__":
    # Ad-hoc check: take the gold sentences of a UD treebank, run pysbd over
    # their newline-joined text, and print how each produced segment lines up
    # with the gold sentence at the same position.
    conllu_content = pyconll.load_from_file('/Users/nipunsadvilkar/projects/Personal/UD_Marathi-UFAL/mr_ufal-ud-dev.conllu')
    # conllu_content = pyconll.load_from_file('/Users/nipunsadvilkar/projects/Personal/UD_English-EWT/en_ewt-ud-dev.conllu')
    # conllu_content = pyconll.load_from_file('/Users/nipunsadvilkar/projects/Personal/UD_Spanish-GSD/es_gsd-ud-dev.conllu')
    text, expected = universal_gold_text_sentences(conllu_content)
    # text = "\n".join([l.strip().strip('"') for l in open('en-ewt.txt').readlines()])
    # Re-derive the gold list from `text` so it stays in sync when `text` is
    # taken from the file-based alternative above instead of the treebank.
    expected = text.split('\n')
    # NOTE(review): the Marathi treebank is loaded above while the active
    # segmenter below is English ("en"); the "mr" variant is commented out.
    # Confirm whether this mismatch is intentional.
    # segmenter = pysbd.Segmenter(language="mr", clean=False, char_span=False)
    segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)
    # segmenter = pysbd.Segmenter(language="es", clean=False, char_span=False)
    segments = segmenter.segment(text)
    segments = [s.strip() for s in segments]
    # 38
    # print(text)
    # print(expected[:15])
    # print(segments[:15])
    # while len(expected) < len(segments):
    #     expected.append("")
    # while len(segments) < len(expected):
    #     segments.append("")

    # zip() stops at the shorter list, so surplus segments or gold sentences
    # are not compared; the counts printed after the loop reveal a mismatch.
    for seg, exp in zip(segments, expected):
        if seg == exp:
            # Match: show only the first 10 characters of each side.
            print(f'{repr(exp[:10])} === {repr(seg[:10])}')
        else:
            # Mismatch: show the full gold sentence and the full segment.
            print(f'{repr(exp)} >>> {repr(seg)}')
            # break
    print(len(segments), len(expected))
    # assert expected == segments
    # print(text[:200])
    # print(expected[:15])
    # print(segments[:15])
    # print(text[:100])
    # with open('en-ewt-segments.txt', 'w') as f:
    #     for ind, sent in enumerate(segments):
    #         f.write(sent + '\n')
    #         if ind == 10:
    #             break
|