30 lines
749 B
Python
30 lines
749 B
Python
import pytest
|
|
|
|
# Basic tokenization cases for the ``dsb`` tokenizer. Each entry is a
# ``(text, expected_tokens)`` pair: the raw input sentence and the token
# texts the tokenizer is expected to produce for it, with punctuation split
# off as separate tokens.
DSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
        [
            "Ale",
            "eksistěrujo",
            "mimo",
            "togo",
            "ceła",
            "kopica",
            "narěcow",
            ",",
            "ako",
            "na",
            "pśikład",
            "slěpjańska",
            ".",
        ],
    ),
]
|
|
|
|
|
|
@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
    """Check that tokenizing ``text`` yields exactly ``expected_tokens``.

    Runs once per ``(text, expected_tokens)`` pair in
    ``DSB_BASIC_TOKENIZATION_TESTS``.

    ``dsb_tokenizer`` is not defined in this file; presumably it is a pytest
    fixture provided elsewhere (e.g. a conftest) -- confirm. It is called
    with the raw text and must return an iterable of token objects exposing
    ``text`` and ``is_space`` attributes.
    """
    tokens = dsb_tokenizer(text)
    # Tokens flagged as whitespace are dropped so that only the visible
    # token texts take part in the comparison.
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
|