diff --git a/tests/test_splitter.py b/tests/test_splitter.py new file mode 100644 index 0000000..143ab43 --- /dev/null +++ b/tests/test_splitter.py @@ -0,0 +1,26 @@ +import pytest + +from dom_tokenizers.pre_tokenizers.splitter import TextSplitter + + +@pytest.mark.parametrize( + ("text,expect_tokens"), + (("That\u2019s all we know.", + ["That's", "all", "we", "know"]), + ("Page=Login&Action=Login\';\n\t\t\treturn", + ["Page", "Login", "Action", "Login", "return"]), + ("/_next/static/css/99762953f4d03581.css", + ["next", "static", "css", "[LONG]", "hex", "digits", "css"]), + ("http://www1.com.com/?tm=1&subid4=1714127069.0292280000&KW1=News%" + "20Media%20Monitoring%20Tools&KW2=News%20Lead%20Distribution%20Pl" + "atform&KW3=Newsletters&searchbox=0&domainname=0&backfill=0", + ["http", "www1", "com", "com", "tm", "1", "subid4", "[LONG]", + "digits", "[LONG]", "digits", "KW1", "News", "Media", "Monitoring", + "Tools", "KW2", "News", "Lead", "Distribution", "Platform", "KW3", + "Newsletters", "searchbox", "0", "domainname", "0", "backfill", + "0"]), + )) +def test_regressions(text, expect_tokens): + """Check that things we improve stay improved. + """ + assert list(TextSplitter().split(text)) == expect_tokens