diff --git a/src/dom_tokenizers/pre_tokenizers/base64.py b/src/dom_tokenizers/pre_tokenizers/base64.py
new file mode 100644
index 0000000..2e3102f
--- /dev/null
+++ b/src/dom_tokenizers/pre_tokenizers/base64.py
@@ -0,0 +1,84 @@
+from typing import Optional
+
+
+class B64SkewCalculator:
+    """Measure how far a string's character mix is from base64's.
+
+    The alphabet is partitioned into bins (by default uppercase
+    letters, lowercase letters, digits and the extra symbols).  Each
+    bin's expected share of a text is its size divided by the size
+    of the whole alphabet, i.e. the share it would have if every
+    character of the alphabet were equally likely.
+    """
+
+    def __init__(self, ranges=("AZ", "az", "09"), extras="/+"):
+        """Build the character-to-bin lookup and per-bin expectations.
+
+        `ranges` is an iterable of two-character strings, each giving
+        the inclusive first and last character of one bin.  `extras`
+        is a string of additional characters that together form one
+        more bin; pass a falsy value to omit that bin.
+        """
+        bins = [
+            "".join(map(chr, range(ord(first), ord(last) + 1)))
+            for first, last in ranges
+        ]
+        if extras:
+            bins.append(extras)
+
+        self._num_bins = len(bins)
+        self._char_bins = {
+            c: bin_index
+            for bin_index, bin_chars in enumerate(bins)
+            for c in bin_chars
+        }
+
+        # Expected fraction of characters falling into each bin if
+        # every character of the alphabet is equally likely.
+        alphabet_size = len(self._char_bins)
+        self._expectations = [len(b) / alphabet_size for b in bins]
+
+    def __call__(self, text: str) -> Optional[float]:
+        """Return a value indicating how different `text` appears
+        compared with base64-encoded random data, with zero being
+        "this looks exactly like base64-encoded random data", and
+        None being "it's not possible to decide".
+
+        None is returned for empty input and for input containing
+        any character outside the configured alphabet.
+        """
+        if not text:
+            return None
+        counts = [0] * self._num_bins
+        try:
+            for c in text:
+                counts[self._char_bins[c]] += 1
+        except KeyError:
+            return None  # invalid character
+
+        # The skew is the largest absolute gap between the observed
+        # and the expected share of any single bin.
+        normalized = 1 / len(text)
+        return max(
+            abs(normalized * count - expectation)
+            for count, expectation in zip(counts, self._expectations)
+        )
+
+
+base64_skew = B64SkewCalculator(extras=None)
+
+
+def base64_probability(text: str) -> float:
+    """Return a value indicating how closely `text` resembles base64-
+    encoded random data, with 1 being "exactly like it" and 0 being
+    "100% not it".
+
+    Empty input, and input containing characters outside the
+    alphabet, score 0.
+    """
+    skew = base64_skew(text)
+    # Compare against None explicitly: a skew of exactly 0.0 is falsy
+    # but means a perfect match, which must score 1 rather than 0.
+    if skew is None:
+        return 0.0
+    return 1 - skew
diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py index bbfbef0..1af33ff 100644 --- a/src/dom_tokenizers/pre_tokenizers/splitter.py +++ b/src/dom_tokenizers/pre_tokenizers/splitter.py @@ -13,6 +13,7 @@ from unidecode import unidecode from ..internal import json +from .base64 import base64_probability logger = logging.getLogger(__name__) debug = logger.debug @@ -561,6 +562,10 @@ def _postprocess(self, tokens: Iterable[str]) -> Iterable[str]: yield "digits" continue + if len(token) > 4 and base64_probability(token) >= 0.92: + yield self.base64_token + continue + if len(token) <= self.MAXWORDLEN: yield token continue diff --git a/tests/resources/base64-misses/1655961866939.json b/tests/resources/base64-misses/1655961866939.json new file mode 100644 index 0000000..7c7ef8f --- /dev/null +++ b/tests/resources/base64-misses/1655961866939.json @@ -0,0 +1 @@ +{"text":"(function(){window._cf_chl_opt={cvId: '3',cZone: \"crozdesk.com\",cType: 'managed',cNounce: '90691',cRay: '87a1de5f6973dd79',cHash: 'f9d62fffa411e3c',cUPMDTk: \"\\/software\\/cookie-control?__cf_chl_tk=FPjx2tJ8PU8Y.AobQjVkcVg6coBd990lKwldS85vv14-1714085181-0.0.1.1-1621\",cFPWv: 'b',cTTimeMs: '1000',cMTimeMs: '375000',cTplV: 5,cTplB: 'cf',cK: \"visitor-time\",fa: \"\\/software\\/cookie-control?__cf_chl_f_tk=FPjx2tJ8PU8Y.AobQjVkcVg6coBd990lKwldS85vv14-1714085181-0.0.1.1-1621\",md: 
\"562HSg_FYcKoLibZf3oC_h256oIM.L58AFzt0jp1HKU-1714085181-1.1.1.1-i0bcAS9e5f_bkXs0wSeU2cgw9KSq9TNgaaCSVcYWlZTUxv6YoPBUbmVCJd4f1D4F.REFzmQ9q7ctiA6JTjMTNg8qUISocK32A4FEUyUTFHduoLVShyiw5TvBrf.dlwAZATg2EIFWKv6gBjjrxqPvjG.a8kA6BHIeB.C32cLVAhn59e899GYZReNqxIIthBFt05Wb7nXd42so33jopcKUpJ60Aj.OIuzMWo2oFr3tD1hXSUO9Y4xKC2lB1TYtss2Wm7gLMPhuvNpBb.DAht_hNWTatGuI_ltmul_EdAyEse.Slinrn3rOee2BX50FpTz44lDxQDVhjc1G9sg66Kfh7TUbqNcKKayPfrP.7hRmEhkVDiUrH4D.orGU5_cH5_67KV1B.1qRUYessfajnqaRgDDK5JoHk5oJItIFM92s_A.LZJfq0IKpwo6B66s9TVBITe7ssk9TDgeDmkhpbjXtzuF0__oV0tpLm4U2Rv3veQJzgyXYzOCLVH9wFA2LqgXx8iPGZk4mc_L0gH7uiS0HpxahWElsqTPIQS2YzobL.gUBrEpJp40LBECRsAQ1aw64g5CAcc44_qUyDJk_cvRBjnh1F.jTmhV8FOhoGU34h4UtbSr_bFZ3j3zkaueUh003K0iL9IGKOvLtdZFtK8GNF6xpP3SiOJ835c6Pivj0eaegPeiaMLDqxrOUPYV6U5vdsS7g40r7zluc6IJiyVbaO44l8vAt3QnVY_RMsN27iL__yHFjL2WMwMRHXYgJLEzttkJQplNghhkyHZ7tj3EGgA8SPT1WR1NoSN2bfE1Cj7eeQm7DvosXZRrxJ7JpyhA2VEvrDUtjCK0EHN8Fg1jzvsNiKp7x9O_umMS7qa0g1uzfVfFvOuBWtvLsOAprEAjcZw_SX7i1TKjg2xYxsK2P4fuY09PpHOoSEO.fRYeo7LxRj3Xhh0jBZQghNzJRkshOaLmiM0OiHg8I7t0WqW3zUaO1poLw09ObhdpwxAz0S3oqttbeqofG_htJNfxXkI412gHDBRMfM9wdVLNKMaeJqeJaUmLl_tui0ZTdmMfNHUXP.Fu04cJeNamdRzBa4iu6QgMwDcCn883ZMENOBYkCb2ZYqXZtkXsAigUs9OHhwT7pgNujoV5GmobW.DvDgv2JQtr0NVhcyH7JjwSf.ly_T5cAp1gLxlStbKEQXdlKADePz3Ntf6jItopV.MRtp3jxdZpXBZp1or7CJh5OvrxeQu943gPLBJtQbhBSMuut8ZAVjYVUc1iqnTjMZ.MDETe1Pzht6ytQzqQQfV34_mj4oTS2BXv1v6RIFdrGe0T5j0wEozuZao4URTB4uOCEbKok32PrUtKl0BOAzbUsvv7bdrDR8zXOrnUw8MBvafjmlEuGWR2v2QT.dFE.gVpczCMIDe5.VKHaL145EAQOPSejxa7JaRX7z39Frdya_Nnhki2h5qkOQNQFxtSH_HugPgzZqQCHYyZ1WNL0Uuw.1igq8mNwUlEHPwRq9Hf2QfQSCoiQN6ooVn64auw3HXFvRp.aauTX3w7ZGUxrHHcS92z2eQ\",mdrd: 
\"T35c6laXGLyrOtBecJOz9VpyXFNkueYalGKEiiXkjyw-1714085181-1.1.1.1-Q0PHtT_qGYN9ygMsBLWjAphWXKOqOLZFXtLCMUbcFEqb6ckSWt5Q7NBesfLtRzisiglTPu5W2oSGee.kjzOdYXcNydPaVBsIVFAOhD5Qn8Gfcv2zJ.y6nhZ3qycIUzHqsWAmeXpVmnNE0GjEAGsuawS0ZD1SxA_8ldavog1vhPMjvyLIE3dZA7SasLj8JTMtCgEr0TkJabcdW9oViIDegaRXENEWcckVmO.kY706w1J4Uy2i6XhKiGuF4DVHtyNm6VAy2.ewU.XOpxtadKJClRmKRgMkHQApACAnKKHFruckpKjDMadQ2N_zJyBpjZHteDVmLgEi0HiDy9HZ8do1MrTematwlgxdyBCJnSdyOu2CAAl_f1FfKlyj1tt0csBEGl8GStPjz3UubIkH6gCnVeDv5O8cf3ebTLDhQWzBr9OmbuA0ckSwpdtABeREtA0p3kVK4E_gVaS7jcDT._S_ZVRYYKHpbur2BtCSkbHvvgoZNPDp78khw7BowOpfvgrFfAODgcH8Bs7BN5eyShJtkyiWcBmZmc.QJRBphGwdQAebLzzTFdBkC7USxu1ZM.nQA3XCRIYNygTV9Fs0jswyKKy1xS5FDIpXoTY4RRPeLhJhLGgi8VvF6fyCktIpyrR7HcchNWzQUX9WiLTh0fRrlqX0lxsI6TuMQQ2I6DGAHHMIeql.kbNMrNsshob360f64e7F_7Eh6uXhzOYOgmA_oVarCxo43QPi_uJvc17SC7h2qtme4VHgn2W.k3ZBafNZihLdzp6gevr7I_1b95QppTFSZLFY1IfURQfA_zSLOc4N.a.7xxEWrMwSC_J6iXSL6AZQv9_6fTfiP7eIlbAHr8o4pK6GP1GdLGBO4Bwgv9LFNAFMuqrf_RTf.JRih.YbbY2CkzU1F4jsSor92aAZVRlyz_wPOlum91__PU7OdhTi0X.VPOQgMgR0twx3edxoAoP.K_qzb4k4RBfPsoN4L27_mI_JU6AMJ7Ya2QVVWv5xtsr8qkkHn5IKed458tIP9ApRhlhCf4x8Y4kCcp_dC0Mzw2LVGc6x249R7ip7RBGumIETUI1WXJltj9ejfoDpplrtFlWu.jcwqzt4545SwILjxxUGTnkT3dIs8T7TcG7BYU3nADo1W6kVx6I3zgf_Sp4V7xj1oDA4uXRhQIyMt0C9uJrPmTeuOk4fr.vqfQ1La4RN_6AQDYymTj66GzQKBx97KuVadCgU6m733IFFTZbLw6l5QCYPuyyew4uopsWoyP7iom2BZtpk1hTZewOLaNYdIk1pBLBt38jxGY3IDcBpe5THLLL.6vgRHlzCXYzZ_ECBUFwJU9hOx502LrI2LV7bT27i.bzaMb5HEKvni43w5ZD5H9t1dir4rU5Wm3GWTjbH3eD4Co9I44v3HvMl_w1xIZmsx0mhAvZ_abQGbGJIhg8EiQKi.fx0nbqh2BbPxsjzgI6jcvZHxpHrPvWFEkr5rHHNq.VVlaDNr7aYJm_L2Nvc73seCFBmJF03Qyso7eOcLemqq1uWF.HTWgHY5OS.RWW1Skdk74UxdDdVta_1McXswwfYX0YHQV_MSLxCPGUlU7ZBeKh2bgWXqH.An9X7z42J4OFw2gmM9eUVwEvP1rv2BWgZjnGLg6cG8.5g7WZxtNRC6wy8b2RE9u3uUdadxyauwjoHGB69O_owt2_yKJsIPrlT6Zc3kGRfiZEBe8.Y1BPSpVLImbo2V2DWPLhIMBOIY4zGEb9caiyDOZvyP0LnQ7XrAWC9b5NN.D3REnlrxKSioN1q8aB2p1GpS1EnB3INpcuvsmPSe4jbib8SAZyKoQ2fqONE.uZLr3qiEetNiVKagUPJVebntPihfbZunA9ANLH_OqqenkQUkIKqZBnny4o9raEIAIcBUr6GIFhGSJmpJtDvhPk4NnOLuth7zStx.LRCgs5
qAy_lXq4Fmr0GN23.5NsRMEOtY1418dO0X6ujtCQdvDE9AQEcsvnMz7bItlg_7bJ9mWbu.YfPyDFKB0mxLhkH0Ipxom0BXNIP0fgZIXlzKkJJlXUxEGGm5TEcPRLtmWgIVSE4y2xPbFmVz_pFkejLM520buqmwWNtEpt65oIHrn18dLw8Wft0MkNkv2IjtHkplrCo_fkwavWA7swJYEDb_wSM7kvETAaZ1g049G23dDAneN4uDM6NbH8XFrWFaeewqKdrPQ\",cRq: {ru: 'aHR0cHM6Ly9jcm96ZGVzay5jb20vc29mdHdhcmUvY29va2llLWNvbnRyb2w=',ra: 'TW96aWxsYS81LjAgKFgxMTsgTGludXggeDg2XzY0KSBBcHBsZVdlYktpdC81MzcuMzYgKEtIVE1MLCBsaWtlIEdlY2tvKSBDaHJvbWUvMTI0LjAuNjM2Ny42MCBTYWZhcmkvNTM3LjM2',rm: 'R0VU',d: 'xxlrWNIAoEN/qf8D3FzPx2jeQgMfm7eM/iqXTgRGd4ZTBn8UAM9GL3uRL8pPferEg2AvinHtgC3XwM7SzPiXyvK9pkde3NLgCgNK+Z0RkxMB00lp5+fN7GWU5kmI5kKg8YXhQFJIfACOqWcF73WeMI0rMNU58ZuLPe4l/z8X36g7ay7zCFnvpaT5l1R1tPN3ztwtNeVkntG6j7j5oMEX1AWH9HGuXYzFKRDZZnUgQsJTFLkR32O2zsvoGdAcl77gkHCjKTygy4QQIoFvVN5tzR0eEleeTS6PMMLwm1cRLlFmnXtg2LMBP2f20j5ZU3evTLWwgd1Ae5g+GngEppnGViEuroaBt2VN5jpjShVCFwSHs70h0iRNrh+QAGVhpLxf1JkQfVnisFT/adBTGIgIUseaHG4r7Pw0EPd7sxogrBqSDtGIQHsbyoyJz1hun6/6pAFw25mEPKk1E6V25+wx1fIoHq0Y+KDGtKYFHfBi2VMtgixdImxEkOu9n8w5ut3YTIN78hZsz+OMvhEDWmhChni8fMJs823OgRTA5yoAAyi3fiFH38LbH5a4Ffew0x8j0eXE7tgpR5wu59A5A5PkDQ==',t: 'MTcxNDA4NTE4MS4zNDYwMDA=',cT: Math.floor(Date.now() / 1000),m: 'UJ/hKTrF+QVzceRRYb4MjF7SDPkpYw3nqqAJkN9no3k=',i1: 'HZqFhSRh2u+ql7TKRab42Q==',i2: 'HDBQsd+unjYiNY0ahnd43Q==',zh: 'SBbNOloTyDCKW6YVLbijVjU5GKcySW1sxaNdzdUxmWo=',uh: 'nqnWoBf0oAOqKsinkrrsJhx68E95jW04Tm27bN1KxEg=',hh: '2QzrHTBmBTmEAXpuuzGhJIr4P4WtG4m+QOTQ/vWtF2M=',}};var cpo = document.createElement('script');cpo.src = '/cdn-cgi/challenge-platform/h/b/orchestrate/chl_page/v1?ray=87a1de5f6973dd79';window._cf_chl_opt.cOgUHash = location.hash === '' && location.href.indexOf('#') !== -1 ? '#' : location.hash;window._cf_chl_opt.cOgUQuery = location.search === '' && location.href.slice(0, location.href.length - window._cf_chl_opt.cOgUHash.length).indexOf('?') !== -1 ? '?' 
: location.search;if (window.history && window.history.replaceState) {var ogU = location.pathname + window._cf_chl_opt.cOgUQuery + window._cf_chl_opt.cOgUHash;history.replaceState(null, null, \"\\/software\\/cookie-control?__cf_chl_rt_tk=FPjx2tJ8PU8Y.AobQjVkcVg6coBd990lKwldS85vv14-1714085181-0.0.1.1-1621\" + window._cf_chl_opt.cOgUHash);cpo.onload = function() {history.replaceState(null, null, ogU);}}document.getElementsByTagName('head')[0].appendChild(cpo);}());"}
\ No newline at end of file
diff --git a/tests/test_base64_misses.py b/tests/test_base64_misses.py
new file mode 100644
index 0000000..3138cd0
--- /dev/null
+++ b/tests/test_base64_misses.py
@@ -0,0 +1,17 @@
+from dom_tokenizers.pre_tokenizers.splitter import TextSplitter
+
+from .util import load_resource, json
+
+
+def load_b64_miss(basename):
+    """Return the "text" value of one base64-misses JSON resource."""
+    raw = load_resource(f"base64-misses/{basename}.json")
+    return json.loads(raw)["text"]
+
+
+def test_1655961866939():
+    """The listed string must not be emitted as a token."""
+    splitter = TextSplitter()
+    text = load_b64_miss(1655961866939)
+    tokens = splitter.split(text)
+    assert "L0gH7uiS0HpxahWElsqTPIQS2YzobL" not in tokens
diff --git a/tests/test_splitter.py b/tests/test_splitter.py index e980fe5..0f9c3ed 100644 --- a/tests/test_splitter.py +++ b/tests/test_splitter.py @@ -232,7 +232,7 @@ def test_prefixed_hex(text, expect_tokens): ("src: url(//fonts.gstatic.com/s/roboto/v18/KFOmCnqEu92Fr1Mu4mxK" ".woff2) format('woff2');\\n unicode-range: U+0000-00FF, ", ["src", "url", "fonts", "gstatic", "com", "s", "roboto", "v18", - "KFOmCnqEu92Fr1Mu4mxK", "woff2", "format", "woff2", "unicode", + "[BASE64]", "woff2", "format", "woff2", "unicode", "range", "U", "0000", "00FF"]), )) def test_regressions(text, expect_tokens):