diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py index 62b33aa..fcbe745 100644 --- a/src/dom_tokenizers/pre_tokenizers/splitter.py +++ b/src/dom_tokenizers/pre_tokenizers/splitter.py @@ -295,6 +295,12 @@ def _sub_js_escape(self, splits, cursor): curr = splits[cursor_limit] cursor_limit += 1 + if not curr: + with open("bad-fails.log", "a") as fp: + print(splits[max(cursor-10, 0):min(cursor+10, len(splits))], + file=fp) + curr = "[error]" # XXX what to do? + # Store what we want at `splits[cursor:cursor_limit]` in `result`. match curr[0]: case "'":