diff --git a/README.md b/README.md index 8a28f3b..2b7d7f8 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,68 @@ DOM-aware tokenization for Hugging Face language models. +## TL;DR + +Input: + +```html + + + + Hello world + ... +``` + +Output: + +![<](https://img.shields.io/badge/%3C-CCBFEE?style=flat-square)![html](https://img.shields.io/badge/html-BEEDC6?style=flat-square)![>](https://img.shields.io/badge/%3E-F6D9AB?style=flat-square)![<](https://img.shields.io/badge/%3C-F4AEB1?style=flat-square)![head](https://img.shields.io/badge/head-A4DCF3?style=flat-square)![>](https://img.shields.io/badge/%3E-CCBFEE?style=flat-square)![<](https://img.shields.io/badge/%3C-BEEDC6?style=flat-square)![meta](https://img.shields.io/badge/meta-F6D9AB?style=flat-square)![_](https://img.shields.io/badge/__-F4AEB1?style=flat-square)![http](https://img.shields.io/badge/http-A4DCF3?style=flat-square)![equiv](https://img.shields.io/badge/equiv-CCBFEE?style=flat-square)![=](https://img.shields.io/badge/%3D-BEEDC6?style=flat-square)![content](https://img.shields.io/badge/content-F6D9AB?style=flat-square)![type](https://img.shields.io/badge/type-F4AEB1?style=flat-square)![_](https://img.shields.io/badge/__-A4DCF3?style=flat-square)![content](https://img.shields.io/badge/content-CCBFEE?style=flat-square)![=](https://img.shields.io/badge/%3D-BEEDC6?style=flat-square)![text](https://img.shields.io/badge/text-F6D9AB?style=flat-square)![html](https://img.shields.io/badge/html-F4AEB1?style=flat-square)![charset](https://img.shields.io/badge/charset-A4DCF3?style=flat-square)![utf](https://img.shields.io/badge/utf-CCBFEE?style=flat-square)![8](https://img.shields.io/badge/8-BEEDC6?style=flat-square)![>](https://img.shields.io/badge/%3E-F6D9AB?style=flat-square)![<](https://img.shields.io/badge/%3C-F4AEB1?style=flat-square)![meta](https://img.shields.io/badge/meta-A4DCF3?style=flat-square)![_](https://img.shields.io/badge/__-CCBFEE?style=flat-square)![name](https://img.shields.io/badge/name-BEEDC6
?style=flat-square)![=](https://img.shields.io/badge/%3D-F6D9AB?style=flat-square)![viewport](https://img.shields.io/badge/viewport-F4AEB1?style=flat-square)![_](https://img.shields.io/badge/__-A4DCF3?style=flat-square)![content](https://img.shields.io/badge/content-CCBFEE?style=flat-square)![=](https://img.shields.io/badge/%3D-BEEDC6?style=flat-square)![width](https://img.shields.io/badge/width-F6D9AB?style=flat-square)![device](https://img.shields.io/badge/device-F4AEB1?style=flat-square)![width](https://img.shields.io/badge/width-A4DCF3?style=flat-square)![>](https://img.shields.io/badge/%3E-CCBFEE?style=flat-square)![<](https://img.shields.io/badge/%3C-BEEDC6?style=flat-square)![title](https://img.shields.io/badge/title-F6D9AB?style=flat-square)![>](https://img.shields.io/badge/%3E-F4AEB1?style=flat-square)![hello](https://img.shields.io/badge/hello-A4DCF3?style=flat-square)![world](https://img.shields.io/badge/world-CCBFEE?style=flat-square)![](https://img.shields.io/badge/%3E-F4AEB1?style=flat-square)![<](https://img.shields.io/badge/%3C-A4DCF3?style=flat-square)![script](https://img.shields.io/badge/script-CCBFEE?style=flat-square)![>](https://img.shields.io/badge/%3E-BEEDC6?style=flat-square)![document](https://img.shields.io/badge/document-F6D9AB?style=flat-square)![getElementById](https://img.shields.io/badge/getElementById-F4AEB1?style=flat-square)![demo](https://img.shields.io/badge/demo-A4DCF3?style=flat-square)![innerHTML](https://img.shields.io/badge/innerHTML-CCBFEE?style=flat-square)![Hello](https://img.shields.io/badge/Hello-BEEDC6?style=flat-square)![JavaScript](https://img.shields.io/badge/JavaScript-F6D9AB?style=flat-square)![](https://img.shields.io/badge/%3E-CCBFEE?style=flat-square)![...](https://img.shields.io/badge/...-FFFFFF?style=flat-square) + + +## Why? 
+ +Natural language tokeniz(er,ation scheme)s are designed so +as to +a) group particles of meaning together +b) (omit/discard/hide) unimportant details +such that models consuming sequences of token IDs +are presented with what they need in a way they can most +easily (process/derive meaning from) +[in theory, models could consume streams of utf-8, but +the model will have to learn everything the tokenizer does +so consuming resources (layers/neurons/parameters) +and (potentially vastly) extending training time.] + +for example, tokenizers aimed at languages that delimit with +whitespace generally have features to (omit/discard/embed/hide) +whitespace in their output so the model/consumer does not need +to care about it. + +this shiz aims to do a similar thing but for HTML: +whitespace is discarded, +tag names, attribute names and attribute values are tokenized +along with the textual content of the document, + +and special tokens are inserted to give context, so e.g. +start and end tags are wrapped in `<` and `>`, +attribute names are preceded by `_` +and attribute values preceded by `=`. + +## Limitations + +tokenizers are usually able to operate in either direction: +both *encoding* natural language into sequences of token IDs +for the model's input, +and *decoding* sequences of token IDs generated by the model +into natural language text. + +generation isn't a goal for me, for now at least: I'm interested +in extracting meaning, so this +tokenizer will discard some of its input in order to better distil +the meaning of what it's looking at. 
"""Render the README "TL;DR" badge strip: one shields.io badge per token.

Each token of the sample HTML document is printed as a markdown image
pointing at a shields.io static badge, cycling through a pastel palette.

NOTE(review): this file is named ``tokenize.py``, which shadows the
stdlib ``tokenize`` module for anything importing from this directory —
consider renaming it (e.g. ``badge_demo.py``).
"""

from urllib.parse import quote

# Rotating pastel palette; the trailing "..." token is rendered white instead.
COLORS = [0xCCBFEE, 0xBEEDC6, 0xF6D9AB, 0xF4AEB1, 0xA4DCF3]

# Hand-tokenized stream for the sample document shown in the README.
# "_" precedes an attribute name, "=" precedes an attribute value, and
# "" is presumably a stripped end-tag marker (`</`...) — TODO confirm.
TOKENS = [
    "<", "html", ">",
    "<", "head", ">",

    "<", "meta",
    "_", "http", "equiv",
    "=", "content", "type",
    "_", "content",
    "=", "text", "html", "charset", "utf", "8", ">",

    "<", "meta",
    "_", "name",
    "=", "viewport",
    "_", "content",
    "=", "width", "device", "width", ">",

    "<", "title", ">",
    "hello", "world",
    "",

    "<", "script", ">",
    "document", "getElementById", "demo", "innerHTML", "Hello",
    "JavaScript",
    "",

    "...",
]

URL = "https://img.shields.io/badge/"  # static badge: <message>-<color>
EXTRA = "?style=flat-square"
SEP = ""  # separator between badges; " " or &nbsp; also work


def badge(token: str, color: int) -> str:
    """Return the markdown image for one shields.io badge.

    shields.io's static-badge path treats a literal ``_`` as a space and
    ``-`` as the label/message separator, so both must be doubled before
    percent-encoding.  (This generalizes the original special case that
    only handled the exact token ``"_"``; output is identical for every
    token currently in TOKENS.)
    """
    escaped = token.replace("_", "__").replace("-", "--")
    return f"![{token}]({URL}{quote(escaped)}-{color:06X}{EXTRA})"


def main() -> None:
    """Print the whole badge strip for TOKENS on a single line."""
    for i, token in enumerate(TOKENS):
        # The ellipsis sentinel gets a white badge; everything else cycles.
        color = 0xFFFFFF if token == "..." else COLORS[i % len(COLORS)]
        print(badge(token, color), end=SEP)
    print()


if __name__ == "__main__":
    main()