diff --git a/README.md b/README.md
index 8a28f3b..2b7d7f8 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,68 @@
DOM-aware tokenization for Hugging Face language models.
+## TL;DR
+
+Input:
+
+```html
+
+
+
+ Hello world
+ ...
+```
+
+Output:
+
+![<](https://img.shields.io/badge/%3C-CCBFEE?style=flat-square)![html](https://img.shields.io/badge/html-BEEDC6?style=flat-square)![>](https://img.shields.io/badge/%3E-F6D9AB?style=flat-square)![<](https://img.shields.io/badge/%3C-F4AEB1?style=flat-square)![head](https://img.shields.io/badge/head-A4DCF3?style=flat-square)![>](https://img.shields.io/badge/%3E-CCBFEE?style=flat-square)![<](https://img.shields.io/badge/%3C-BEEDC6?style=flat-square)![meta](https://img.shields.io/badge/meta-F6D9AB?style=flat-square)![_](https://img.shields.io/badge/__-F4AEB1?style=flat-square)![http](https://img.shields.io/badge/http-A4DCF3?style=flat-square)![equiv](https://img.shields.io/badge/equiv-CCBFEE?style=flat-square)![=](https://img.shields.io/badge/%3D-BEEDC6?style=flat-square)![content](https://img.shields.io/badge/content-F6D9AB?style=flat-square)![type](https://img.shields.io/badge/type-F4AEB1?style=flat-square)![_](https://img.shields.io/badge/__-A4DCF3?style=flat-square)![content](https://img.shields.io/badge/content-CCBFEE?style=flat-square)![=](https://img.shields.io/badge/%3D-BEEDC6?style=flat-square)![text](https://img.shields.io/badge/text-F6D9AB?style=flat-square)![html](https://img.shields.io/badge/html-F4AEB1?style=flat-square)![charset](https://img.shields.io/badge/charset-A4DCF3?style=flat-square)![utf](https://img.shields.io/badge/utf-CCBFEE?style=flat-square)![8](https://img.shields.io/badge/8-BEEDC6?style=flat-square)![>](https://img.shields.io/badge/%3E-F6D9AB?style=flat-square)![<](https://img.shields.io/badge/%3C-F4AEB1?style=flat-square)![meta](https://img.shields.io/badge/meta-A4DCF3?style=flat-square)![_](https://img.shields.io/badge/__-CCBFEE?style=flat-square)![name](https://img.shields.io/badge/name-BEEDC6?style=flat-square)![=](https://img.shields.io/badge/%3D-F6D9AB?style=flat-square)![viewport](https://img.shields.io/badge/viewport-F4AEB1?style=flat-square)![_](https://img.shields.io/badge/__-A4DCF3?style=flat-square)![content](https://img.shield
s.io/badge/content-CCBFEE?style=flat-square)![=](https://img.shields.io/badge/%3D-BEEDC6?style=flat-square)![width](https://img.shields.io/badge/width-F6D9AB?style=flat-square)![device](https://img.shields.io/badge/device-F4AEB1?style=flat-square)![width](https://img.shields.io/badge/width-A4DCF3?style=flat-square)![>](https://img.shields.io/badge/%3E-CCBFEE?style=flat-square)![<](https://img.shields.io/badge/%3C-BEEDC6?style=flat-square)![title](https://img.shields.io/badge/title-F6D9AB?style=flat-square)![>](https://img.shields.io/badge/%3E-F4AEB1?style=flat-square)![hello](https://img.shields.io/badge/hello-A4DCF3?style=flat-square)![world](https://img.shields.io/badge/world-CCBFEE?style=flat-square)![](https://img.shields.io/badge/%3C/-BEEDC6?style=flat-square)![title](https://img.shields.io/badge/title-F6D9AB?style=flat-square)![>](https://img.shields.io/badge/%3E-F4AEB1?style=flat-square)![<](https://img.shields.io/badge/%3C-A4DCF3?style=flat-square)![script](https://img.shields.io/badge/script-CCBFEE?style=flat-square)![>](https://img.shields.io/badge/%3E-BEEDC6?style=flat-square)![document](https://img.shields.io/badge/document-F6D9AB?style=flat-square)![getElementById](https://img.shields.io/badge/getElementById-F4AEB1?style=flat-square)![demo](https://img.shields.io/badge/demo-A4DCF3?style=flat-square)![innerHTML](https://img.shields.io/badge/innerHTML-CCBFEE?style=flat-square)![Hello](https://img.shields.io/badge/Hello-BEEDC6?style=flat-square)![JavaScript](https://img.shields.io/badge/JavaScript-F6D9AB?style=flat-square)![](https://img.shields.io/badge/%3C/-F4AEB1?style=flat-square)![script](https://img.shields.io/badge/script-A4DCF3?style=flat-square)![>](https://img.shields.io/badge/%3E-CCBFEE?style=flat-square)![...](https://img.shields.io/badge/...-FFFFFF?style=flat-square)
+
+
+## Why?
+
+Natural language tokeniz(er,ation scheme)s are designed so
+as to
+a) group particles of meaning together
+b) (omit/discard/hide) unimportant details
+such that models consuming sequences of token IDs
+are presented with what they need in a way they can most
+easily (process/derive meaning from)
+[in theory, models could consume streams of utf-8, but
+the model will have to learn everything the tokenizer does
+so consuming resources (layers/neurons/parameters)
+and (potentially vastly) extending training time.]
+
+for example, tokenizers aimed at languages that delimit with
+whitespace generally have features to (omit/discard/embed/hide)
+whitespace in their output so the model/consumer does not need
+to care about it.
+
+this shiz aims to do a similar thing but for HTML:
+whitespace is discarded,
+tag names, attribute names and attribute values are tokenized
+along with the textual content of the document,
+
+and special tokens are inserted to give context, so e.g.
+start and end tags are wrapped in `<`, `</` and `>`,
+attribute names are preceded by `_`
+and attribute values are preceded by `=`.
+
+## Limitations
+
+tokenizers are usually able to operate in either direction:
+both *encoding* natural language into sequences of token IDs
+for the model's input,
+and *decoding* sequences of token IDs generated by the model
+into natural language text.
+
+generation isn't a goal for me, for now at least: I'm interested
+in extracting meaning,
+
+
+, so this
+tokenizer will discard some of its input in order to better distil
+the meaning of what it's looking at.
+
## Installation
### With PIP
diff --git a/tokenize.py b/tokenize.py
new file mode 100644
index 0000000..5a64fb4
--- /dev/null
+++ b/tokenize.py
@@ -0,0 +1,44 @@
+from urllib.parse import quote
+
+colors = [0xccbfee, 0xbeedc6, 0xf6d9ab, 0xf4aeb1, 0xa4dcf3]
+tokens = [
+ "<", "html", ">",
+ "<", "head", ">",
+
+ "<", "meta",
+ "_", "http", "equiv",
+ "=", "content", "type",
+ "_", "content",
+ "=", "text", "html", "charset", "utf", "8", ">",
+
+ "<", "meta",
+ "_", "name",
+ "=", "viewport",
+ "_", "content",
+ "=", "width", "device", "width", ">",
+
+ "<", "title", ">",
+ "hello", "world",
+ "", "title", ">",
+
+ "<", "script", ">",
+ "document", "getElementById", "demo", "innerHTML", "Hello",
+ "JavaScript",
+ "", "script", ">",
+
+#
+
+ "...",
+]
+
+URL = "https://img.shields.io/badge/" #just%20the%20message-8A2BE2
+EXTRA = "?style=flat-square"
+SEP = "" # " " #  
+
+for i, token in enumerate(tokens):
+ color = 0xffffff if token == "..." else colors[i % len(colors)]
+ quoted_token = "__" if token == "_" else quote(token)
+ print(f"![{token}]({URL}{quoted_token}-{color:06X}{EXTRA})", end=SEP)
+print()