Commit: feat: add chat configuration to the tokenizer (#76)
Showing 11 changed files with 712 additions and 191 deletions.
convert-tokenizer-hf.py
@@ -0,0 +1,71 @@
import sys
import json
import os
# The writer lives in a hyphenated file name (tokenizer-writer.py), so a plain `import` statement cannot load it.
writer = __import__('tokenizer-writer')


def openJson(path):
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


def printUsage():
    print('Usage: python convert-tokenizer-hf.py <tokenizerFolderPath> <name>')
    print()
    print('Options:')
    print('  <tokenizerFolderPath> The path to the folder with tokenizer.json and tokenizer_config.json')
    print('  <name>                The name of the tokenizer (e.g. "llama3")')


if __name__ == '__main__':
    if (len(sys.argv) < 3):  # both <tokenizerFolderPath> and <name> are required
        printUsage()
        exit(1)

    dirPath = sys.argv[1]
    name = sys.argv[2]
    tokenizerConfig = openJson(os.path.join(dirPath, 'tokenizer_config.json'))
    tokenizer = openJson(os.path.join(dirPath, 'tokenizer.json'))

    assert(tokenizerConfig['tokenizer_class'] == 'PreTrainedTokenizerFast')
    assert(tokenizer['model']['type'] == 'BPE')
    i = 0
    tokens = []
    scores = []
    bosId = None
    eosId = None
    # Base vocabulary: ids must be dense and in order; each token's score is its negated id.
    for token in tokenizer['model']['vocab'].keys():
        assert(tokenizer['model']['vocab'][token] == i)
        tokens.append(token.encode('utf8'))
        scores.append(-float(i))
        i += 1
    # Added tokens continue the id sequence; the BOS and EOS ids are resolved here.
    if ('added_tokens' in tokenizer):
        for at in tokenizer['added_tokens']:
            assert(at['id'] == i)
            tokens.append(at['content'].encode('utf8'))
            scores.append(-float(i))
            if (at['content'] == tokenizerConfig['bos_token']):
                bosId = i
            if (at['content'] == tokenizerConfig['eos_token']):
                eosId = i
            i += 1

    templateChat = None
    if ('chat_template' in tokenizerConfig):
        template = tokenizerConfig['chat_template']
        print('⭐ Found chat template:')
        print()
        print(template.replace('\n', '\\n'))
        print()
        print('⭐ To create the tokenizer file you need to manually specify chat template values. Enter \\n for a new line.')
        templateChat = {}
        templateKeys = ['chat_message_start', 'chat_role_start', 'chat_role_end', 'chat_message_end', 'chat_generation_prompt', 'chat_extra_stop']
        for key in templateKeys:
            value = input(f'⏩ Enter value for chat template key "{key}":\n')
            templateChat[key] = value.replace('\\n', '\n')

    outputFileName = f'dllama_tokenizer_{name}.t'
    with open(outputFileName, 'wb') as outputFile:
        writer.writeTokenizer(outputFile, {
            'bos_id': bosId,
            'eos_id': eosId,
            'chat_eos_id': eosId,
        }, templateChat, tokens, scores)
    print(f'✅ Created {outputFileName}')
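For orientation, a hedged usage sketch: the model folder path and the interactive answers below are illustrative assumptions for a Llama 3 style chat template, not values taken from this commit. Verify them against the chat_template the script prints.

    python convert-tokenizer-hf.py ./Meta-Llama-3-8B-Instruct llama3

One plausible set of answers for the six prompted keys:

    chat_message_start      → (empty)
    chat_role_start         → <|start_header_id|>
    chat_role_end           → <|end_header_id|>\n\n
    chat_message_end        → <|eot_id|>
    chat_generation_prompt  → <|start_header_id|>assistant<|end_header_id|>\n\n
    chat_extra_stop         → (empty)

This run would produce dllama_tokenizer_llama3.t in the current directory.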
tokenizer-writer.py
@@ -0,0 +1,57 @@
import struct


def writeTokenizer(file, params, chatTemplate, tokens, scores):
    assert(params['eos_id'] is not None)
    assert(params['bos_id'] is not None)

    # Every known header field is serialized as a pair of int32s: (key id, value).
    headerKeys = {
        'version': 0,
        'vocab_size': 1,
        'max_token_length': 2,
        'bos_id': 3,
        'eos_id': 4,
        'pad_id': 5,
        'chat_eos_id': 6,
        'chat_template': 7
    }
    header = struct.pack('i', 0x567124)  # magic number

    nTokens = len(tokens)
    maxTokenLength = max(len(t) for t in tokens)

    params['version'] = 0
    params['vocab_size'] = nTokens
    params['max_token_length'] = maxTokenLength
    if (chatTemplate):
        params['chat_template'] = len(chatTemplate)  # number of chat template values

    data = b''
    for key in params:
        if key in headerKeys:
            data += struct.pack('ii', headerKeys[key], params[key])
        else:
            print(f'Unknown header key: {key}')

    # Header size = magic (4 bytes) + this size field (4 bytes) + the key-value pairs.
    header += struct.pack('i', len(header) * 2 + len(data))
    file.write(header)
    file.write(data)

    print(params)
    if (chatTemplate):
        print(chatTemplate)

    # Chat template section: all lengths first (uint32 each), then the UTF-8 payloads.
    if (chatTemplate):
        chatTemplateValues = list(chatTemplate.values())
        nChatTemplates = len(chatTemplateValues)
        for i in range(0, nChatTemplates):
            file.write(struct.pack('I', len(chatTemplateValues[i].encode('utf8'))))
        for i in range(0, nChatTemplates):
            data = chatTemplateValues[i].encode('utf8')
            if (len(data) > 0):
                file.write(data)

    # Vocabulary records: a float32 score, a uint32 byte length, then the raw token bytes.
    for i in range(0, nTokens):
        size = len(tokens[i])
        assert(size > 0)
        file.write(struct.pack('fI', scores[i], size))
        file.write(tokens[i])
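To make the binary layout concrete, here is a minimal reader sketch for the format writeTokenizer produces. It is not part of this commit; the field order simply mirrors the writer above, and readTokenizer is a hypothetical helper name.

    import struct

    def readTokenizer(path):
        # Mirrors writeTokenizer: magic, header size, (key id, value) pairs,
        # optional chat template strings, then the vocabulary records.
        with open(path, 'rb') as f:
            magic, headerSize = struct.unpack('ii', f.read(8))
            assert magic == 0x567124
            fields = {}
            for _ in range((headerSize - 8) // 8):  # each header pair is two int32s
                key, value = struct.unpack('ii', f.read(8))
                fields[key] = value
            # Chat template: all lengths first, then the UTF-8 payloads (key id 7 = count).
            nTemplates = fields.get(7, 0)
            lengths = [struct.unpack('I', f.read(4))[0] for _ in range(nTemplates)]
            chatTemplate = [f.read(size).decode('utf8') if size > 0 else '' for size in lengths]
            # Vocabulary: float32 score, uint32 byte length, raw token bytes (key id 1 = vocab_size).
            scores, tokens = [], []
            for _ in range(fields[1]):
                score, size = struct.unpack('fI', f.read(8))
                scores.append(score)
                tokens.append(f.read(size))
        return fields, chatTemplate, tokens, scores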