Skip to content

Commit

Permalink
feat: add chat configuration to the tokenizer. (#76)
Browse files Browse the repository at this point in the history
  • Loading branch information
b4rtaz authored May 31, 2024
1 parent dc997b4 commit 6eccd30
Show file tree
Hide file tree
Showing 11 changed files with 712 additions and 191 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,16 @@ jobs:
make dllama-api
make funcs-test
make quants-test
make tokenizer-test
make transformer-test
make llama2-tasks-test
make grok1-tasks-test
- name: funcs-test
run: ./funcs-test
- name: quants-test
run: ./quants-test
- name: tokenizer-test
run: ./tokenizer-test
- name: transformer-test
run: ./transformer-test
- name: llama2-tasks-test
Expand All @@ -60,13 +63,16 @@ jobs:
make dllama-api
make funcs-test
make quants-test
make tokenizer-test
make transformer-test
make llama2-tasks-test
make grok1-tasks-test
- name: funcs-test
run: ./funcs-test
- name: quants-test
run: ./quants-test
- name: tokenizer-test
run: ./tokenizer-test
- name: transformer-test
run: ./transformer-test
- name: llama2-tasks-test
Expand Down
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ funcs-test: src/funcs-test.cpp funcs utils quants
$(CXX) $(CXXFLAGS) src/funcs-test.cpp -o funcs-test funcs.o utils.o quants.o $(LIBS)
# Prerequisite must be the test's own source file so that editing
# src/quants-test.cpp retriggers this rule (it previously listed src/quants.cpp).
quants-test: src/quants-test.cpp utils quants
	$(CXX) $(CXXFLAGS) src/quants-test.cpp -o quants-test utils.o quants.o $(LIBS)
tokenizer-test: src/tokenizer-test.cpp tokenizer funcs utils quants
$(CXX) $(CXXFLAGS) src/tokenizer-test.cpp -o tokenizer-test tokenizer.o funcs.o utils.o quants.o $(LIBS)
transformer-test: src/transformer-test.cpp funcs utils quants transformer socket
$(CXX) $(CXXFLAGS) src/transformer-test.cpp -o transformer-test funcs.o utils.o quants.o transformer.o socket.o $(LIBS)
llama2-tasks-test: src/llama2-tasks-test.cpp utils quants funcs socket transformer tasks llama2-tasks tokenizer
Expand Down
71 changes: 71 additions & 0 deletions converter/convert-tokenizer-hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import sys
import json
import os
writer = __import__('tokenizer-writer')

def openJson(path):
    """Load and parse the JSON document stored at *path* (UTF-8)."""
    with open(path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def printUsage():
    """Print command-line usage help for this converter to stdout."""
    for line in (
        'Usage: python convert-tokenizer-hf.py <tokenizerFolderPath> <name>',
        '',
        'Options:',
        ' <tokenizerFolderPath> The path to the folder with tokenizer.json and tokenizer_config.json',
        ' <name> The name of the tokenizer (e.g. "llama3")',
    ):
        print(line)

if __name__ == '__main__':
    # Require BOTH positional arguments; the original `< 2` check let a
    # missing <name> slip through and crash on the sys.argv[2] access below.
    if (len(sys.argv) < 3):
        printUsage()
        exit(1)

    dirPath = sys.argv[1]  # folder containing tokenizer.json + tokenizer_config.json
    name = sys.argv[2]     # output name suffix, e.g. "llama3"
    tokenizerConfig = openJson(os.path.join(dirPath, 'tokenizer_config.json'))
    tokenizer = openJson(os.path.join(dirPath, 'tokenizer.json'))

    # Only HF "fast" BPE tokenizers are supported by this converter.
    assert(tokenizerConfig['tokenizer_class'] == 'PreTrainedTokenizerFast')
    assert(tokenizer['model']['type'] == 'BPE')
    i = 0
    tokens = []
    scores = []
    bosId = None
    eosId = None
    # Base vocabulary: token ids must be dense and in order; the score is
    # -id so lower ids (added earlier) rank higher.
    for token in tokenizer['model']['vocab'].keys():
        assert(tokenizer['model']['vocab'][token] == i)
        tokens.append(token.encode('utf8'))
        scores.append(-float(i))
        i += 1
    if ('added_tokens' in tokenizer):
        # NOTE(review): bos/eos are only looked up among added_tokens; if a
        # model keeps them in the base vocab, bosId/eosId stay None and
        # writeTokenizer will fail its assertions — confirm against the model.
        for at in tokenizer['added_tokens']:
            assert(at['id'] == i)
            tokens.append(at['content'].encode('utf8'))
            scores.append(-float(i))
            if (at['content'] == tokenizerConfig['bos_token']):
                bosId = i
            if (at['content'] == tokenizerConfig['eos_token']):
                eosId = i
            i += 1

    # The chat template is a Jinja expression we cannot evaluate here, so the
    # user transcribes the fixed delimiter strings interactively.
    templateChat = None
    if ('chat_template' in tokenizerConfig):
        template = tokenizerConfig['chat_template']
        print('⭐ Found chat template:')
        print()
        print(template.replace('\n', '\\n'))
        print()
        print('⭐ To create the tokenizer file you need to manually specify chat template values. Enter \\n for new line.')
        templateChat = {}
        templateKeys = ['chat_message_start', 'chat_role_start', 'chat_role_end', 'chat_message_end', 'chat_generation_prompt', 'chat_extra_stop']
        for key in templateKeys:
            value = input(f'⏩ Enter value for chat template key "{key}":\n')
            templateChat[key] = value.replace('\\n', '\n')

    outputFileName = f'dllama_tokenizer_{name}.t'
    with open(outputFileName, 'wb') as outputFile:
        writer.writeTokenizer(outputFile, {
            'bos_id': bosId,
            'eos_id': eosId,
            'chat_eos_id': eosId,
        }, templateChat, tokens, scores)
    print(f'✅ Created {outputFileName}')
45 changes: 25 additions & 20 deletions converter/convert-tokenizer-llama3.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
import struct
import base64
writer = __import__('tokenizer-writer')

# Format of input file:
# ```
Expand Down Expand Up @@ -28,16 +29,32 @@
]
bosId = 128000
eosId = 128001
chatEosId = 128009
chatTemplate = {
'chat_message_start': '',
'chat_role_start': '<|start_header_id|>',
'chat_role_end': '<|end_header_id|>\n\n',
'chat_message_end': '<|eot_id|>',
'chat_generation_prompt': '<|start_header_id|>assistant<|end_header_id|>\n\n',
'chat_extra_stop': ''
}

def printUsage():
print('Usage: python convert-tokenizer-llama3.py <tokenizerPath>')
print()
print('Options:')
print(' <tokenizerPath> The path to the Llama 3 tokenizer model (tokenizer.model)')

if __name__ == '__main__':
if (len(sys.argv) < 2):
print('Invalid usage')
printUsage()
exit(1)

modelPath = sys.argv[1]
outputFileName = 'dllama_tokenizer_llama3.t'

with open(modelPath, 'r') as inputFile:
with open('dllama_tokenizer_llama3.t', 'wb') as outputFile:
with open(outputFileName, 'wb') as outputFile:
inputLines = inputFile.readlines()
nLines = len(inputLines)

Expand All @@ -58,22 +75,10 @@
scores.append(score)
specialTokenIndex += 1

vocabSize = len(tokens)
maxTokenLength = max(len(t) for t in tokens)

outputFile.write(struct.pack('IIIiii',
0x567123,
vocabSize,
maxTokenLength,
bosId,
eosId,
-1))

for i in range(0, vocabSize):
outputFile.write(struct.pack('fI', scores[i], len(tokens[i])))
outputFile.write(tokens[i])
writer.writeTokenizer(outputFile, {
'bos_id': bosId,
'eos_id': eosId,
'chat_eos_id': chatEosId,
}, chatTemplate, tokens, scores)

print(f'maxTokenLength={maxTokenLength}')
print(f'bosId={bosId}')
print(f'eosId={eosId}')
print(f'vocabSize={vocabSize}')
print(f'✅ Created {outputFileName}')
57 changes: 57 additions & 0 deletions converter/tokenizer-writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import struct

def writeTokenizer(file, params, chatTemplate, tokens, scores):
    """Serialize a tokenizer to the distributed-llama binary format.

    Layout: int32 magic (0x567124), int32 total header size, then one
    (fieldId, value) int32 pair per recognized header key; if a chat
    template is given, its value byte-lengths followed by the non-empty
    UTF-8 values; finally one (float score, uint32 length, bytes) record
    per vocabulary token.
    """
    assert(params['eos_id'] is not None)
    assert(params['bos_id'] is not None)

    # Numeric field ids understood by the reader of this format.
    headerKeys = {
        'version': 0,
        'vocab_size': 1,
        'max_token_length': 2,
        'bos_id': 3,
        'eos_id': 4,
        'pad_id': 5,
        'chat_eos_id': 6,
        'chat_template': 7
    }

    nTokens = len(tokens)

    # Derived header fields are injected into `params` so they get emitted
    # through the same key/value loop below.
    params['version'] = 0
    params['vocab_size'] = nTokens
    params['max_token_length'] = max(len(t) for t in tokens)
    if (chatTemplate):
        params['chat_template'] = len(chatTemplate)

    payload = bytearray()
    for key, value in params.items():
        fieldId = headerKeys.get(key)
        if fieldId is None:
            print(f'Unknown header key: {key}')
        else:
            payload += struct.pack('ii', fieldId, value)

    # The size field counts the 8 leading bytes (magic + size) plus all pairs.
    file.write(struct.pack('i', 0x567124))
    file.write(struct.pack('i', 8 + len(payload)))
    file.write(bytes(payload))

    print(params)

    if (chatTemplate):
        print(chatTemplate)
        encodedValues = [v.encode('utf8') for v in chatTemplate.values()]
        # All lengths first, then the non-empty values themselves.
        for encoded in encodedValues:
            file.write(struct.pack('I', len(encoded)))
        for encoded in encodedValues:
            if (len(encoded) > 0):
                file.write(encoded)

    for index, token in enumerate(tokens):
        tokenSize = len(token)
        assert(tokenSize > 0)  # an empty token would desync the reader
        file.write(struct.pack('fI', scores[index], tokenSize))
        file.write(token)
2 changes: 1 addition & 1 deletion src/app.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,12 @@ void App::run(AppArgs* args, void (*program)(Inference* inference, SocketPool* s

TransformerSpec spec = Transformer::loadSpecFromFile(args->modelPath, nSlices, args->weightsFloatType, args->bufferFloatType);
TransformerArch arch = TransformerArchFactory::create(&spec);
Tokenizer tokenizer(args->tokenizerPath, spec.vocabSize);

if (args->steps == 0 || args->steps > spec.seqLen) {
args->steps = spec.seqLen;
}

Tokenizer tokenizer(args->tokenizerPath, spec.vocabSize);
Transformer transformer = Transformer::loadRootFromFile(args->modelPath, &spec, socketPool);
socketPool->setTurbo(true);

Expand Down
Loading

0 comments on commit 6eccd30

Please sign in to comment.