diff --git a/tokenizer_service/example-curl.sh b/tokenizer_service/example-curl.sh
new file mode 100755
index 00000000..7bedb21e
--- /dev/null
+++ b/tokenizer_service/example-curl.sh
@@ -0,0 +1,3 @@
+curl -X POST http://127.0.0.1:5000/granite-7b-lab \
+  -H "Content-Type: application/json" \
+  -d '{"text": "This is an example sentence to check tokenization."}'
diff --git a/tokenizer_service/requirements.txt b/tokenizer_service/requirements.txt
new file mode 100644
index 00000000..993e0e04
--- /dev/null
+++ b/tokenizer_service/requirements.txt
@@ -0,0 +1,2 @@
+flask
+transformers
diff --git a/tokenizer_service/server.py b/tokenizer_service/server.py
new file mode 100644
index 00000000..00000000
--- /dev/null
+++ b/tokenizer_service/server.py
@@ -0,0 +1,46 @@
+from flask import Flask, request, jsonify
+from transformers import AutoTokenizer
+
+app = Flask(__name__)
+
+# Load both tokenizers once at startup (the first run downloads them from the
+# Hugging Face hub); loading them per request would be prohibitively slow.
+granite_tokenizer = AutoTokenizer.from_pretrained("instructlab/granite-7b-lab")
+merlinite_tokenizer = AutoTokenizer.from_pretrained("instructlab/merlinite-7b-lab")
+
+
+def count_tokens(tokenizer, text):
+    """Return the number of tokens *tokenizer* produces for *text*."""
+    return len(tokenizer.tokenize(text))
+
+
+def _token_count_response(tokenizer, model_name):
+    """Shared handler: validate the JSON body and return a token count.
+
+    Returns a (json, status) Flask response pair; replies 400 when the body
+    is missing, is not valid JSON, or lacks a 'text' field.
+    """
+    # silent=True makes get_json() return None instead of raising on a
+    # missing or malformed JSON body, so we can reply with a clean 400
+    # rather than an unhandled 500 ('text' in None would raise TypeError).
+    data = request.get_json(silent=True)
+    if not data or 'text' not in data:
+        return jsonify({"error": "Text is required"}), 400
+    num_tokens = count_tokens(tokenizer, data['text'])
+    return jsonify({"model": model_name, "tokens": num_tokens})
+
+
+# Endpoint for the 'granite-7b-lab' model
+@app.route('/granite-7b-lab', methods=['POST'])
+def granite_token_count():
+    return _token_count_response(granite_tokenizer, "instructlab/granite-7b-lab")
+
+
+# Endpoint for the 'merlinite' model
+@app.route('/merlinite-7b-lab', methods=['POST'])
+def merlinite_token_count():
+    return _token_count_response(merlinite_tokenizer, "instructlab/merlinite-7b-lab")
+
+
+if __name__ == '__main__':
+    # debug=True is for local development only; never expose it in production.
+    app.run(debug=True)