From 80e1b4cf9b43d13827e9a1fab7b8653cf439e808 Mon Sep 17 00:00:00 2001 From: wylilong Date: Thu, 26 Sep 2024 20:05:18 +0800 Subject: [PATCH 1/2] update to support agent chat sft. --- finetune_demo/finetune.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/finetune_demo/finetune.py b/finetune_demo/finetune.py index d5f0666..a92c556 100644 --- a/finetune_demo/finetune.py +++ b/finetune_demo/finetune.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import os +import json import jieba import dataclasses as dc import functools @@ -243,6 +244,18 @@ def process_message(message): v is not None} elif 'tools' in message: del message['tools'] + + # convert training data of agent chat. + if message['role'] == 'assistant': + content = message['content'] + if isinstance(content, str) and content.startswith("{") and content.endswith("}"): + try: + content_ = eval(content) + if isinstance(content_, dict) and "name" in content_ and "arguments" in content_: + message['content'] = json.dumps(content_["arguments"], ensure_ascii=False) + message['metadata'] = content_["name"] + except: + pass return message @@ -270,7 +283,7 @@ def process_batch( for message in conv: message = process_message(message) loss_mask_val = False if message['role'] in ('system', 'user', 'observation') else True - new_input_ids = tokenizer.apply_chat_template([message], tokenize=True, return_dict=False)[2:] + new_input_ids = tokenizer.apply_chat_template([message], tokenize=True, return_dict=False)[0][2:] input_ids += new_input_ids loss_masks += [loss_mask_val] * len(new_input_ids) @@ -324,7 +337,7 @@ def process_batch_eval( break else: message = process_message(message) - new_input_ids = tokenizer.apply_chat_template([message], tokenize=True, return_dict=False)[2:] + new_input_ids = tokenizer.apply_chat_template([message], tokenize=True, return_dict=False)[0][2:] if message['role'] == 'assistant': output_prompt, output_ids = ( new_input_ids[:1], From 
c6f06290074d10c8a40115f6cb40609908259efe Mon Sep 17 00:00:00 2001 From: wylilong Date: Fri, 27 Sep 2024 18:00:16 +0800 Subject: [PATCH 2/2] update to support agent chat sft. --- finetune_demo/finetune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finetune_demo/finetune.py b/finetune_demo/finetune.py index a92c556..66d4819 100644 --- a/finetune_demo/finetune.py +++ b/finetune_demo/finetune.py @@ -283,7 +283,7 @@ def process_batch( for message in conv: message = process_message(message) loss_mask_val = False if message['role'] in ('system', 'user', 'observation') else True - new_input_ids = tokenizer.apply_chat_template([message], tokenize=True, return_dict=False)[0][2:] + new_input_ids = tokenizer.apply_chat_template([message], tokenize=True, return_dict=False)[2:] input_ids += new_input_ids loss_masks += [loss_mask_val] * len(new_input_ids) @@ -337,7 +337,7 @@ def process_batch_eval( break else: message = process_message(message) - new_input_ids = tokenizer.apply_chat_template([message], tokenize=True, return_dict=False)[0][2:] + new_input_ids = tokenizer.apply_chat_template([message], tokenize=True, return_dict=False)[2:] if message['role'] == 'assistant': output_prompt, output_ids = ( new_input_ids[:1],