61
61
MT_BENCH_SCORES_PATH = path .join (DATA_PVC_MOUNT_PATH , "mt-bench-best.txt" )
62
62
MT_BENCH_BRANCH_SCORES_PATH = path .join (DATA_PVC_MOUNT_PATH , "mt-bench-branch-best.txt" )
63
63
MMLU_BRANCH_SCORES_PATH = path .join (DATA_PVC_MOUNT_PATH , "mmlu-branch-best.txt" )
64
- CANDIDATE_MODEL_PATH = path .join (DATA_PVC_OUTPUT_PATH , "hf_format/candidate_model" )
64
+ CANDIDATE_MODEL_PATH = path .join (
65
+ DATA_PVC_MOUNT_PATH , "model/output/phase_2/hf_format/candidate_model"
66
+ )
65
67
SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
66
68
KFP_MODEL_SERVER_CM = """
67
69
# TODO: remove the following line and replace it with the actual ConfigMap/Secret
265
267
fi
266
268
267
269
if [ "$STRATEGY" == "upload" ]; then
268
- export FINAL_DATA_TAR_FILE="final .$SDG_OBJECT_STORE_DATA_KEY"
270
+ export FINAL_DATA_TAR_FILE="$(date +"%Y-%m-%d_%H-%M-%S") .$SDG_OBJECT_STORE_DATA_KEY"
269
271
export FINAL_DATA_TAR_PATH="{data_pvc_mount_path}/$FINAL_DATA_TAR_FILE"
270
272
echo "Final data tarball path: $FINAL_DATA_TAR_PATH"
271
273
echo "Final data tarball file: $FINAL_DATA_TAR_FILE"
272
274
echo "Archiving data before pushing to the object store"
275
+ # Use '--ignore-failed-read' to ignore missing files. This is needed because MMLU_branch is skipped when no MMLU task directories are found.
276
+ # So '{mmlu_branch_scores_path}' will not exist
273
277
tar --create \
274
278
--gzip \
275
279
--verbose \
280
+ --ignore-failed-read \
276
281
--file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {mmlu_branch_scores_path} {candidate_model_path}
277
- # TODO: change model path for the final model!!!
278
282
fi
279
283
280
284
tmp=$(mktemp -d)
@@ -844,11 +848,8 @@ def run(
844
848
845
849
# Final evaluation
846
850
ctx .obj ["eval_type" ] = "final-eval"
847
- scores = ctx .invoke (evaluation )
848
- scores = json .loads (scores )
849
- logger .info ("Best model: %s" , scores .get ("best_model" ))
850
- ctx .obj ["candidate_model" ] = scores .get ("best_model" )
851
- logger .info ("instructLab Training Finished!" )
851
+ ctx .invoke (evaluation )
852
+ logger .info ("InstructLab Training Finished!" )
852
853
853
854
# Push the best model to S3
854
855
ctx .invoke (upload_trained_model )
@@ -2120,7 +2121,7 @@ def find_node_dataset_directories(base_dir: str):
2120
2121
json.dump(mt_bench_branch_data, f, indent=4)
2121
2122
"""
2122
2123
exec_run_final_eval_op_args = """
2123
- run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
2124
+ run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/ hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
2124
2125
"""
2125
2126
2126
2127
if eval_type == "mt-bench" :
@@ -2324,6 +2325,7 @@ def run_job(namespace: str, job: kubernetes.client.V1Job) -> str:
2324
2325
name = pods .items [0 ].metadata .name , namespace = namespace
2325
2326
)
2326
2327
w .stop ()
2328
+ break
2327
2329
elif job_event .status .failed == 1 :
2328
2330
logger .error ("Job failed. Pod logs:" )
2329
2331
pods = core_v1 .list_namespaced_pod (
@@ -2739,23 +2741,29 @@ def train(
2739
2741
namespace = namespace ,
2740
2742
plural = "pytorchjobs" ,
2741
2743
):
2742
- job_event = event ["object" ]
2744
+ pytorchjob_event = event ["object" ]
2743
2745
if (
2744
- job_event ["metadata" ]["name" ]
2746
+ pytorchjob_event ["metadata" ]["name" ]
2745
2747
!= pytorch_training_job_yaml ["metadata" ]["name" ]
2746
2748
):
2747
2749
continue
2748
- job_name = job_event ["metadata" ]["name" ]
2750
+ pytorchjob_name = pytorchjob_event ["metadata" ]["name" ]
2749
2751
2750
- if "status" not in job_event or "conditions" not in job_event ["status" ]:
2752
+ if (
2753
+ "status" not in pytorchjob_event
2754
+ or "conditions" not in pytorchjob_event ["status" ]
2755
+ ):
2751
2756
continue
2752
2757
logger .info (
2753
- "Job : %s - %s" ,
2754
- job_name ,
2755
- job_event ["status" ].get ("conditions" , "No conditions yet" ),
2758
+ "PytorchJob : %s - %s" ,
2759
+ pytorchjob_name ,
2760
+ pytorchjob_event ["status" ].get ("conditions" , "No conditions yet" ),
2756
2761
)
2757
2762
2758
- for job_condition in job_event ["status" ]["conditions" ]:
2763
+ # Always start with the last condition so that if the job is completed, we can stop watching
2764
+ # If we don't do this, we might get 'stuck' in the Running condition and never stop watching
2765
+ for job_condition in reversed (pytorchjob_event ["status" ]["conditions" ]):
2766
+ print (job_condition )
2759
2767
if job_condition ["type" ] == "Running" :
2760
2768
# now watch for pod event
2761
2769
for event in w .stream (
@@ -2764,7 +2772,7 @@ def train(
2764
2772
label_selector = f"training.kubeflow.org/job-name=train-phase-{ training_phase } " ,
2765
2773
):
2766
2774
pod_event = event ["object" ]
2767
- if pod_event .metadata .name .startswith (job_name ):
2775
+ if pod_event .metadata .name .startswith (pytorchjob_name ):
2768
2776
logger .info (
2769
2777
"Pod: %s - %s" ,
2770
2778
pod_event .metadata .name ,
@@ -2786,15 +2794,25 @@ def train(
2786
2794
if pod_event .status .phase == "Failed" :
2787
2795
log_pod_containers (pod_event , "init_containers" , namespace )
2788
2796
log_pod_containers (pod_event , "containers" , namespace )
2789
- if job_condition ["type" ] == "Succeeded" :
2797
+ w .stop ()
2798
+ if pod_event .status .phase == "Succeeded" :
2799
+ continue
2800
+ elif job_condition ["type" ] == "Succeeded" :
2790
2801
logger .info (
2791
- "Job '%s' completed successfully: %s" ,
2792
- job_name ,
2802
+ "PytorchJob '%s' completed successfully: %s" ,
2803
+ pytorchjob_name ,
2793
2804
job_condition ["reason" ],
2794
2805
)
2806
+ logger .info ("Training phase %s completed." , training_phase )
2795
2807
w .stop ()
2808
+ # Break here to avoid going into other conditions, we are done
2809
+ break
2796
2810
elif job_condition ["type" ] == "Failed" :
2797
- logger .error ("Job' %s' failed: %s" , job_name , job_condition ["reason" ])
2811
+ logger .error (
2812
+ "PytorchJob' %s' failed: %s" ,
2813
+ pytorchjob_name ,
2814
+ job_condition ["reason" ],
2815
+ )
2798
2816
w .stop ()
2799
2817
raise RuntimeError ("Job failed." )
2800
2818
@@ -2816,7 +2834,9 @@ def evaluation(ctx: click.Context) -> str:
2816
2834
eval_type = ctx .obj ["eval_type" ]
2817
2835
2818
2836
if eval_type is None :
2819
- raise ValueError ("Evaluation type must be provided with --eval-type=[mt-bench]" )
2837
+ raise ValueError (
2838
+ "Evaluation type must be provided with --eval-type=[mt-bench|final-eval]"
2839
+ )
2820
2840
2821
2841
logger .info ("Running %s evaluation." , eval_type )
2822
2842
@@ -2825,17 +2845,21 @@ def evaluation(ctx: click.Context) -> str:
2825
2845
namespace = namespace , job_name = f"eval-{ eval_type } " , eval_type = eval_type
2826
2846
)
2827
2847
scores = run_job (namespace , job )
2828
- scores = scores .replace ("'" , '"' )
2829
2848
2830
- try :
2831
- scores_data = json .loads (scores )
2832
- if isinstance (scores_data , dict ):
2833
- scores = json .dumps (scores_data )
2834
- else :
2835
- raise ValueError ("Unexpected format for scores data" )
2836
- except json .JSONDecodeError as e :
2837
- logger .error ("Failed to parse scores: %s" , e )
2838
- raise
2849
+ if eval_type == "mt-bench" :
2850
+ scores = scores .replace ("'" , '"' )
2851
+
2852
+ try :
2853
+ scores_data = json .loads (scores )
2854
+ if isinstance (scores_data , dict ):
2855
+ scores = json .dumps (scores_data )
2856
+ else :
2857
+ raise ValueError ("Unexpected format for scores data" )
2858
+ except json .JSONDecodeError as e :
2859
+ logger .error ("Failed to parse scores: %s" , e )
2860
+ raise
2861
+
2862
+ return scores
2839
2863
2840
2864
logger .info ("Evaluation scores: %s" , scores )
2841
2865
0 commit comments