From 2fe7e9ea93757886bd9fa58a5c5d9765203229b5 Mon Sep 17 00:00:00 2001 From: Joshua David Date: Sat, 6 Jul 2024 22:47:43 -0700 Subject: [PATCH] Begin implementing a training edge case to pick up where the training left off if interrupted. --- train.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index 6f8ed26..31a0992 100644 --- a/train.py +++ b/train.py @@ -267,7 +267,15 @@ def train( ) # Save checkpoint - accelerator.save_state(f"checkpoint_epoch_{epoch}.pt") + accelerator.save_state( + {"epoch": epoch, "best_val_loss": best_val_loss}, + f"checkpoint_epoch_{epoch}.pt", + ) + + # Save latest checkpoint + accelerator.save_state( + {"epoch": epoch, "best_val_loss": best_val_loss}, "checkpoint_latest.pt" + ) # Early stopping if avg_val_loss < best_val_loss: