diff --git a/train.py b/train.py
index fec0a239c0af..b9bbf483d1b1 100644
--- a/train.py
+++ b/train.py
@@ -36,6 +36,12 @@
 import torch.nn as nn
 import yaml
 from torch.optim import lr_scheduler
+
+try:
+    import torch.amp as amp
+except ImportError:
+    import torch.cuda.amp as amp
+
 from tqdm import tqdm
 
 FILE = Path(__file__).resolve()
@@ -221,7 +227,7 @@ def train(hyp, opt, device, callbacks):
         LOGGER.info(f"Transferred {len(csd)}/{len(model.state_dict())} items from {weights}")  # report
     else:
         model = Model(cfg, ch=3, nc=nc, anchors=hyp.get("anchors")).to(device)  # create
-    amp = check_amp(model)  # check AMP
+    use_amp = check_amp(model)  # check AMP
 
     # Freeze
     freeze = [f"model.{x}." for x in (freeze if len(freeze) > 1 else range(freeze[0]))]  # layers to freeze
@@ -238,7 +244,7 @@ def train(hyp, opt, device, callbacks):
 
     # Batch size
     if RANK == -1 and batch_size == -1:  # single-GPU only, estimate best batch size
-        batch_size = check_train_batch_size(model, imgsz, amp)
+        batch_size = check_train_batch_size(model, imgsz, use_amp)
         loggers.on_params_update({"batch_size": batch_size})
 
     # Optimizer
@@ -352,7 +358,8 @@ def lf(x):
     maps = np.zeros(nc)  # mAP per class
     results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
     scheduler.last_epoch = start_epoch - 1  # do not move
-    scaler = torch.cuda.amp.GradScaler(enabled=amp)
+    # scaler = torch.cuda.amp.GradScaler(enabled=amp)
+    scaler = amp.GradScaler(enabled=use_amp)
     stopper, stop = EarlyStopping(patience=opt.patience), False
     compute_loss = ComputeLoss(model)  # init loss class
     callbacks.run("on_train_start")
@@ -409,7 +416,8 @@ def lf(x):
                     imgs = nn.functional.interpolate(imgs, size=ns, mode="bilinear", align_corners=False)
 
             # Forward
-            with torch.cuda.amp.autocast(amp):
+            # with torch.cuda.amp.autocast(amp):
+            with amp.autocast(enabled=use_amp, device_type=device.type):
                 pred = model(imgs)  # forward
                 loss, loss_items = compute_loss(pred, targets.to(device))  # loss scaled by batch_size
                 if RANK != -1:
@@ -458,7 +466,7 @@ def lf(x):
                     data_dict,
                     batch_size=batch_size // WORLD_SIZE * 2,
                     imgsz=imgsz,
-                    half=amp,
+                    half=use_amp,
                     model=ema.ema,
                     single_cls=single_cls,
                     dataloader=val_loader,
diff --git a/utils/dataloaders.py b/utils/dataloaders.py
index fc5da6bff2a3..08240483339f 100644
--- a/utils/dataloaders.py
+++ b/utils/dataloaders.py
@@ -355,9 +355,9 @@ def __init__(self, path, img_size=640, stride=32, auto=True, transforms=None, vi
             self._new_video(videos[0])  # new video
         else:
             self.cap = None
-        assert self.nf > 0, (
-            f"No images or videos found in {p}. Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}"
-        )
+        assert (
+            self.nf > 0
+        ), f"No images or videos found in {p}. Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}"
 
     def __iter__(self):
         """Initializes iterator by resetting count and returns the iterator object itself."""
diff --git a/utils/general.py b/utils/general.py
index 41cd2032c821..db6ddacefd85 100644
--- a/utils/general.py
+++ b/utils/general.py
@@ -495,9 +495,9 @@ def check_file(file, suffix=""):
         assert Path(file).exists() and Path(file).stat().st_size > 0, f"File download failed: {url}"  # check
         return file
     elif file.startswith("clearml://"):  # ClearML Dataset ID
-        assert "clearml" in sys.modules, (
-            "ClearML is not installed, so cannot use ClearML dataset. Try running 'pip install clearml'."
-        )
+        assert (
+            "clearml" in sys.modules
+        ), "ClearML is not installed, so cannot use ClearML dataset. Try running 'pip install clearml'."
         return file
     else:  # search
         files = []
diff --git a/utils/loggers/clearml/clearml_utils.py b/utils/loggers/clearml/clearml_utils.py
index 2ddf3a4711a3..fc19c8cfe22a 100644
--- a/utils/loggers/clearml/clearml_utils.py
+++ b/utils/loggers/clearml/clearml_utils.py
@@ -41,9 +41,11 @@ def construct_dataset(clearml_info_string):
     with open(yaml_filenames[0]) as f:
         dataset_definition = yaml.safe_load(f)
 
-    assert set(dataset_definition.keys()).issuperset({"train", "test", "val", "nc", "names"}), (
-        "The right keys were not found in the yaml file, make sure it at least has the following keys: ('train', 'test', 'val', 'nc', 'names')"
-    )
+    assert set(
+        dataset_definition.keys()
+    ).issuperset(
+        {"train", "test", "val", "nc", "names"}
+    ), "The right keys were not found in the yaml file, make sure it at least has the following keys: ('train', 'test', 'val', 'nc', 'names')"
 
     data_dict = {
         "train": (
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index 29877faa6ce3..53e707607915 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -121,9 +121,9 @@ def select_device(device="", batch_size=0, newline=True):
         os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force torch.cuda.is_available() = False
     elif device:  # non-cpu device requested
         os.environ["CUDA_VISIBLE_DEVICES"] = device  # set environment variable - must be before assert is_available()
-        assert torch.cuda.is_available() and torch.cuda.device_count() >= len(device.replace(",", "")), (
-            f"Invalid CUDA '--device {device}' requested, use '--device cpu' or pass valid CUDA device(s)"
-        )
+        assert torch.cuda.is_available() and torch.cuda.device_count() >= len(
+            device.replace(",", "")
+        ), f"Invalid CUDA '--device {device}' requested, use '--device cpu' or pass valid CUDA device(s)"
     if not cpu and not mps and torch.cuda.is_available():  # prefer GPU if available
         devices = device.split(",") if device else "0"  # range(torch.cuda.device_count())  # i.e. 0,1,6,7
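
The compatibility pattern this patch relies on can be isolated into two small helpers. Below is a minimal sketch, assuming PyTorch >= 1.6; the helper names `autocast_ctx` and `make_grad_scaler` are illustrative and not part of this patch. The guards matter because the legacy `torch.cuda.amp.autocast()` does not accept a `device_type` argument, and the `torch.amp` namespace exposed `autocast` before it exposed `GradScaler`, so neither symbol can be assumed just because the import succeeded.

```python
# Compatibility sketch for the torch.amp / torch.cuda.amp split
# (illustrative helper names; not part of this patch).
import torch

try:
    import torch.amp as amp  # device-agnostic namespace on newer releases
    _NEW_AMP = True
except ImportError:
    import torch.cuda.amp as amp  # legacy CUDA-only namespace
    _NEW_AMP = False


def autocast_ctx(enabled: bool, device_type: str = "cuda"):
    """Return an autocast context that works with either module layout."""
    if _NEW_AMP:
        # torch.amp.autocast() requires a device_type argument
        return amp.autocast(device_type=device_type, enabled=enabled)
    # the legacy context manager rejects device_type and is CUDA-only
    return amp.autocast(enabled=enabled)


def make_grad_scaler(enabled: bool):
    """Build a GradScaler regardless of which namespace was imported."""
    if hasattr(amp, "GradScaler"):  # torch.amp gained GradScaler after autocast
        return amp.GradScaler(enabled=enabled)
    return torch.cuda.amp.GradScaler(enabled=enabled)


# Usage mirroring the training loop above:
#     scaler = make_grad_scaler(enabled=use_amp)
#     with autocast_ctx(enabled=use_amp, device_type=device.type):
#         pred = model(imgs)
```

Keeping the signature branch in one place avoids scattering version checks through the training loop; the `try`/`except ImportError` in the patch selects the namespace once, and the helpers absorb the remaining call-signature differences.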