## Code organization

## A few pointers

## Distributed training & changing config

```python
import train

# Configure this worker
train.config["n_workers"] = 4
train.config["rank"] = 0  # index of this worker, in [0, n_workers)

# Override some hyperparameters to train with PowerSGD
train.config["optimizer_scale_lr_with_factor"] = 4  # scale the learning rate with the number of workers
train.config["optimizer_reducer"] = "RankKReducer"
train.config["optimizer_reducer_rank"] = 4
train.config["optimizer_memory"] = True
train.config["optimizer_reducer_reuse_query"] = True
train.config["optimizer_reducer_n_power_iterations"] = 0

# You can customize the outputs of the training script by overriding these members
train.output_dir = "choose_a_directory"
train.log_info = your_function_pointer
train.log_metric = your_metric_function_pointer

# Start training
train.main()
```

Note that `torch.distributed` uses process-wide global state, so you cannot easily call `train.main()` multiple times in a row within the same script.
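Because of this global state, one common pattern is to start a separate Python process per worker and pass the rank in from outside. The snippet below is a minimal sketch of such a per-worker launcher, following the configuration example above; the `RANK`/`WORLD_SIZE` environment variable names and the output directory layout are illustrative assumptions, not something the training script defines itself.

```python
# run_worker.py -- minimal per-worker launcher (sketch).
# Start one copy of this script per worker, e.g.:
#   RANK=0 WORLD_SIZE=4 python run_worker.py
#   RANK=1 WORLD_SIZE=4 python run_worker.py
# The RANK / WORLD_SIZE variable names are illustrative assumptions;
# the training script itself only reads train.config.
import os

import train

train.config["n_workers"] = int(os.environ.get("WORLD_SIZE", "4"))
train.config["rank"] = int(os.environ.get("RANK", "0"))

# PowerSGD settings, as in the example above
train.config["optimizer_scale_lr_with_factor"] = train.config["n_workers"]
train.config["optimizer_reducer"] = "RankKReducer"
train.config["optimizer_reducer_rank"] = 4
train.config["optimizer_memory"] = True
train.config["optimizer_reducer_reuse_query"] = True
train.config["optimizer_reducer_n_power_iterations"] = 0

# Give each worker its own output directory (hypothetical layout)
train.output_dir = os.path.join("output", f"worker_{train.config['rank']}")

if __name__ == "__main__":
    train.main()
```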