
Commit e823338

Add Clone trait to the OptimizerAdaptor and Clone implementations to the optimizers (#1770)
1 parent f8a1356 commit e823338


10 files changed, +14 -1 lines changed

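The practical effect of the change is that a configured optimizer value, and the OptimizerAdaptor wrapping it, can be cloned and reused, for example across independent training runs. Below is a minimal, self-contained sketch of that usage pattern; SgdLike and train_run are hypothetical stand-ins for illustration only, not part of Burn's API.

// Illustrative sketch, not Burn's API: a hypothetical optimizer type that,
// like Burn's optimizers after this commit, derives Clone so one configured
// instance can be reused for several independent training runs.
#[derive(Clone)]
struct SgdLike {
    learning_rate: f64,
}

fn train_run(mut opt: SgdLike, steps: u32) -> f64 {
    // Stand-in training loop minimizing f(x) = x^2; mutates only its own clone.
    let mut param = 1.0_f64;
    for _ in 0..steps {
        let grad = 2.0 * param;
        param -= opt.learning_rate * grad;
        opt.learning_rate *= 0.99; // toy schedule, local to this clone
    }
    param
}

fn main() {
    let base = SgdLike { learning_rate: 0.1 };
    // Each run receives its own copy; the base configuration stays untouched.
    let short = train_run(base.clone(), 10);
    let long = train_run(base.clone(), 50);
    println!("short: {short:.4}, long: {long:.4}, base lr: {}", base.learning_rate);
}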

crates/burn-core/src/grad_clipping/base.rs (+1)

@@ -30,6 +30,7 @@ impl GradientClippingConfig {
 /// Gradient Clipping provides a way to mitigate exploding gradients
 /// by clipping every component of the gradient by value or by norm during
 /// backpropagation.
+#[derive(Clone)]
 pub enum GradientClipping {
     /// Clip the gradient by value.
     Value(f32),

crates/burn-core/src/optim/adagrad.rs (+2)

@@ -26,6 +26,7 @@ pub struct AdaGradConfig {
 }

 /// AdaGrad optimizer
+#[derive(Clone)]
 pub struct AdaGrad<B: Backend> {
     lr_decay: LrDecay,
     weight_decay: Option<WeightDecay<B>>,
@@ -105,6 +106,7 @@ pub struct LrDecayState<B: Backend, const D: usize> {
     sum: Tensor<B, D>,
 }

+#[derive(Clone)]
 struct LrDecay {
     lr_decay: f64,
     epsilon: f32,

crates/burn-core/src/optim/adam.rs (+2)

@@ -31,6 +31,7 @@ pub struct AdamConfig {
 }

 /// Adam optimizer as described in the paper [Adam: A Method for Stochastic Optimization](https://arxiv.org/pdf/1412.6980.pdf).
+#[derive(Clone)]
 pub struct Adam<B: Backend> {
     momentum: AdaptiveMomentum,
     weight_decay: Option<WeightDecay<B>>,
@@ -113,6 +114,7 @@ pub struct AdaptiveMomentumState<B: Backend, const D: usize> {
     moment_2: Tensor<B, D>,
 }

+#[derive(Clone)]
 struct AdaptiveMomentum {
     beta_1: f32,
     beta_2: f32,

crates/burn-core/src/optim/adamw.rs (+2)

@@ -30,6 +30,7 @@ pub struct AdamWConfig {
 }

 /// AdamW optimizer as described in the paper [Decoupled Weight Decay Regularization, Loshchilov and Hutter, 2019](https://arxiv.org/abs/1711.05101).
+#[derive(Clone)]
 pub struct AdamW<B: Backend> {
     momentum: AdaptiveMomentumW,
     weight_decay: f32,
@@ -112,6 +113,7 @@ pub struct AdaptiveMomentumWState<B: Backend, const D: usize> {
     moment_2: Tensor<B, D>,
 }

+#[derive(Clone)]
 struct AdaptiveMomentumW {
     beta_1: f32,
     beta_2: f32,

crates/burn-core/src/optim/decay.rs (+1)

@@ -20,6 +20,7 @@ pub struct WeightDecayState<B: Backend, const D: usize> {
 }

 /// Weight decay implementation that transforms gradients.
+#[derive(Clone)]
 pub struct WeightDecay<B: Backend> {
     penalty: B::FloatElem,
 }

crates/burn-core/src/optim/momentum.rs (+1)

@@ -27,6 +27,7 @@ pub struct MomentumState<B: Backend, const D: usize> {
 }

 /// Momemtum implementation that transforms gradients.
+#[derive(Clone)]
 pub struct Momentum<B: Backend> {
     momentum: B::FloatElem,
     dampening: f64,

crates/burn-core/src/optim/rmsprop.rs (+2)

@@ -64,6 +64,7 @@ impl RmsPropConfig {

 /// Optimizer that implements stochastic gradient descent with momentum.
 /// The optimizer can be configured with [RmsPropConfig](RmsPropConfig).
+#[derive(Clone)]
 pub struct RmsProp<B: Backend> {
     alpha: f32,
     // epsilon: f32,
@@ -251,6 +252,7 @@ impl<B: Backend, const D: usize> CenteredState<B, D> {

 /// [RmsPropMomentum](RmsPropMomentum) is to store config status for optimizer.
 /// (, which is stored in [optimizer](RmsProp) itself and not passed in during `step()` calculation)
+#[derive(Clone)]
 pub struct RmsPropMomentum {
     momentum: f32,
     epsilon: f32,

crates/burn-core/src/optim/sgd.rs (+1)

@@ -25,6 +25,7 @@ pub struct SgdConfig {
 /// Optimizer that implements stochastic gradient descent with momentum.
 ///
 /// The optimizer can be configured with [SgdConfig](SgdConfig).
+#[derive(Clone)]
 pub struct Sgd<B: Backend> {
     momentum: Option<Momentum<B>>,
     weight_decay: Option<WeightDecay<B>>,

crates/burn-core/src/optim/simple/adaptor.rs (+1)

@@ -11,6 +11,7 @@ use hashbrown::HashMap;

 /// Wrapper struct that adapts any [simple optimizer](SimpleOptimizer) into
 /// an [optimizer](Optimizer).
+#[derive(Clone)]
 pub struct OptimizerAdaptor<O, M, B>
 where
     O: SimpleOptimizer<B::InnerBackend>,

crates/burn-core/src/optim/simple/base.rs (+1 -1)

@@ -6,7 +6,7 @@ use burn_tensor::{backend::Backend, Tensor};
 ///
 /// Implementations don't have to handle missing gradients, loading and exporting records, navigate the
 /// module parameter structure, handle tracked and untracked tensors, and the likes.
-pub trait SimpleOptimizer<B>: Send + Sync
+pub trait SimpleOptimizer<B>: Send + Sync + Clone
 where
     B: Backend,
 {
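This trait change is what makes the derive on OptimizerAdaptor usable for every optimizer: deriving Clone on a generic wrapper only yields a usable Clone implementation when the wrapped type is itself Clone, and the new supertrait bound guarantees that for any SimpleOptimizer. A minimal sketch of the pattern follows, using hypothetical names (SimpleOpt, Adaptor, PlainSgd) rather than Burn's real definitions.

// Illustrative sketch of the supertrait pattern; names are hypothetical.
trait SimpleOpt: Clone {
    fn step(&self, grad: f64, param: f64) -> f64;
}

// Like OptimizerAdaptor in this commit, the wrapper derives Clone.
#[derive(Clone)]
struct Adaptor<O: SimpleOpt> {
    inner: O,
}

#[derive(Clone)]
struct PlainSgd {
    lr: f64,
}

impl SimpleOpt for PlainSgd {
    fn step(&self, grad: f64, param: f64) -> f64 {
        param - self.lr * grad
    }
}

fn clone_for_new_run<O: SimpleOpt>(adaptor: &Adaptor<O>) -> Adaptor<O> {
    // This compiles only because O: SimpleOpt implies O: Clone,
    // which is exactly what the added `+ Clone` supertrait bound provides.
    adaptor.clone()
}

fn main() {
    let adaptor = Adaptor { inner: PlainSgd { lr: 0.01 } };
    let copy = clone_for_new_run(&adaptor);
    println!("updated param: {}", copy.inner.step(2.0, 1.0));
}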
