1 change: 1 addition & 0 deletions modules/ui/OptimizerParamsWindow.py
@@ -198,6 +198,7 @@ def create_dynamic_ui(
'approx_mars': {'title': 'Approx MARS-M', 'tooltip': 'Enables Approximated MARS-M, a variance reduction technique. It uses the previous step\'s gradient to correct the current update, leading to lower losses and improved convergence stability. This requires additional state to store the previous gradient.', 'type': 'bool'},
'kappa_p': {'title': 'Lion-K P-value', 'tooltip': 'Controls the Lp-norm geometry of the Lion update. 1.0 = standard Lion (sign update, coordinate-wise), best for Transformers. 2.0 = spherical Lion (normalized update, rotationally invariant), best for Conv2d layers (in UNet models). Values between 1.0 and 2.0 interpolate between the two behaviors.', 'type': 'float'},
'auto_kappa_p': {'title': 'Auto Lion-K', 'tooltip': 'Automatically determines the P-value based on layer dimensions: p=2.0 (spherical) for 4D (Conv) tensors, for stability and rotational invariance, and p=1.0 (sign) for 2D (Linear) tensors, for sparsity. Overrides the manual P-value. Recommended for UNet models.', 'type': 'bool'},
'compile': {'title': 'Compiled Optimizer', 'tooltip': 'Enables PyTorch compilation of the optimizer\'s internal step logic. Intended to improve performance by letting PyTorch fuse operations and optimize the computational graph.', 'type': 'bool'},
}
# @formatter:on
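For context on the Approx MARS-M tooltip above: a minimal sketch of a MARS-style variance-reduced gradient correction, assuming the standard formulation in which the previous step's gradient corrects the current one and the result is norm-clipped. The function name, defaults, and clipping threshold are illustrative and are not adv_optm's actual code.

import torch

def approx_mars_correction(grad: torch.Tensor,
                           prev_grad: torch.Tensor,
                           beta1: float = 0.9,
                           gamma: float = 0.025) -> torch.Tensor:
    # The previous step's gradient corrects the current one before it enters
    # the momentum buffers (this is the extra state the tooltip mentions).
    c = grad + gamma * (beta1 / (1.0 - beta1)) * (grad - prev_grad)
    # Clip the corrected gradient to unit norm so the correction cannot blow up.
    norm = c.norm()
    return c / norm if norm > 1.0 else c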

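The Lion-K P-value can similarly be read as selecting which Lp norm shapes the update direction; a rough sketch under that reading (p=1 yields the coordinate-wise sign update, p=2 the L2-normalized update, values in between interpolate), not the adv_optm implementation:

import torch

def lion_k_direction(m: torch.Tensor, p: float, eps: float = 1e-12) -> torch.Tensor:
    # Direction taken as the gradient of K(x) = ||x||_p evaluated at the momentum m:
    #   p = 1.0 -> sign(m)         (standard Lion, coordinate-wise)
    #   p = 2.0 -> m / ||m||_2     (spherical Lion, rotation-invariant)
    if p <= 1.0:
        return torch.sign(m)
    norm = m.norm(p=p).clamp_min(eps)
    return torch.sign(m) * (m.abs() / norm).pow(p - 1.0)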
2 changes: 2 additions & 0 deletions modules/util/config/TrainConfig.py
@@ -143,6 +143,7 @@ class TrainOptimizerConfig(BaseConfig):
approx_mars: bool
kappa_p: float
auto_kappa_p: bool
compile: bool

def __init__(self, data: list[(str, Any, type, bool)]):
super().__init__(data)
@@ -261,6 +262,7 @@ def default_values():
data.append(("approx_mars", False, bool, False))
data.append(("kappa_p", None, float, True))
data.append(("auto_kappa_p", False, bool, False))
data.append(("compile", False, bool, False))

return TrainOptimizerConfig(data)
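The appended rows follow BaseConfig's (name, default value, type, nullable) tuple convention implied by the __init__ signature; reading the fourth field as "nullable" is an assumption here, but it matches kappa_p defaulting to None while the new compile flag is a plain bool:

# (attribute name, default value, python type, nullable) -- assumed field meanings
data.append(("kappa_p", None, float, True))    # optional float, may remain None
data.append(("compile", False, bool, False))   # non-nullable bool, defaults to False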

8 changes: 8 additions & 0 deletions modules/util/create.py
@@ -1080,6 +1080,7 @@ def create_optimizer(
alpha=optimizer_config.alpha if optimizer_config.alpha is not None else 5,
kourkoutas_beta=optimizer_config.kourkoutas_beta if optimizer_config.kourkoutas_beta is not None else False,
k_warmup_steps=optimizer_config.k_warmup_steps if optimizer_config.k_warmup_steps is not None else 0,
compiled_optimizer=optimizer_config.compile if optimizer_config.compile is not None else False,
)

# ADOPT_ADV Optimizer
@@ -1106,6 +1107,7 @@ def create_optimizer(
alpha_grad=optimizer_config.alpha_grad if optimizer_config.alpha_grad is not None else 100,
kourkoutas_beta=optimizer_config.kourkoutas_beta if optimizer_config.kourkoutas_beta is not None else False,
k_warmup_steps=optimizer_config.k_warmup_steps if optimizer_config.k_warmup_steps is not None else 0,
compiled_optimizer=optimizer_config.compile if optimizer_config.compile is not None else False,
)

# PRODIGY_ADV Optimizer
@@ -1139,6 +1141,7 @@ def create_optimizer(
alpha_grad=optimizer_config.alpha_grad if optimizer_config.alpha_grad is not None else 100,
kourkoutas_beta=optimizer_config.kourkoutas_beta if optimizer_config.kourkoutas_beta is not None else False,
k_warmup_steps=optimizer_config.k_warmup_steps if optimizer_config.k_warmup_steps is not None else 0,
compiled_optimizer=optimizer_config.compile if optimizer_config.compile is not None else False,
)

# SIMPLIFIED_AdEMAMix Optimizer
@@ -1161,6 +1164,7 @@ def create_optimizer(
orthogonal_gradient=optimizer_config.orthogonal_gradient if optimizer_config.orthogonal_gradient is not None else False,
kourkoutas_beta=optimizer_config.kourkoutas_beta if optimizer_config.kourkoutas_beta is not None else False,
k_warmup_steps=optimizer_config.k_warmup_steps if optimizer_config.k_warmup_steps is not None else 0,
compiled_optimizer=optimizer_config.compile if optimizer_config.compile is not None else False,
)

# LION_ADV Optimizer
@@ -1180,6 +1184,7 @@ def create_optimizer(
orthogonal_gradient=optimizer_config.orthogonal_gradient if optimizer_config.orthogonal_gradient is not None else False,
kappa_p=optimizer_config.kappa_p if optimizer_config.kappa_p is not None else 1.0,
auto_kappa_p=optimizer_config.auto_kappa_p if optimizer_config.auto_kappa_p is not None else False,
compiled_optimizer=optimizer_config.compile if optimizer_config.compile is not None else False,
)

# LION_PRODIGY_ADV Optimizer
@@ -1206,6 +1211,7 @@ def create_optimizer(
orthogonal_gradient=optimizer_config.orthogonal_gradient if optimizer_config.orthogonal_gradient is not None else False,
kappa_p=optimizer_config.kappa_p if optimizer_config.kappa_p is not None else 1.0,
auto_kappa_p=optimizer_config.auto_kappa_p if optimizer_config.auto_kappa_p is not None else False,
compiled_optimizer=optimizer_config.compile if optimizer_config.compile is not None else False,
)

# MUON_ADV Optimizer
@@ -1254,6 +1260,7 @@ def create_optimizer(
accelerated_ns=optimizer_config.accelerated_ns if optimizer_config.accelerated_ns is not None else False,
orthogonal_gradient=optimizer_config.orthogonal_gradient if optimizer_config.orthogonal_gradient is not None else False,
approx_mars=optimizer_config.approx_mars if optimizer_config.approx_mars is not None else False,
compiled_optimizer=optimizer_config.compile if optimizer_config.compile is not None else False,
**adam_kwargs
)

@@ -1307,6 +1314,7 @@ def create_optimizer(
accelerated_ns=optimizer_config.accelerated_ns if optimizer_config.accelerated_ns is not None else False,
orthogonal_gradient=optimizer_config.orthogonal_gradient if optimizer_config.orthogonal_gradient is not None else False,
approx_mars=optimizer_config.approx_mars if optimizer_config.approx_mars is not None else False,
compiled_optimizer=optimizer_config.compile if optimizer_config.compile is not None else False,
**adam_kwargs
)
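Every adv_optm branch above forwards the UI's compile flag as compiled_optimizer. The diff does not show how the optimizers consume it; a hedged sketch of one plausible way follows, where only torch.compile itself is real API and the helper name and wrapping point are assumptions:

import torch

def maybe_compile_step(step_fn, compiled_optimizer: bool):
    # If requested, let torch.compile trace the element-wise update math so
    # PyTorch can fuse it into fewer kernels; otherwise leave the step as-is.
    if not compiled_optimizer:
        return step_fn
    # fullgraph=False tolerates graph breaks (e.g. occasional .item() calls).
    return torch.compile(step_fn, fullgraph=False)

Note that torch.compile pays a one-time compilation cost on the first step; any speedup appears on subsequent steps.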

8 changes: 8 additions & 0 deletions modules/util/optimizer_util.py
@@ -457,6 +457,7 @@ def init_model_parameters(
"use_bias_correction": True,
"nnmf_factor": False,
"stochastic_rounding": True,
"compile": False,
"fused_back_pass": False,
"use_atan2": False,
"cautious_mask": False,
@@ -476,6 +477,7 @@ def init_model_parameters(
"weight_decay": 0.0,
"nnmf_factor": False,
"stochastic_rounding": True,
"compile": False,
"fused_back_pass": False,
"use_atan2": False,
"cautious_mask": False,
@@ -498,6 +500,7 @@ def init_model_parameters(
"weight_decay": 0.0,
"nnmf_factor": False,
"stochastic_rounding": True,
"compile": False,
"fused_back_pass": False,
"d0": 1e-6,
"d_coef": 1.0,
@@ -529,6 +532,7 @@ def init_model_parameters(
"use_bias_correction": True,
"nnmf_factor": False,
"stochastic_rounding": True,
"compile": False,
"fused_back_pass": False,
"orthogonal_gradient": False,
"kourkoutas_beta": False,
@@ -542,6 +546,7 @@ def init_model_parameters(
"clip_threshold": None,
"nnmf_factor": False,
"stochastic_rounding": True,
"compile": False,
"fused_back_pass": False,
"cautious_mask": False,
"orthogonal_gradient": False,
@@ -557,6 +562,7 @@ def init_model_parameters(
"clip_threshold": None,
"nnmf_factor": False,
"stochastic_rounding": True,
"compile": False,
"fused_back_pass": False,
"d0": 1e-6,
"d_coef": 1.0,
@@ -580,6 +586,7 @@ def init_model_parameters(
"rms_rescaling": True,
"nnmf_factor": False,
"stochastic_rounding": True,
"compile": False,
"fused_back_pass": False,
"MuonWithAuxAdam": True,
"muon_hidden_layers": None,
@@ -610,6 +617,7 @@ def init_model_parameters(
"rms_rescaling": True,
"nnmf_factor": False,
"stochastic_rounding": True,
"compile": False,
"fused_back_pass": False,
"MuonWithAuxAdam": True,
"muon_hidden_layers": None,
4 changes: 2 additions & 2 deletions requirements-global.txt
@@ -41,7 +41,7 @@ prodigyopt==1.1.2 # prodigy optimizer
schedulefree==1.4.1 # schedule-free optimizers
pytorch_optimizer==3.6.0 # pytorch optimizers
prodigy-plus-schedule-free==2.0.1 # Prodigy plus optimizer
adv_optm==1.4.0 # advanced optimizers
adv_optm==2.0.1 # advanced optimizers
-e git+https://github.com/KellerJordan/Muon.git@f90a42b#egg=muon-optimizer

# Profiling
@@ -56,5 +56,5 @@ fabric==3.2.2

# debug
psutil==7.0.0
requests==2.32.3
requests==2.32.5
deepdiff==8.6.1 # outputs an easy-to-read diff for troubleshooting