24 changes: 19 additions & 5 deletions modules/module/LoRAModule.py
@@ -346,6 +346,7 @@ class OFTModule(PeftBase):
block_share: bool
dropout_probability: float
adjustment_info: tuple[int, int] | None # for reporting
+oft_scale: float

def __init__(self, prefix: str, orig_module: nn.Module | None, oft_block_size: int, coft: bool, coft_eps: float, block_share: bool, **kwargs):
super().__init__(prefix, orig_module)
@@ -357,6 +358,7 @@ def __init__(self, prefix: str, orig_module: nn.Module | None, oft_block_size: i
self.dropout_probability = kwargs.pop('dropout_probability', 0.0)
self.oft_R = None
self.adjustment_info = None
+self.oft_scale = 1.0


if orig_module is not None:
@@ -433,12 +435,16 @@ def forward(self, x, *args, **kwargs):

# For Linear layers, rotating the input is mathematically equivalent to rotating the weights.
if isinstance(self.orig_module, nn.Linear):
-rotated_x = self.oft_R(x)
+rotated_x = self.oft_R(x, scale=self.oft_scale)
return self.orig_forward(rotated_x, *args, **kwargs)

# For Conv2d, we must rotate the weights, not the input, to preserve spatial information.
+
+# Apply the scale factor to the rotation parameters before the Cayley transform
+effective_weight = self.oft_R.weight * self.oft_scale if self.oft_scale != 1.0 else self.oft_R.weight
+
orth_rotate = self.oft_R._cayley_batch(
-    self.oft_R.weight, self.oft_R.block_size, self.oft_R.use_cayley_neumann, self.oft_R.num_cayley_neumann_terms
+    effective_weight, self.oft_R.block_size, self.oft_R.use_cayley_neumann, self.oft_R.num_cayley_neumann_terms
)
orth_rotate = self.oft_R.dropout(orth_rotate)
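
A quick sanity check of the "mathematically equivalent" claim in the Linear branch above. This is a standalone sketch and not part of the diff; the tensor shapes and the plain Cayley construction of R are illustrative assumptions, not the module's actual code path.

    # Sketch: for nn.Linear, rotating the input by an orthogonal R gives the same
    # output as folding R into the weight matrix along its input dimension.
    import torch

    torch.manual_seed(0)
    in_features, out_features, batch = 8, 4, 3
    linear = torch.nn.Linear(in_features, out_features, bias=False)
    x = torch.randn(batch, in_features)

    # Build an orthogonal R from a skew-symmetric matrix via the basic Cayley transform.
    A = torch.randn(in_features, in_features)
    S = A - A.T                                   # skew-symmetric
    I = torch.eye(in_features)
    R = (I + S) @ torch.linalg.inv(I - S)         # orthogonal rotation

    y_rotated_input = linear(x @ R.T)             # rotate the input, then apply W
    y_rotated_weight = x @ (linear.weight @ R).T  # rotate W instead
    assert torch.allclose(y_rotated_input, y_rotated_weight, atol=1e-5)

For Conv2d the input also carries spatial dimensions, so the rotation is applied to the weights instead, which is what the _cayley_batch path above does.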

@@ -654,9 +660,17 @@ def __create_modules(self, orig_module: nn.Module | None, config: TrainConfig) -
prefixed_name = (self.prefix + "." + name) if self.prefix != "" else name
lora_module = self.klass(prefixed_name, child_module, *self.additional_args, **self.additional_kwargs)
lora_modules[name] = lora_module
-if self.peft_type == PeftType.OFT_2 and lora_module.adjustment_info:
-    old, new = lora_module.adjustment_info
-    oft_adjustments.append({'old': old, 'new': new})
+if self.peft_type == PeftType.OFT_2:
+    if lora_module.adjustment_info:
+        old, new = lora_module.adjustment_info
+        oft_adjustments.append({'old': old, 'new': new})
+
+    if config.oft_linear_scaling:
+        # Normalize against the configured block size:
+        # if the actual block size is smaller than configured, scale > 1.0 (boosts small blocks);
+        # if it is larger than configured, scale < 1.0 (damps large blocks).
+        lora_module.oft_scale = config.oft_block_size / lora_module.oft_block_size
+
selected.append(name)
else:
deselected.append(name)
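
For reference, the scale computed above behaves as follows. The numbers are purely illustrative and not taken from the diff:

    # Toy example: how oft_scale behaves when a layer's actual block size differs
    # from a configured oft_block_size of 8.
    configured_block_size = 8
    for actual_block_size in (4, 8, 16):
        scale = configured_block_size / actual_block_size
        print(f"actual={actual_block_size:2d}  oft_scale={scale:.2f}")
    # actual= 4  oft_scale=2.00   (smaller block than configured: boosted)
    # actual= 8  oft_scale=1.00   (matches the config: unchanged)
    # actual=16  oft_scale=0.50   (larger block than configured: damped)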
7 changes: 5 additions & 2 deletions modules/module/oft_utils.py
@@ -134,7 +134,7 @@ def _project_batch(self, Q, coft_eps=1e-4):

return self._pytorch_skew_symmetric_inv(out, self.block_size)

-def forward(self, x):
+def forward(self, x, scale: float = 1.0):
required_dtype = x.dtype
if required_dtype != self.weight.dtype:
x = x.to(self.weight.dtype)
@@ -145,8 +145,11 @@ def forward(self, x):
with torch.no_grad():
self.weight.copy_(self._project_batch(self.weight, coft_eps=self.coft_eps))

+# Apply scaling to the weight (Q matrix) before the Cayley transform
+effective_weight = self.weight * scale if scale != 1.0 else self.weight
+
orth_rotate = self._cayley_batch(
-    self.weight, self.block_size, self.use_cayley_neumann, self.num_cayley_neumann_terms
+    effective_weight, self.block_size, self.use_cayley_neumann, self.num_cayley_neumann_terms
)
orth_rotate = self.dropout(orth_rotate)
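
Why scaling the skew-symmetric weight before the Cayley transform changes the update strength: for a single 2x2 block, the resulting rotation angle grows roughly linearly with the scale while the parameters are small. A simplified sketch, not part of the diff, using the plain Cayley formula rather than the module's _cayley_batch / Neumann-series variant:

    # Sketch: scaling the skew-symmetric parameter Q before the Cayley transform
    # scales the effective rotation angle (approximately, while Q is small).
    import torch

    def cayley(Q: torch.Tensor) -> torch.Tensor:
        I = torch.eye(Q.shape[-1], dtype=Q.dtype)
        return (I + Q) @ torch.linalg.inv(I - Q)

    Q = torch.tensor([[0.0, -0.1], [0.1, 0.0]])   # small skew-symmetric 2x2 block

    for scale in (0.5, 1.0, 2.0):
        R = cayley(scale * Q)
        angle = torch.atan2(R[1, 0], R[0, 0])     # rotation angle encoded by the block
        print(f"scale={scale:.1f}  angle={angle.item():.4f} rad")
    # scale=0.5  angle=0.0999 rad
    # scale=1.0  angle=0.1993 rad
    # scale=2.0  angle=0.3948 rad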

5 changes: 5 additions & 0 deletions modules/ui/LoraTab.py
@@ -135,6 +135,11 @@ def setup_lora(self, peft_type: PeftType):
tooltip="Share the OFT parameters between blocks. A single rotation matrix is shared across all blocks within a layer, drastically cutting the number of trainable parameters and yielding very compact adapter files, potentially improving generalization but at the cost of significant expressiveness, which can lead to underfitting on more complex or diverse tasks.")
components.switch(master, 3, 4, self.ui_state, "oft_block_share")

+# Linear Scaling
+components.label(master, 4, 3, "Linear Scaling",
+                 tooltip="Automatically scales the OFT rotation weights of layers whose actual block size differs from the configured one, using a linear factor (configured block size / actual block size). This keeps layers with different block sizes at a similar effective rotation strength and learning rate, so their updates neither explode nor become negligible during training due to block-size mismatches.")
+components.switch(master, 4, 4, self.ui_state, "oft_linear_scaling")
+
# Dropout Percentage
components.label(master, 2, 0, "Dropout Probability",
tooltip="Dropout probability. This percentage of the rotated adapter nodes that will be randomly restored to the base model initial statue. Helps with overfitting. 0 disables, 1 maximum.")
2 changes: 2 additions & 0 deletions modules/util/config/TrainConfig.py
@@ -532,6 +532,7 @@ class TrainConfig(BaseConfig):
oft_coft: bool
coft_eps: float
oft_block_share: bool
+oft_linear_scaling: bool

# optimizer
optimizer: TrainOptimizerConfig
@@ -1160,6 +1161,7 @@ def default_values() -> 'TrainConfig':
data.append(("oft_coft", False, bool, False))
data.append(("coft_eps", 1e-4, float, False))
data.append(("oft_block_share", False, bool, False))
data.append(("oft_linear_scaling", False, bool, False))

# optimizer
data.append(("optimizer", TrainOptimizerConfig.default_values(), TrainOptimizerConfig, False))