24 changes: 19 additions & 5 deletions modules/module/LoRAModule.py
@@ -346,6 +346,7 @@ class OFTModule(PeftBase):
block_share: bool
dropout_probability: float
adjustment_info: tuple[int, int] | None # for reporting
+oft_scale: float

def __init__(self, prefix: str, orig_module: nn.Module | None, oft_block_size: int, coft: bool, coft_eps: float, block_share: bool, **kwargs):
super().__init__(prefix, orig_module)
@@ -357,6 +358,7 @@ def __init__(self, prefix: str, orig_module: nn.Module | None, oft_block_size: i
self.dropout_probability = kwargs.pop('dropout_probability', 0.0)
self.oft_R = None
self.adjustment_info = None
+self.oft_scale = 1.0


if orig_module is not None:
@@ -433,12 +435,16 @@ def forward(self, x, *args, **kwargs):

# For Linear layers, rotating the input is mathematically equivalent to rotating the weights.
if isinstance(self.orig_module, nn.Linear):
-rotated_x = self.oft_R(x)
+rotated_x = self.oft_R(x, scale=self.oft_scale)
return self.orig_forward(rotated_x, *args, **kwargs)

# For Conv2d, we must rotate the weights, not the input, to preserve spatial information.
+
+# Apply the scale factor to the rotation parameters before the Cayley transform
+effective_weight = self.oft_R.weight * self.oft_scale if self.oft_scale != 1.0 else self.oft_R.weight
+
orth_rotate = self.oft_R._cayley_batch(
-    self.oft_R.weight, self.oft_R.block_size, self.oft_R.use_cayley_neumann, self.oft_R.num_cayley_neumann_terms
+    effective_weight, self.oft_R.block_size, self.oft_R.use_cayley_neumann, self.oft_R.num_cayley_neumann_terms
)
orth_rotate = self.oft_R.dropout(orth_rotate)
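
A quick sanity check of the "mathematically equivalent" claim in the Linear branch above. This is a standalone sketch and not part of the diff; the tensor shapes and the plain Cayley construction of R are illustrative assumptions, not the module's actual code path.

    # Sketch: for nn.Linear, rotating the input by an orthogonal R gives the same
    # output as folding R into the weight matrix along its input dimension.
    import torch

    torch.manual_seed(0)
    in_features, out_features, batch = 8, 4, 3
    linear = torch.nn.Linear(in_features, out_features, bias=False)
    x = torch.randn(batch, in_features)

    # Build an orthogonal R from a skew-symmetric matrix via the basic Cayley transform.
    A = torch.randn(in_features, in_features)
    S = A - A.T                                   # skew-symmetric
    I = torch.eye(in_features)
    R = (I + S) @ torch.linalg.inv(I - S)         # orthogonal rotation

    y_rotated_input = linear(x @ R.T)             # rotate the input, then apply W
    y_rotated_weight = x @ (linear.weight @ R).T  # rotate W instead
    assert torch.allclose(y_rotated_input, y_rotated_weight, atol=1e-5)

For Conv2d the input also carries spatial dimensions, so the rotation is applied to the weights instead, which is what the _cayley_batch path above does.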

@@ -654,9 +660,17 @@ def __create_modules(self, orig_module: nn.Module | None, config: TrainConfig) -
prefixed_name = (self.prefix + "." + name) if self.prefix != "" else name
lora_module = self.klass(prefixed_name, child_module, *self.additional_args, **self.additional_kwargs)
lora_modules[name] = lora_module
-if self.peft_type == PeftType.OFT_2 and lora_module.adjustment_info:
-    old, new = lora_module.adjustment_info
-    oft_adjustments.append({'old': old, 'new': new})
+if self.peft_type == PeftType.OFT_2:
+    if lora_module.adjustment_info:
+        old, new = lora_module.adjustment_info
+        oft_adjustments.append({'old': old, 'new': new})
+
+    if config.oft_linear_scaling:
+        # Normalize against the configured block size:
+        # if the actual block size is smaller than configured, scale > 1.0 (boosts small blocks);
+        # if it is larger than configured, scale < 1.0 (damps large blocks).
+        lora_module.oft_scale = config.oft_block_size / lora_module.oft_block_size
+
selected.append(name)
else:
deselected.append(name)
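
For reference, the scale computed above behaves as follows. The numbers are purely illustrative and not taken from the diff:

    # Toy example: how oft_scale behaves when a layer's actual block size differs
    # from a configured oft_block_size of 8.
    configured_block_size = 8
    for actual_block_size in (4, 8, 16):
        scale = configured_block_size / actual_block_size
        print(f"actual={actual_block_size:2d}  oft_scale={scale:.2f}")
    # actual= 4  oft_scale=2.00   (smaller block than configured: boosted)
    # actual= 8  oft_scale=1.00   (matches the config: unchanged)
    # actual=16  oft_scale=0.50   (larger block than configured: damped)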
7 changes: 5 additions & 2 deletions modules/module/oft_utils.py
@@ -134,7 +134,7 @@ def _project_batch(self, Q, coft_eps=1e-4):

return self._pytorch_skew_symmetric_inv(out, self.block_size)

-def forward(self, x):
+def forward(self, x, scale: float = 1.0):
required_dtype = x.dtype
if required_dtype != self.weight.dtype:
x = x.to(self.weight.dtype)
@@ -145,8 +145,11 @@ def forward(self, x):
with torch.no_grad():
self.weight.copy_(self._project_batch(self.weight, coft_eps=self.coft_eps))

+# Apply scaling to the weight (Q matrix) before the Cayley transform
+effective_weight = self.weight * scale if scale != 1.0 else self.weight
+
orth_rotate = self._cayley_batch(
-    self.weight, self.block_size, self.use_cayley_neumann, self.num_cayley_neumann_terms
+    effective_weight, self.block_size, self.use_cayley_neumann, self.num_cayley_neumann_terms
)
orth_rotate = self.dropout(orth_rotate)
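
Why scaling the skew-symmetric weight before the Cayley transform changes the update strength: for a single 2x2 block, the resulting rotation angle grows roughly linearly with the scale while the parameters are small. A simplified sketch, not part of the diff, using the plain Cayley formula rather than the module's _cayley_batch / Neumann-series variant:

    # Sketch: scaling the skew-symmetric parameter Q before the Cayley transform
    # scales the effective rotation angle (approximately, while Q is small).
    import torch

    def cayley(Q: torch.Tensor) -> torch.Tensor:
        I = torch.eye(Q.shape[-1], dtype=Q.dtype)
        return (I + Q) @ torch.linalg.inv(I - Q)

    Q = torch.tensor([[0.0, -0.1], [0.1, 0.0]])   # small skew-symmetric 2x2 block

    for scale in (0.5, 1.0, 2.0):
        R = cayley(scale * Q)
        angle = torch.atan2(R[1, 0], R[0, 0])     # rotation angle encoded by the block
        print(f"scale={scale:.1f}  angle={angle.item():.4f} rad")
    # scale=0.5  angle=0.0999 rad
    # scale=1.0  angle=0.1993 rad
    # scale=2.0  angle=0.3948 rad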

5 changes: 5 additions & 0 deletions modules/ui/LoraTab.py
@@ -135,6 +135,11 @@ def setup_lora(self, peft_type: PeftType):
tooltip="Share the OFT parameters between blocks. A single rotation matrix is shared across all blocks within a layer, drastically cutting the number of trainable parameters and yielding very compact adapter files, potentially improving generalization but at the cost of significant expressiveness, which can lead to underfitting on more complex or diverse tasks.")
components.switch(master, 3, 4, self.ui_state, "oft_block_share")

+# Linear Scaling
+components.label(master, 4, 3, "Linear Scaling",
+                 tooltip="Automatically scales the OFT rotation weights of layers whose actual block size differs from the configured one, using a linear factor (configured block size / actual block size). This keeps layers with different block sizes at a similar effective rotation strength and learning rate, so their updates neither explode nor become negligible during training due to block-size mismatches.")
+components.switch(master, 4, 4, self.ui_state, "oft_linear_scaling")
+
# Dropout Percentage
components.label(master, 2, 0, "Dropout Probability",
tooltip="Dropout probability. This percentage of the rotated adapter nodes that will be randomly restored to the base model initial statue. Helps with overfitting. 0 disables, 1 maximum.")
2 changes: 2 additions & 0 deletions modules/util/config/TrainConfig.py
@@ -532,6 +532,7 @@ class TrainConfig(BaseConfig):
oft_coft: bool
coft_eps: float
oft_block_share: bool
+oft_linear_scaling: bool

# optimizer
optimizer: TrainOptimizerConfig
@@ -1160,6 +1161,7 @@ def default_values() -> 'TrainConfig':
data.append(("oft_coft", False, bool, False))
data.append(("coft_eps", 1e-4, float, False))
data.append(("oft_block_share", False, bool, False))
data.append(("oft_linear_scaling", False, bool, False))

# optimizer
data.append(("optimizer", TrainOptimizerConfig.default_values(), TrainOptimizerConfig, False))