[Bugfix] DisableKVCache Context #834

Open · wants to merge 9 commits into base: main
Changes from 3 commits
81 changes: 40 additions & 41 deletions src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -21,6 +21,7 @@
 from llmcompressor.modifiers.utils.layer_compressor import LayerCompressor
 from llmcompressor.modifiers.utils.pytorch_helpers import run_calibration_forward
 from llmcompressor.utils.fsdp.context import fix_fsdp_module_name
+from llmcompressor.utils.helpers import DisableKVCache
 from llmcompressor.utils.pytorch.module import (
     get_layers,
     get_no_split_params,
@@ -286,48 +287,46 @@ def apply_compression(
         # want to calibrate wrt to these
         self.model.apply(disable_quantization)

-        forward_pass_use_cache = self.model.config.use_cache
-        self.model.config.use_cache = False
+        with DisableKVCache(self.model):
+            # in non-sequential mode we run calibration through the full model
+            # in sequential mode we run calibration up to the first transformer target
+            intermediates = run_calibration_forward(
+                self.model, dataloader, mask_padding=True
+            )
+            self.layer_compressors_[0].clear_early_stop()

-        # in non-sequential mode we run calibration through the full model
-        # in sequential mode we run calibration up to the first transformer target
-        intermediates = run_calibration_forward(
-            self.model, dataloader, mask_padding=True
-        )
-        self.layer_compressors_[0].clear_early_stop()
-
-        # empty cache if not using sequential update
-        if not self.sequential_update:
-            del intermediates
-            gc.collect()
-            torch.cuda.empty_cache()
-
-        num_layers = len(self.compressible_layers_)
-        for idx, layer_compressor in enumerate(self.layer_compressors_):
-            logger.info(f"\n===== Compressing layer {idx+1}/{num_layers} " " =====")
-
-            if self.sequential_update:
-                # in sequential mode we run the forward pass for each transformer layer
-                # one at a time, caching the intermediate outputs between layers
-                logger.info(f"Calibrating {layer_compressor.name}...")
-                layer_compressor.pre_compress()
-                unquantized_outputs = layer_compressor.calibrate_layer(intermediates)
-
-            layer_compressor.compress()
-            layer_compressor.post_compress()
-            layer_compressor.revert_layer_wrappers()
-
-            if self.sequential_update:
-                quantized_outputs = layer_compressor.calibrate_layer(intermediates)
-                error = get_output_error(unquantized_outputs, quantized_outputs)
-                logger.info(f"Mean output error from quantization: {error:.3f}")
-                intermediates = quantized_outputs
-                del unquantized_outputs
-
-            gc.collect()
-            torch.cuda.empty_cache()
-
-        self.model.config.use_cache = forward_pass_use_cache
+            # empty cache if not using sequential update
+            if not self.sequential_update:
+                del intermediates
+                gc.collect()
+                torch.cuda.empty_cache()
+
+            num_layers = len(self.compressible_layers_)
+            for idx, layer_compressor in enumerate(self.layer_compressors_):
+                logger.info(f"\n===== Compressing layer {idx+1}/{num_layers} " " =====")
+
+                if self.sequential_update:
+                    # in sequential mode we run the forward pass for each layer
+                    # one at a time, caching the intermediate outputs between layers
+                    logger.info(f"Calibrating {layer_compressor.name}...")
+                    layer_compressor.pre_compress()
+                    unquantized_outputs = layer_compressor.calibrate_layer(
+                        intermediates
+                    )
+
+                layer_compressor.compress()
+                layer_compressor.post_compress()
+                layer_compressor.revert_layer_wrappers()
+
+                if self.sequential_update:
+                    quantized_outputs = layer_compressor.calibrate_layer(intermediates)
+                    error = get_output_error(unquantized_outputs, quantized_outputs)
+                    logger.info(f"Mean output error from quantization: {error:.3f}")
+                    intermediates = quantized_outputs
+                    del unquantized_outputs
+
+                gc.collect()
+                torch.cuda.empty_cache()

         # re-enable quantization
         self.model.apply(enable_quantization)
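
The net effect of this hunk: the manual save/restore of `config.use_cache` is replaced by a context manager, which also restores the flag if calibration raises. A minimal sketch of the equivalent try/finally pattern (the function and argument names are illustrative, not part of this PR):

def run_without_kv_cache(model, run_calibration):
    # Rough equivalent of `with DisableKVCache(model): run_calibration()`,
    # assuming the config exposes `use_cache` directly (the real helper also
    # checks `config.text_config` for multimodal configs such as MllamaConfig).
    restore_value = model.config.use_cache
    model.config.use_cache = False
    try:
        return run_calibration()  # e.g. the GPTQ calibration loop above
    finally:
        model.config.use_cache = restore_value  # runs on success and on error
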
29 changes: 29 additions & 0 deletions src/llmcompressor/utils/helpers.py
@@ -22,6 +22,7 @@
 from urllib.parse import urlparse

 import numpy
+import torch
 from loguru import logger

 __all__ = [
@@ -59,6 +60,7 @@
     "is_package_available",
     "import_from_path",
     "getattr_chain",
+    "DisableKVCache",
 ]


@@ -1041,3 +1043,30 @@ def getattr_chain(obj: Any, chain_str: str, *args, **kwargs) -> Any:
         res = getattr(res, attr_name)

     return res
+
+
+class DisableKVCache:
+    def __init__(self, model: torch.nn.Module):
+        if hasattr(model.config, "use_cache"):
+            self.config = model.config
+
+        # MllamaConfig
+        elif hasattr(model.config, "text_config") and hasattr(
+            model.config.text_config, "use_cache"
+        ):
+            self.config = model.config.text_config
+
+        # unknown config structure
+        else:
+            raise NotImplementedError(
+                f"Cannot find `use_cache` for config of type {type(model.config)}"
+            )
+
+        self.restore_value = self.config.use_cache
+
+    def __enter__(self):
+        self.restore_value = self.config.use_cache
+        self.config.use_cache = False
+
+    def __exit__(self, _exc_type, _exc_val, _exc_tb):
+        self.config.use_cache = self.restore_value
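
A minimal usage sketch of the new context manager outside the GPTQ flow (the checkpoint name is illustrative and not taken from this PR): on entry it sets `use_cache = False` on the resolved config, falling back to `config.text_config` for multimodal configs such as MllamaConfig, and on exit it restores the original value.

from transformers import AutoModelForCausalLM

from llmcompressor.utils.helpers import DisableKVCache

# Illustrative checkpoint; any HF model whose config (or config.text_config)
# exposes `use_cache` works.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

print(model.config.use_cache)      # original value, typically True
with DisableKVCache(model):
    print(model.config.use_cache)  # False while inside the context
print(model.config.use_cache)      # restored to the original value on exit
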