From 3b6ddfce4a5526db1cfd1778b1d93bb50703e392 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Wed, 16 Oct 2024 10:49:09 -0400
Subject: [PATCH 1/2] align gptq check to transformers for supporting cpu

---
 optimum/gptq/quantizer.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 949d4d260d..4061340989 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -14,8 +14,10 @@
 # limitations under the License.
 import json
 import os
+import importlib
 from enum import Enum
 from logging import getLogger
+from packaging import version
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
@@ -320,7 +322,9 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
         """
         if not is_auto_gptq_available():
             raise RuntimeError("auto-gptq is required in order to perform quantzation : `pip install auto-gptq`")
-        if not torch.cuda.is_available():
+
+        gptq_supports_cpu = version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")
+        if not gptq_supports_cpu and not torch.cuda.is_available():
             raise RuntimeError("No GPU found. A GPU is needed to quantize model.")
 
         model.eval()
@@ -405,12 +409,13 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
 
         if not has_device_map:
             # put modules from module_name_preceding_first_block on cuda
+            to_device = "cuda:0" if torch.cuda.is_available() else "cpu"
             for module_name in self.module_name_preceding_first_block:
                 module = recurse_getattr(model, module_name)
                 if module is None:
                     raise ValueError(f"Module {module_name} was not found in model")
-                module = module.to(0)
-            blocks[0] = blocks[0].to(0)
+                module = module.to(to_device)
+            blocks[0] = blocks[0].to(to_device)
 
         def store_input_hook(_, input, *args):
             kwargs = args[0]
@@ -432,7 +437,7 @@ def store_input_hook(_, input, *args):
         for data in dataset:
             for k, v in data.items():
                 # put the data on gpu, we won't put them back to cpu
-                if not has_device_map or device.type == "cpu":
+                if (not has_device_map or device.type == "cpu") and torch.cuda.is_available():
                     data[k] = v.to(0)
                 else:
                     data[k] = v.to(device)
@@ -461,7 +466,7 @@ def store_input_hook(_, input, *args):
         for data in dataset:
             for k, v in data.items():
                 # put the data on gpu, we won't put them back to cpu
-                if not has_device_map or device.type == "cpu":
+                if (not has_device_map or device.type == "cpu") and torch.cuda.is_available():
                     data[k] = v.to(0)
                 else:
                     data[k] = v.to(device)
@@ -473,7 +478,7 @@ def store_input_hook(_, input, *args):
 
             # move block to cuda if needed
             # in case we have offload modules, we need to put them on cuda because of GPTQ object
-            if not has_device_map or get_device(block) == torch.device("cpu"):
+            if (not has_device_map or get_device(block) == torch.device("cpu")) and torch.cuda.is_available():
                 block = block.to(0)
             layers = get_layers(block)
             if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:

From 50a405a476e30e11d20d79b4971b59f22d4a4c40 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Wed, 16 Oct 2024 10:51:53 -0400
Subject: [PATCH 2/2] fix comment

---
 optimum/gptq/quantizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 4061340989..2565e7c536 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -408,7 +408,7 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
         blocks = recurse_getattr(model, self.block_name_to_quantize)
 
         if not has_device_map:
-            # put modules from module_name_preceding_first_block on cuda
+            # put modules from module_name_preceding_first_block on cuda or cpu
             to_device = "cuda:0" if torch.cuda.is_available() else "cpu"
             for module_name in self.module_name_preceding_first_block:
                 module = recurse_getattr(model, module_name)
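
The core of the first patch is a version gate: CPU-only quantization is allowed only when
the installed auto-gptq is newer than 0.4.2. A minimal standalone sketch of that guard,
assuming auto-gptq is installed; the helper name `check_quantize_device_support` is
illustrative, and `importlib.metadata` is imported explicitly here (the more robust
spelling of what the patch reaches via `import importlib`):

    import importlib.metadata

    import torch
    from packaging import version


    def check_quantize_device_support() -> None:
        # auto-gptq releases after 0.4.2 can quantize on CPU, so only
        # insist on a CUDA device for older versions.
        gptq_supports_cpu = version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")
        if not gptq_supports_cpu and not torch.cuda.is_available():
            raise RuntimeError("No GPU found. A GPU is needed to quantize model.")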
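
The remaining hunks all apply the same device-fallback pattern: prefer `cuda:0` when
available, otherwise keep tensors and blocks on CPU instead of raising. A sketch of that
pattern in isolation (`move_for_calibration` is a hypothetical name, not a function in
the patch):

    import torch
    from torch import nn


    def move_for_calibration(module: nn.Module) -> nn.Module:
        # Mirror of the patched `.to(...)` calls: use the first CUDA
        # device when present, fall back to CPU otherwise.
        to_device = "cuda:0" if torch.cuda.is_available() else "cpu"
        return module.to(to_device)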