From 3b6ddfce4a5526db1cfd1778b1d93bb50703e392 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Wed, 16 Oct 2024 10:49:09 -0400
Subject: [PATCH 1/2] align gptq check to transformers for supporting cpu

---
 optimum/gptq/quantizer.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 949d4d260d..4061340989 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -14,8 +14,10 @@
 # limitations under the License.
 import json
 import os
+import importlib
 from enum import Enum
 from logging import getLogger
+from packaging import version
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
@@ -320,7 +322,9 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
         """
         if not is_auto_gptq_available():
             raise RuntimeError("auto-gptq is required in order to perform quantzation : `pip install auto-gptq`")
-        if not torch.cuda.is_available():
+
+        gptq_supports_cpu = version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")
+        if not gptq_supports_cpu and not torch.cuda.is_available():
             raise RuntimeError("No GPU found. A GPU is needed to quantize model.")
 
         model.eval()
@@ -405,12 +409,13 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
 
         if not has_device_map:
             # put modules from module_name_preceding_first_block on cuda
+            to_device = "cuda:0" if torch.cuda.is_available() else "cpu"
             for module_name in self.module_name_preceding_first_block:
                 module = recurse_getattr(model, module_name)
                 if module is None:
                     raise ValueError(f"Module {module_name} was not found in model")
-                module = module.to(0)
-            blocks[0] = blocks[0].to(0)
+                module = module.to(to_device)
+            blocks[0] = blocks[0].to(to_device)
 
         def store_input_hook(_, input, *args):
             kwargs = args[0]
@@ -432,7 +437,7 @@ def store_input_hook(_, input, *args):
         for data in dataset:
             for k, v in data.items():
                 # put the data on gpu, we won't put them back to cpu
-                if not has_device_map or device.type == "cpu":
+                if (not has_device_map or device.type == "cpu") and torch.cuda.is_available():
                     data[k] = v.to(0)
                 else:
                     data[k] = v.to(device)
@@ -461,7 +466,7 @@ def store_input_hook(_, input, *args):
         for data in dataset:
             for k, v in data.items():
                 # put the data on gpu, we won't put them back to cpu
-                if not has_device_map or device.type == "cpu":
+                if (not has_device_map or device.type == "cpu") and torch.cuda.is_available():
                     data[k] = v.to(0)
                 else:
                     data[k] = v.to(device)
@@ -473,7 +478,7 @@ def store_input_hook(_, input, *args):
 
             # move block to cuda if needed
             # in case we have offload modules, we need to put them on cuda because of GPTQ object
-            if not has_device_map or get_device(block) == torch.device("cpu"):
+            if (not has_device_map or get_device(block) == torch.device("cpu")) and torch.cuda.is_available():
                 block = block.to(0)
             layers = get_layers(block)
             if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:

From 50a405a476e30e11d20d79b4971b59f22d4a4c40 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Wed, 16 Oct 2024 10:51:53 -0400
Subject: [PATCH 2/2] fix comment

---
 optimum/gptq/quantizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 4061340989..2565e7c536 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -408,7 +408,7 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
         blocks = recurse_getattr(model, self.block_name_to_quantize)
 
         if not has_device_map:
-            # put modules from module_name_preceding_first_block on cuda
+            # put modules from module_name_preceding_first_block on cuda or cpu
             to_device = "cuda:0" if torch.cuda.is_available() else "cpu"
             for module_name in self.module_name_preceding_first_block:
                 module = recurse_getattr(model, module_name)
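
The core of the first patch is a version gate: CPU-only quantization is allowed only when
the installed auto-gptq is newer than 0.4.2. A minimal standalone sketch of that guard,
assuming auto-gptq is installed; the helper name `check_quantize_device_support` is
illustrative, and `importlib.metadata` is imported explicitly here (the more robust
spelling of what the patch reaches via `import importlib`):

    import importlib.metadata

    import torch
    from packaging import version


    def check_quantize_device_support() -> None:
        # auto-gptq releases after 0.4.2 can quantize on CPU, so only
        # insist on a CUDA device for older versions.
        gptq_supports_cpu = version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")
        if not gptq_supports_cpu and not torch.cuda.is_available():
            raise RuntimeError("No GPU found. A GPU is needed to quantize model.")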
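
The remaining hunks all apply the same device-fallback pattern: prefer `cuda:0` when
available, otherwise keep tensors and blocks on CPU instead of raising. A sketch of that
pattern in isolation (`move_for_calibration` is a hypothetical name, not a function in
the patch):

    import torch
    from torch import nn


    def move_for_calibration(module: nn.Module) -> nn.Module:
        # Mirror of the patched `.to(...)` calls: use the first CUDA
        # device when present, fall back to CPU otherwise.
        to_device = "cuda:0" if torch.cuda.is_available() else "cpu"
        return module.to(to_device)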