6 changes: 5 additions & 1 deletion qllm/auto_model_quantization.py
@@ -58,6 +58,7 @@ def eval_model(self, model, pack_mode, dev):

inputs = self.tokenizer(
"compared with awq, gptq is", return_tensors="pt").to(model.device)
inputs["pad_token_id"] = self.tokenizer.eos_token_id
out = model.generate(**inputs, max_length=50)

# from .plugin import perplexity_utils
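A minimal sketch of what the added pad_token_id line buys, assuming a standard Hugging Face model/tokenizer pair ("facebook/opt-125m" is only an illustrative checkpoint, not part of this PR): keys added to the encoded inputs are forwarded as keyword arguments to generate, which silences the "Setting pad_token_id to eos_token_id" notice when the tokenizer defines no pad token.

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")   # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

inputs = tokenizer("compared with awq, gptq is", return_tensors="pt").to(model.device)
inputs["pad_token_id"] = tokenizer.eos_token_id  # forwarded via generate(**inputs)
out = model.generate(**inputs, max_length=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))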
@@ -76,6 +77,9 @@ def eval_model(self, model, pack_mode, dev):

# TODO: perform packing on GPU
def pack_model(self, model, quantizers, pack_mode):
if not quantizers:
logger.warning("No quantized layers found, skipping packing. If you are not using VPTQ, please check the log.")
return model
attention_layers = find_layers(model, self.quant_layers+[ScaledLinear])
attention_layers = {n: attention_layers[n] for n in quantizers}

@@ -239,7 +243,7 @@ def run(self, args):
if args.save:
def repack_func(): return self.repack_to_new_mode(model, args.pack_mode)
AutoQuantizedModelForCausalLM.save_pretrained(model, self.tokenizer, args.save,
args.pack_mode, repack_func, safe_serialization=False)
args.pack_mode, repack_func)

if args.eval:
self.eval_model(model, args.pack_mode, "cuda")
6 changes: 3 additions & 3 deletions qllm/modeling/base.py
@@ -318,7 +318,7 @@ def from_quantized(
return model

@staticmethod
def save_pretrained(model, tokenizer, save_directory: Union[str, Path], pack_mode: str, repack_func, safe_serialization: bool = False):
def save_pretrained(model, tokenizer, save_directory: Union[str, Path], pack_mode: str, repack_func, safe_serialization: bool = True):
quant_config_by_layer, quant_config = model.quant_config_by_layer, model.quant_config
if pack_mode != quant_config.version and pack_mode != "AUTO":
repack_func()
@@ -327,6 +327,6 @@ def save_pretrained(model, tokenizer, save_directory: Union[str, Path], pack_mod
tokenizer is not None and tokenizer.save_pretrained(save_directory)

with open(save_directory + "/quant_config_by_layer.json", 'w') as fp:
fp.write(json.dumps(quant_config_by_layer))
fp.write(json.dumps(quant_config_by_layer, indent=4))
with open(save_directory + "/quantize_config.json", 'w') as fp:
fp.write(json.dumps(quant_config.to_dict()))
fp.write(json.dumps(quant_config.to_dict(), indent=4))
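Given the new safe_serialization default above, callers that previously relied on the implicit .bin output now have to opt out explicitly. A hedged sketch (the import path and the surrounding objects are assumed, mirroring run() in auto_model_quantization.py):

from qllm.modeling import AutoQuantizedModelForCausalLM  # import path assumed

def save_quantized(model, tokenizer, save_dir, pack_mode, repack_func):
    # Writes safetensors with the new default.
    AutoQuantizedModelForCausalLM.save_pretrained(model, tokenizer, save_dir, pack_mode, repack_func)
    # Uncomment to keep the legacy torch .bin serialization instead:
    # AutoQuantizedModelForCausalLM.save_pretrained(model, tokenizer, save_dir, pack_mode,
    #                                               repack_func, safe_serialization=False)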
6 changes: 5 additions & 1 deletion qllm/modeling/config.py
@@ -75,6 +75,8 @@ def load_quant_op_config(self, model_name_or_path):
with open(Path(model_name_or_path) / "quant_config_by_layer.json") as fp:
qunat_info = json.load(fp)
self.quant_config_by_op = qunat_info
if self.quant_method == "vptq":
self.quant_config_by_op = self.quant_config.get('config_for_layers', None)

def load_quant_config(self, model_name_or_path):
while True:
@@ -100,7 +102,9 @@ def load_quant_config(self, model_name_or_path):

wbits = quant_config.get("w_bit", quant_config.get("bits", None))
groupsize = quant_config.get("q_group_size", quant_config.get("group_size", None))
assert wbits is not None and groupsize is not None
quant_method = quant_config.get("quant_method", None)
if quant_method != "vptq":
assert wbits is not None and groupsize is not None

if quant_config.get('COMPATIBLE_WITH_AUTOGPTQ', None):
self.COMPATIBLE_WITH_AUTOGPTQ = True
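For context, a hedged sketch of the two quantize_config.json shapes the new branch distinguishes; the VPTQ layout mirrors the VPTQInferConfig fields introduced below, and the layer entry is illustrative only, not copied from a real checkpoint.

# GPTQ/AWQ-style configs must still carry bit-width and group-size fields.
gptq_style = {"bits": 4, "group_size": 128, "version": "GPTQ"}
# VPTQ-style configs describe each layer instead, so the assert is skipped for them.
vptq_style = {
    "quant_method": "vptq",
    "bits": 2,
    "group_size": 8,
    "config_for_layers": {"model.layers.0.self_attn.q_proj": {}},  # per-layer VQuantLinear init args
}

for quant_config in (gptq_style, vptq_style):
    quant_method = quant_config.get("quant_method", None)
    if quant_method != "vptq":
        assert quant_config.get("bits") is not None and quant_config.get("group_size") is not None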
9 changes: 8 additions & 1 deletion qllm/quantization/config_builder.py
@@ -84,7 +84,7 @@ class HessianConfig(MetaInterface):
class VPTQLayerConfig(MetaInterface):
bias: bool = dataclasses.field(default=False)
enable_norm: bool = dataclasses.field(default=True)
enable_perm: bool = dataclasses.field(default=False)
enable_perm: bool = dataclasses.field(default=True)
group_num: int = dataclasses.field(default=1)
outlier_size: int = dataclasses.field(default=0)
group_size: int = dataclasses.field(default=-1)
@@ -125,6 +125,13 @@ class VPTQConfig(MetaInterface):
def from_dict(cls, config: dict):
return dataclass_from_dict(cls, config)

@dataclass
class VPTQInferConfig(MetaInterface):
group_size: int = 8
bits: int = 2
version: str = ""
quant_method: str = "vptq"
config_for_layers: typing.Dict[str, dict] = dataclasses.field(default_factory=dict)


def build_config(args):
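A small sketch of how the new VPTQInferConfig is meant to be constructed (mirroring the quant_vptq.py change below); the layer name and its init args are hypothetical.

from qllm.quantization.config_builder import VPTQInferConfig

# config_for_layers maps module names to the init args of their VQuantLinear replacements.
cfg = VPTQInferConfig(config_for_layers={"model.layers.0.mlp.down_proj": {"bias": False}})
print(cfg.quant_method, cfg.bits, cfg.group_size)  # vptq 2 8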
16 changes: 2 additions & 14 deletions qllm/quantization/vptq/quant_vptq.py
@@ -205,20 +205,8 @@ def do_quantize(self, model, dataloader, model_prefix, dev):
torch.save(attention_layers[layer_idx], self.quant_cache_dir / f"layer_{layer_idx}.pt")

config_for_layers = {k:v.init_args for k,v in find_layers(model, [VQuantLinear]).items()}
MetaConf = type(
"MetaConf",
(object,),
{
"version": "AUTO",
"quant_method": "vptq",
"bits": 2,
"to_dict": lambda self: self.config_for_layers,
"to_meta": property(lambda self: self),
},
)
meta_conf = MetaConf()
meta_conf.config_for_layers = {"config_for_layers": config_for_layers}
self.quant_config = meta_conf
from qllm.quantization.config_builder import VPTQInferConfig
self.quant_config = VPTQInferConfig(config_for_layers=config_for_layers)
model.quant_config_by_layer = {}
model = pack_model(model, from_type=torch.uint16, to_type=torch.uint16, as_type=torch.int16)
# if self.quant_config.absorb_perm:
20 changes: 13 additions & 7 deletions qllm/utils/modelutils.py
@@ -43,7 +43,6 @@ def gen_conditions(_wbits, _groupsize):

def select_quant_linear(pack_mode: str, wbits:int, quant_method:str):
from ..modeling.q_layers import QuantLinearGPTQ
from ..modeling.q_layers.quant_linear_vptq import VQuantLinear
from ..modeling.q_layers.quant_linear_awq import WQLinear_GEMM
from ..modeling.q_layers.ext_package_checker import is_the_machine_support_awq_engine
from ..modeling.q_layers.quant_linear_onnxruntime import QuantLinearORT
@@ -54,7 +53,8 @@ def select_quant_linear(pack_mode: str, wbits:int, quant_method:str):
quant_method = quant_method.lower()

if quant_method == "vptq":
target_layer = VQuantLinear
import vptq
target_layer = vptq.VQuantLinear
elif pack_mode == "ORT":
target_layer = QuantLinearORT
elif quant_method == "hqq":
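Hedged usage note: the VPTQ kernel layer now comes from the standalone vptq package rather than qllm's bundled quant_linear_vptq module, so that package must be importable whenever quant_method is "vptq" (for example via pip install vptq). A sketch, assuming select_quant_linear returns the selected layer class as its name suggests:

from qllm.utils.modelutils import select_quant_linear

# Raises ImportError here if the external `vptq` package is not installed.
target_layer = select_quant_linear(pack_mode="AUTO", wbits=2, quant_method="vptq")
print(target_layer)  # expected: <class 'vptq.VQuantLinear'>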
@@ -166,12 +166,18 @@ def make_mixbits_quant_linear(module, replaced_names, quant_info: dict, name='',
tmp = sub_module
if "groupsize" in quant_info and 'wbits' in quant_info:
bits, groupsize = quant_info['wbits'], quant_info['groupsize']
else:
elif 'wbits' in quant_info[module_name] and 'groupsize' in quant_info[module_name]:
bits, groupsize = quant_info[module_name]['wbits'], quant_info[module_name]['groupsize']
new_module = target_layer(
bits, groupsize, tmp.in_features, tmp.out_features, tmp.bias is not None, dtype=dtype
)
new_module.bias = tmp.bias.data if tmp.bias is not None else None
else:
bits = None
if bits is None:
new_module = target_layer(**quant_info[module_name], dtype=dtype)
new_module.bias = tmp.bias if tmp.bias is not None else None
else:
new_module = target_layer(
bits, groupsize, tmp.in_features, tmp.out_features, tmp.bias is not None, dtype=dtype
)
new_module.bias = tmp.bias.data if tmp.bias is not None else None
set_op_by_name(module, module_name, new_module)
return
#if isinstance(module, target_layer):
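For reference, a hedged illustration of the two quant_info layouts the branch above distinguishes: bit/group entries for GPTQ-style layers versus full constructor kwargs that are forwarded verbatim for VPTQ-style layers. The module name and the kwargs inside the VPTQ entry are hypothetical.

# Illustrative only; keys are per-module names as produced by find_layers().
quant_info_gptq = {
    "model.layers.0.self_attn.q_proj": {"wbits": 4, "groupsize": 128},
}
quant_info_vptq = {
    # no wbits/groupsize, so bits stays None and the entry is splatted into target_layer(**kwargs, dtype=dtype)
    "model.layers.0.self_attn.q_proj": {"in_features": 4096, "out_features": 4096, "bias": False},
}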