convert_utils.py
from typing import Dict

import torch

from ..quantization import QuantAlgo


def split(v, tp_size, idx, dim=0):
    """Splits v into tp_size chunks along dim and returns the idx-th chunk."""
    if tp_size == 1:
        return v
    if len(v.shape) == 1:
        return torch.chunk(v, tp_size)[idx].contiguous()
    else:
        return torch.chunk(v, tp_size, dim=dim)[idx].clone()
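
# A minimal usage sketch (an illustrative addition, not part of the original
# module): split() returns one rank's chunk, and concatenating all chunks
# along the split dim recovers the full tensor.
#
#   w = torch.arange(8.0).reshape(4, 2)
#   shard0 = split(w, tp_size=2, idx=0)  # rows 0-1
#   shard1 = split(w, tp_size=2, idx=1)  # rows 2-3
#   assert torch.equal(torch.cat([shard0, shard1], dim=0), w)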


def split_qkv_tp(v, n_head, n_hidden, tensor_parallel, rank):
    """
    Splits the fused QKV weight matrix according to tensor parallelism.

    Expects v of shape (3 * n_hidden, n_hidden); returns this rank's
    (3 * n_hidden // tensor_parallel, n_hidden) shard.
    """
    v = v.reshape(3, n_hidden, n_hidden)
    split_v = split(v, tensor_parallel, rank, dim=1)
    split_v = split_v.reshape(3 * (n_hidden // tensor_parallel), n_hidden)
    return split_v.clone()
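
# Illustrative shapes (an added note, not in the original file): with
# n_hidden=4 and tensor_parallel=2, a fused QKV weight of shape (12, 4) is
# viewed as (3, 4, 4), chunked on the head dimension, and each rank keeps a
# (6, 4) shard holding its slice of Q, K and V:
#
#   v = torch.randn(12, 4)
#   shard = split_qkv_tp(v, n_head=2, n_hidden=4, tensor_parallel=2, rank=0)
#   assert shard.shape == (6, 4)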


def split_qkv_bias_tp(v, n_head, n_hidden, tensor_parallel, rank):
    """
    Splits the fused QKV bias according to tensor parallelism.

    Expects v of shape (3 * n_hidden,); returns this rank's
    (3 * n_hidden // tensor_parallel,) shard.
    """
    v = v.reshape(3, n_hidden)
    split_v = split(v, tensor_parallel, rank, dim=1)
    split_v = split_v.reshape(3 * (n_hidden // tensor_parallel))
    return split_v.clone()


def split_matrix_tp(v, tensor_parallel, rank, dim):
    """Splits a weight matrix along dim for the given tensor-parallel rank."""
    return split(v, tensor_parallel, rank, dim=dim)


def weight_only_quantize(weight: torch.Tensor,
                         quant_algo: str,
                         plugin: bool = True):
    """Symmetrically quantizes a weight tensor to INT4 (W4A16) or INT8 (W8A16)."""
    assert quant_algo in [QuantAlgo.W4A16, QuantAlgo.W8A16
                          ], f'unsupported quant algo: {quant_algo}'
    if quant_algo == QuantAlgo.W4A16:
        assert plugin, 'W4A16 is only supported with plugin'
    # The custom op quantizes along the last axis, so transpose first.
    if weight.dim() > 2:
        v = weight.transpose(-1, -2)
    else:
        v = weight.t()
    t = torch.quint4x2 if quant_algo == QuantAlgo.W4A16 else torch.int8
    processed_torch_weights, torch_weight_scales = \
        torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
            v.contiguous(), t)
    if plugin:
        return processed_torch_weights, torch_weight_scales
    else:
        return v, torch_weight_scales
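
# A hedged usage sketch (added here; not in the original file). Running it
# requires a TensorRT-LLM install, since the quantization kernel is a custom
# torch op registered under torch.ops.trtllm:
#
#   w = torch.randn(512, 1024, dtype=torch.float16)  # (out, in) linear weight
#   qw, scales = weight_only_quantize(w, QuantAlgo.W8A16)
#   # qw holds the packed weights in transposed (in, out) layout; scales
#   # holds one value per output channel.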


def weight_only_quantize_dict(weights: Dict[str, torch.Tensor],
                              quant_algo: str,
                              quant_weights=[
                                  'qkv.weight', 'dense.weight', 'fc.weight',
                                  'proj.weight', 'gate.weight'
                              ],
                              plugin: bool = True):
    """Applies weight_only_quantize to every matching entry of a weight dict."""
    if quant_algo not in [QuantAlgo.W4A16, QuantAlgo.W8A16]:
        return weights
    for name in list(weights):
        # Skip tensors that are already quantized (stored as int8).
        if any([_name in name for _name in quant_weights
                ]) and weights[name].dtype != torch.int8:
            quant_weight, quant_scale = weight_only_quantize(
                weight=weights[name], quant_algo=quant_algo, plugin=plugin)
            weights[name] = quant_weight
            weights[name.replace('.weight',
                                 '.per_channel_scale')] = quant_scale
    return weights
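
# End-to-end sketch (illustrative, not part of the original module; the key
# below is made up). Any tensor whose name contains one of the quant_weights
# substrings is replaced in place, and a companion '.per_channel_scale'
# entry is added alongside it:
#
#   weights = {'transformer.layers.0.attention.qkv.weight':
#                  torch.randn(96, 32, dtype=torch.float16)}
#   weights = weight_only_quantize_dict(weights, QuantAlgo.W8A16)
#   # -> keys: '...qkv.weight' (quantized) and '...qkv.per_channel_scale'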