6 changes: 3 additions & 3 deletions .aitk/configs/checks.json
@@ -1,6 +1,6 @@
{
"configCheck": 139,
"copyCheck": 178,
"copyCheck": 171,
"extensionCheck": 1,
"gitignoreCheck": 40,
"inferenceModelCheck": 25,
@@ -9,8 +9,8 @@
"modelProjectCheck": 41,
"oliveCheck": 45,
"oliveJsonCheck": 139,
"pathCheck": 1153,
"pathCheck": 1158,
"requirementsCheck": 37,
"templateCheck": 3,
"venvRequirementsCheck": 13
"venvRequirementsCheck": 14
}
10 changes: 5 additions & 5 deletions .aitk/configs/model_list.json
@@ -18,7 +18,7 @@
"architecture": "Transformer",
"status": "Ready",
"relativePath": "microsoft-Phi-3.5-mini-instruct/aitk",
"version": 5,
"version": 6,
"p0": true
},
{
@@ -63,7 +63,7 @@
"architecture": "Transformer",
"status": "Ready",
"relativePath": "deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk",
"version": 5,
"version": 6,
"p0": true
},
{
@@ -172,7 +172,7 @@
"architecture": "Transformer",
"status": "Ready",
"relativePath": "meta-llama-Llama-3.2-1B-Instruct/aitk",
"version": 5,
"version": 6,
"p0": true
},
{
@@ -239,7 +239,7 @@
"architecture": "Transformer",
"status": "Ready",
"relativePath": "Qwen-Qwen2.5-1.5B-Instruct/aitk",
"version": 5,
"version": 6,
"p0": true
},
{
@@ -407,7 +407,7 @@
"architecture": "Transformer",
"status": "Ready",
"relativePath": "meta-llama-Llama-3.1-8B-Instruct/aitk",
"version": 3,
"version": 4,
"p0": false
},
{
4 changes: 4 additions & 0 deletions .aitk/configs/parameter_template.json
@@ -102,6 +102,10 @@
"name": "Quantization Dataset Size",
"type": "int"
},
"QuantizationDatasetLength": {
"name": "Quantization Dataset Sequence Length",
"type": "int"
},
"QuantizationDatasetSplit": {
"name": "Quantization Dataset Split",
"tags": [
20 changes: 20 additions & 0 deletions .aitk/requirements/requirements-NvidiaGPU-GptqModel.txt
@@ -0,0 +1,20 @@
+# follow https://github.com/CodeLinaro/GPTQModel/blob/rel_4.2.5/requirements.txt except for torch
+# uvpip:install git+https://github.com/CodeLinaro/GPTQModel.git@64231a266cc70c5597fe97f26e7ec5ccda660c37 --no-build-isolation;post;{"BUILD_CUDA_EXT":"0"}
+# download:fast_hadamard_transform-1.0.4.post1-cp312-cp312-win_amd64.whl
+./fast_hadamard_transform-1.0.4.post1-cp312-cp312-win_amd64.whl
+accelerate==1.10.1
+device-smi==0.4.1
+hf_transfer==0.1.9
+huggingface_hub==0.34.4
+logbar==0.0.4
+maturin==1.9.3
+numpy==2.2.6
+packaging==24.2
+pillow==11.3.0
+protobuf==6.32.0
+random_word==1.0.13
+safetensors==0.6.2
+threadpoolctl==3.6.0
+tokenicer==0.0.5
+transformers==4.56.0
+wheel==0.45.1
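
The `# uvpip:install ...;post;{...}` line above is repo-specific install tooling rather than standard requirements syntax. A minimal sketch of what it plausibly expands to, assuming the trailing `post` marks a post-install phase and the JSON object supplies build-time environment variables (`BUILD_CUDA_EXT=0` skipping the CUDA extension build); the exact semantics live in the repo's install scripts:

```python
import os
import subprocess

# Hedged sketch: after the pinned wheels install ("post" phase), install
# GPTQModel from the pinned commit with BUILD_CUDA_EXT=0 so its CUDA
# extension is not compiled from source.
env = {**os.environ, "BUILD_CUDA_EXT": "0"}
subprocess.check_call(
    [
        "uv", "pip", "install",
        "git+https://github.com/CodeLinaro/GPTQModel.git@64231a266cc70c5597fe97f26e7ec5ccda660c37",
        "--no-build-isolation",
    ],
    env=env,
)
```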
20 changes: 10 additions & 10 deletions .aitk/requirements/requirements-NvidiaGPU.txt
@@ -34,15 +34,15 @@ multidict==6.6.4
multiprocess==0.70.16
networkx==3.4.2
numpy==2.2.4
-# olive-ai==0.10.1
-olive-ai==0.10.1
+# olive-ai==0.11.0
+olive-ai==0.11.0
# onnx==1.17.0
onnx==1.17.0
onnx-ir==0.1.10
-# onnxruntime-genai-cuda==0.7.0
-onnxruntime-genai-cuda==0.7.0
-# onnxruntime-gpu==1.21.0
-onnxruntime-gpu==1.21.0
+# onnxruntime-genai-cuda==0.11.2
+onnxruntime-genai-cuda==0.11.2
+# onnxruntime-gpu==1.24.1
+onnxruntime-gpu==1.24.1
onnxscript==0.5.3
# optimum==1.26.1
optimum==1.26.1
@@ -69,11 +69,11 @@ sympy==1.13.3
# tabulate==0.9.0
tabulate==0.9.0
tokenizers==0.21.4
-# torch==2.7.0+cu128
-torch==2.7.0+cu128
+# torch==2.8.0+cu128
+torch==2.8.0+cu128
torchmetrics==1.7.1
-# torchvision==0.22.0+cu128
-torchvision==0.22.0+cu128
+# torchvision==0.23.0+cu128
+torchvision==0.23.0+cu128
tqdm==4.67.1
transformers==4.51.3
typing-extensions==4.15.0
6 changes: 3 additions & 3 deletions .aitk/requirements/requirements-QNN.txt
@@ -34,11 +34,11 @@ multidict==6.6.4
multiprocess==0.70.16
networkx==3.5
numpy==2.2.4
-# olive-ai==0.10.1
-olive-ai==0.10.1
+# olive-ai==0.11.0
+olive-ai==0.11.0
onnx==1.17.0
onnx-ir==0.1.10
-# uvpip:install onnxruntime-qnn==1.22.0.dev20250402004 --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple --no-deps;post
+# uvpip:install onnxruntime-qnn==1.23.2 --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple --no-deps;post
onnxscript==0.5.3
optuna==4.2.1
packaging==24.2
2 changes: 1 addition & 1 deletion .aitk/scripts/install_freeze.py
@@ -31,7 +31,6 @@
# if from git: "git+https://github.com/microsoft/Olive.git@COMMIT_ID#egg=olive_ai
oliveAi = "olive-ai==0.10.1"
torchVision = "torchvision==0.22.0"
-# TODO it is an example
amdQuark = "AMD__Quark_py3.10.17"


@@ -283,4 +282,5 @@ def write_requires_recursively(name: str):


if __name__ == "__main__":
raise "deprecated, need revise"
main()
3 changes: 3 additions & 0 deletions .aitk/scripts/sanitize/constants.py
@@ -108,6 +108,8 @@ class OliveDeviceTypes(Enum):
# Should sort by value
class OlivePassNames:
AitkPython = "aitkpython"
GptqModel = "gptqmodel"
GptqQuantizer = "gptqquantizer"
ModelBuilder = "modelbuilder"
NVModelOptQuantization = "nvmodeloptquantization"
OnnxFloatToFloat16 = "onnxfloattofloat16"
@@ -145,6 +147,7 @@ class OlivePropertyNames:
Host = "host"
LoadDatasetConfig = "load_dataset_config"
MaxSamples = "max_samples"
MaxSeqLen = "max_seq_len"
Metrics = "metrics"
Name = "name"
NumCalibData = "num_calib_data"
22 changes: 19 additions & 3 deletions .aitk/scripts/sanitize/generator_amd.py
@@ -11,13 +11,16 @@


def generate_quantization_config(
-    configFile: Path, modelList: ModelList, parameter: ModelParameter
+    configFile: Path | dict, modelList: ModelList, parameter: ModelParameter
) -> Optional[Section]:
"""
Generates a quantization configuration section for the given file.
"""
-    with open_ex(configFile, "r") as f:
-        content = json.load(f)
+    if isinstance(configFile, Path):
+        with open_ex(configFile, "r") as f:
+            content = json.load(f)
+    else:
+        content = configFile
parameters = []
data_configs = content.get(OlivePropertyNames.DataConfigs, [])
for k, v in content[OlivePropertyNames.Passes].items():
@@ -110,6 +113,19 @@ def generate_quantization_config(
)

pre_process_data_config = data_configs[i].get(OlivePropertyNames.PreProcessDataConfig)

+        max_seq_len = pre_process_data_config.get(OlivePropertyNames.MaxSeqLen)
+        if max_seq_len:
+            parameters.append(
+                Parameter(
+                    autoGenerated=True,
+                    template=Parameter(
+                        template="QuantizationDatasetLength",
+                        path=f"{OlivePropertyNames.DataConfigs}[{i}].{OlivePropertyNames.PreProcessDataConfig}.{OlivePropertyNames.MaxSeqLen}",
+                    ),
+                )
+            )

max_samples = pre_process_data_config.get(OlivePropertyNames.MaxSamples)
if max_samples:
parameters.append(
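
The signature change above lets callers hand `generate_quantization_config` either a config file path or an already-parsed dict, so `generator_qnn` below can load the JSON once and reuse it. A minimal standalone sketch of the pattern, with plain `open` standing in for the repo's `open_ex` helper:

```python
import json
from pathlib import Path


def load_config(config: Path | dict) -> dict:
    # Accept a path to an Olive config or a dict that was already parsed;
    # callers that also need the raw content parse once and share the dict.
    if isinstance(config, Path):
        with open(config, "r", encoding="utf-8") as f:
            return json.load(f)
    return config
```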
24 changes: 22 additions & 2 deletions .aitk/scripts/sanitize/generator_qnn.py
@@ -1,10 +1,26 @@
from pathlib import Path
+import json

from .constants import OlivePassNames, OlivePropertyNames, PhaseTypeEnum
from .generator_amd import generate_quantization_config
from .generator_common import create_model_parameter, set_optimization_path
from .model_info import ModelList
from .model_parameter import ModelParameter
-from .utils import isLLM_by_id
+from .utils import isLLM_by_id, open_ex


+def setup_features(content: dict, parameter: ModelParameter):
+    def add(feature: str):
+        if parameter.executeRuntimeFeatures is None:
+            parameter.executeRuntimeFeatures = []
+        if feature not in parameter.executeRuntimeFeatures:
+            parameter.executeRuntimeFeatures.append(feature)
+
+    for k, v in content[OlivePropertyNames.Passes].items():
+        if v[OlivePropertyNames.Type].lower() == OlivePassNames.GptqQuantizer:
+            add("AutoGptq")
+        elif v[OlivePropertyNames.Type].lower() == OlivePassNames.GptqModel:
+            add("GptqModel")


def generator_qnn(id: str, recipe, folder: Path, modelList: ModelList):
@@ -30,9 +46,13 @@ def generator_qnn(id: str, recipe, folder: Path, modelList: ModelList):
if "npu" in runtime_values:
parameter.isQNNLLM = True

-    quantize = generate_quantization_config(configFile, modelList, parameter)
+    with open_ex(configFile, "r") as f:
+        content = json.load(f)
+    quantize = generate_quantization_config(content, modelList, parameter)
if quantize:
parameter.sections.append(quantize)

+    setup_features(content, parameter)

parameter.writeIfChanged()
print(f"\tGenerated QNN configuration for {file}")
6 changes: 0 additions & 6 deletions .aitk/scripts/sanitize/model_parameter.py
@@ -237,8 +237,6 @@ class ModelParameter(BaseModelClass):
# This kind of config will
# - setup runtimeOverwrite for CUDA EP and others
# + the previous EP is used for EPContextBinaryGeneator by PythonEnvironment
-    # - do not support cpu evaluation
-    # - setup executeRuntimeFeatures, pyEnvRuntimeFeatures
isQNNLLM: Optional[bool] = None
# SET AUTOMATICALLY TO TRUE WHEN CUDAExecutionProvider
# When true, it means some passes need CUDA so user could not run it without
@@ -300,9 +298,6 @@ def Check(self, templates: Dict[str, Parameter], oliveJson: Any, modelList: ModelList):
),
)

-        if self.isQNNLLM:
-            self.addCpu = False

# Add runtime
syskey, system = get_target_system(oliveJson)
currentEp: str = system[OlivePropertyNames.Accelerators][0][OlivePropertyNames.ExecutionProviders][0]
@@ -320,7 +315,6 @@ def Check(self, templates: Dict[str, Parameter], oliveJson: Any, modelList: ModelList):
executeEp=EPNames.CUDAExecutionProvider,
evaluateUsedInExecute=True,
)
-            self.executeRuntimeFeatures = ["AutoGptq"]

if self.runtimeOverwrite and not self.runtimeOverwrite.Check(oliveJson):
printError(f"{self._file} runtime overwrite has error")
2 changes: 2 additions & 0 deletions Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md
@@ -52,6 +52,8 @@ To support both efficiently, we create **two model instances**:

## **PTQ + AOT Compilation for Qualcomm NPUs using QNN EP**

+**With a Quantization Dataset Sequence Length of 1024, quantization needs about 20 GB of GPU memory, so adjust this value to fit your hardware.**

This process extends the [**QDQ Model with 4-bit Weights & 16-bit Activations**](#qdq-model-with-4-bit-weights--16-bit-activations) by compiling it specifically for **Qualcomm NPUs** using the **QNN Execution Provider**.

### **Resource Optimization Strategy**
19 changes: 0 additions & 19 deletions Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config
@@ -1,19 +1,5 @@
{
"copies": [
-    {
-      "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json",
-      "dst": "qwen2_5_qnn_config.json",
-      "replacements": [
-        {
-          "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
-          "replace": "Qwen/Qwen2.5-1.5B-Instruct"
-        },
-        {
-          "find": "model/deepseek",
-          "replace": "model/qwen2_5"
-        }
-      ]
-    },
{
"src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_trtrtx_config.json",
"dst": "qwen2_5_trtrtx_config.json",
@@ -42,11 +28,6 @@
}
]
},
-    {
-      "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json.config",
-      "dst": "qwen2_5_dml_config.json.config",
-      "replacements": []
-    },
{
"src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md",
"dst": "README.md",
4 changes: 1 addition & 3 deletions Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml
@@ -37,15 +37,13 @@ recipes:
ep: QNNExecutionProvider
aitk:
oliveFile: "QNN/config_gpu.json"
-    requirementsPatches:
-      - AutoGptq
isGPURequired: true
runtimeOverwrite:
executeEp: NvTensorRTRTXExecutionProvider
aitk:
modelInfo:
id: "huggingface/Qwen/Qwen2.5-1.5B-Instruct"
-    version: 5
+    version: 6
groupId: "huggingface/Qwen/Qwen2.5-1.5B-Instruct"
groupItemName: "1.5B"
p0: true
2 changes: 1 addition & 1 deletion Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config
@@ -31,6 +31,6 @@
],
"modelInfo": {
"id": "huggingface/Qwen/Qwen2.5-1.5B-Instruct",
"version": 5
"version": 6
}
}