6 changes: 3 additions & 3 deletions .aitk/configs/checks.json
@@ -1,6 +1,6 @@
{
"configCheck": 139,
"copyCheck": 178,
"copyCheck": 171,
"extensionCheck": 1,
"gitignoreCheck": 40,
"inferenceModelCheck": 25,
@@ -9,8 +9,8 @@
"modelProjectCheck": 41,
"oliveCheck": 45,
"oliveJsonCheck": 139,
"pathCheck": 1153,
"pathCheck": 1158,
"requirementsCheck": 37,
"templateCheck": 3,
"venvRequirementsCheck": 13
"venvRequirementsCheck": 14
}
10 changes: 5 additions & 5 deletions .aitk/configs/model_list.json
@@ -18,7 +18,7 @@
"architecture": "Transformer",
"status": "Ready",
"relativePath": "microsoft-Phi-3.5-mini-instruct/aitk",
"version": 5,
"version": 6,
"p0": true
},
{
@@ -63,7 +63,7 @@
"architecture": "Transformer",
"status": "Ready",
"relativePath": "deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk",
"version": 5,
"version": 6,
"p0": true
},
{
@@ -172,7 +172,7 @@
"architecture": "Transformer",
"status": "Ready",
"relativePath": "meta-llama-Llama-3.2-1B-Instruct/aitk",
"version": 5,
"version": 6,
"p0": true
},
{
@@ -239,7 +239,7 @@
"architecture": "Transformer",
"status": "Ready",
"relativePath": "Qwen-Qwen2.5-1.5B-Instruct/aitk",
"version": 5,
"version": 6,
"p0": true
},
{
@@ -407,7 +407,7 @@
"architecture": "Transformer",
"status": "Ready",
"relativePath": "meta-llama-Llama-3.1-8B-Instruct/aitk",
"version": 3,
"version": 4,
"p0": false
},
{
4 changes: 4 additions & 0 deletions .aitk/configs/parameter_template.json
@@ -102,6 +102,10 @@
"name": "Quantization Dataset Size",
"type": "int"
},
"QuantizationDatasetLength": {
"name": "Quantization Dataset Sequence Length",
"type": "int"
},
"QuantizationDatasetSplit": {
"name": "Quantization Dataset Split",
"tags": [
20 changes: 20 additions & 0 deletions .aitk/requirements/requirements-NvidiaGPU-GptqModel.txt
@@ -0,0 +1,20 @@
+# follow https://github.com/CodeLinaro/GPTQModel/blob/rel_4.2.5/requirements.txt except for torch
+# uvpip:install git+https://github.com/CodeLinaro/GPTQModel.git@64231a266cc70c5597fe97f26e7ec5ccda660c37 --no-build-isolation;post;{"BUILD_CUDA_EXT":"0"}
+# download:fast_hadamard_transform-1.0.4.post1-cp312-cp312-win_amd64.whl
+./fast_hadamard_transform-1.0.4.post1-cp312-cp312-win_amd64.whl
+accelerate==1.10.1
+device-smi==0.4.1
+hf_transfer==0.1.9
+huggingface_hub==0.34.4
+logbar==0.0.4
+maturin==1.9.3
+numpy==2.2.6
+packaging==24.2
+pillow==11.3.0
+protobuf==6.32.0
+random_word==1.0.13
+safetensors==0.6.2
+threadpoolctl==3.6.0
+tokenicer==0.0.5
+transformers==4.56.0
+wheel==0.45.1
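
The `# uvpip:install ...;post;{...}` line above is repo-specific install tooling rather than standard requirements syntax. A minimal sketch of what it plausibly expands to, assuming the trailing `post` marks a post-install phase and the JSON object supplies build-time environment variables (`BUILD_CUDA_EXT=0` skipping the CUDA extension build); the exact semantics live in the repo's install scripts:

```python
import os
import subprocess

# Hedged sketch: after the pinned wheels install ("post" phase), install
# GPTQModel from the pinned commit with BUILD_CUDA_EXT=0 so its CUDA
# extension is not compiled from source.
env = {**os.environ, "BUILD_CUDA_EXT": "0"}
subprocess.check_call(
    [
        "uv", "pip", "install",
        "git+https://github.com/CodeLinaro/GPTQModel.git@64231a266cc70c5597fe97f26e7ec5ccda660c37",
        "--no-build-isolation",
    ],
    env=env,
)
```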
20 changes: 10 additions & 10 deletions .aitk/requirements/requirements-NvidiaGPU.txt
@@ -34,15 +34,15 @@ multidict==6.6.4
multiprocess==0.70.16
networkx==3.4.2
numpy==2.2.4
-# olive-ai==0.10.1
-olive-ai==0.10.1
+# olive-ai==0.11.0
+olive-ai==0.11.0
# onnx==1.17.0
onnx==1.17.0
onnx-ir==0.1.10
-# onnxruntime-genai-cuda==0.7.0
-onnxruntime-genai-cuda==0.7.0
-# onnxruntime-gpu==1.21.0
-onnxruntime-gpu==1.21.0
+# onnxruntime-genai-cuda==0.11.2
+onnxruntime-genai-cuda==0.11.2
+# onnxruntime-gpu==1.24.1
+onnxruntime-gpu==1.24.1
onnxscript==0.5.3
# optimum==1.26.1
optimum==1.26.1
@@ -69,11 +69,11 @@ sympy==1.13.3
# tabulate==0.9.0
tabulate==0.9.0
tokenizers==0.21.4
-# torch==2.7.0+cu128
-torch==2.7.0+cu128
+# torch==2.8.0+cu128
+torch==2.8.0+cu128
torchmetrics==1.7.1
-# torchvision==0.22.0+cu128
-torchvision==0.22.0+cu128
+# torchvision==0.23.0+cu128
+torchvision==0.23.0+cu128
tqdm==4.67.1
transformers==4.51.3
typing-extensions==4.15.0
6 changes: 3 additions & 3 deletions .aitk/requirements/requirements-QNN.txt
@@ -34,11 +34,11 @@ multidict==6.6.4
multiprocess==0.70.16
networkx==3.5
numpy==2.2.4
-# olive-ai==0.10.1
-olive-ai==0.10.1
+# olive-ai==0.11.0
+olive-ai==0.11.0
onnx==1.17.0
onnx-ir==0.1.10
-# uvpip:install onnxruntime-qnn==1.22.0.dev20250402004 --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple --no-deps;post
+# uvpip:install onnxruntime-qnn==1.23.2 --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple --no-deps;post
onnxscript==0.5.3
optuna==4.2.1
packaging==24.2
2 changes: 1 addition & 1 deletion .aitk/scripts/install_freeze.py
@@ -31,7 +31,6 @@
# if from git: "git+https://github.com/microsoft/Olive.git@COMMIT_ID#egg=olive_ai
oliveAi = "olive-ai==0.10.1"
torchVision = "torchvision==0.22.0"
-# TODO it is an example
amdQuark = "AMD__Quark_py3.10.17"


@@ -283,4 +282,5 @@ def write_requires_recursively(name: str):


if __name__ == "__main__":
raise "deprecated, need revise"
main()
3 changes: 3 additions & 0 deletions .aitk/scripts/sanitize/constants.py
@@ -108,6 +108,8 @@ class OliveDeviceTypes(Enum):
# Should sort by value
class OlivePassNames:
AitkPython = "aitkpython"
GptqModel = "gptqmodel"
GptqQuantizer = "gptqquantizer"
ModelBuilder = "modelbuilder"
NVModelOptQuantization = "nvmodeloptquantization"
OnnxFloatToFloat16 = "onnxfloattofloat16"
@@ -145,6 +147,7 @@ class OlivePropertyNames:
Host = "host"
LoadDatasetConfig = "load_dataset_config"
MaxSamples = "max_samples"
MaxSeqLen = "max_seq_len"
Metrics = "metrics"
Name = "name"
NumCalibData = "num_calib_data"
22 changes: 19 additions & 3 deletions .aitk/scripts/sanitize/generator_amd.py
@@ -11,13 +11,16 @@


def generate_quantization_config(
-    configFile: Path, modelList: ModelList, parameter: ModelParameter
+    configFile: Path | dict, modelList: ModelList, parameter: ModelParameter
) -> Optional[Section]:
"""
Generates a quantization configuration section for the given file.
"""
-    with open_ex(configFile, "r") as f:
-        content = json.load(f)
+    if isinstance(configFile, Path):
+        with open_ex(configFile, "r") as f:
+            content = json.load(f)
+    else:
+        content = configFile
parameters = []
data_configs = content.get(OlivePropertyNames.DataConfigs, [])
for k, v in content[OlivePropertyNames.Passes].items():
@@ -110,6 +113,19 @@ def generate_quantization_config(
)

pre_process_data_config = data_configs[i].get(OlivePropertyNames.PreProcessDataConfig)

+        max_seq_len = pre_process_data_config.get(OlivePropertyNames.MaxSeqLen)
+        if max_seq_len:
+            parameters.append(
+                Parameter(
+                    autoGenerated=True,
+                    template=Parameter(
+                        template="QuantizationDatasetLength",
+                        path=f"{OlivePropertyNames.DataConfigs}[{i}].{OlivePropertyNames.PreProcessDataConfig}.{OlivePropertyNames.MaxSeqLen}",
+                    ),
+                )
+            )

max_samples = pre_process_data_config.get(OlivePropertyNames.MaxSamples)
if max_samples:
parameters.append(
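
The signature change above lets callers hand `generate_quantization_config` either a config file path or an already-parsed dict, so `generator_qnn` below can load the JSON once and reuse it. A minimal standalone sketch of the pattern, with plain `open` standing in for the repo's `open_ex` helper:

```python
import json
from pathlib import Path


def load_config(config: Path | dict) -> dict:
    # Accept a path to an Olive config or a dict that was already parsed;
    # callers that also need the raw content parse once and share the dict.
    if isinstance(config, Path):
        with open(config, "r", encoding="utf-8") as f:
            return json.load(f)
    return config
```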
24 changes: 22 additions & 2 deletions .aitk/scripts/sanitize/generator_qnn.py
@@ -1,10 +1,26 @@
from pathlib import Path
+import json

from .constants import OlivePassNames, OlivePropertyNames, PhaseTypeEnum
from .generator_amd import generate_quantization_config
from .generator_common import create_model_parameter, set_optimization_path
from .model_info import ModelList
from .model_parameter import ModelParameter
-from .utils import isLLM_by_id
+from .utils import isLLM_by_id, open_ex


+def setup_features(content: dict, parameter: ModelParameter):
+    def add(feature: str):
+        if parameter.executeRuntimeFeatures is None:
+            parameter.executeRuntimeFeatures = []
+        if feature not in parameter.executeRuntimeFeatures:
+            parameter.executeRuntimeFeatures.append(feature)
+
+    for k, v in content[OlivePropertyNames.Passes].items():
+        if v[OlivePropertyNames.Type].lower() == OlivePassNames.GptqQuantizer:
+            add("AutoGptq")
+        elif v[OlivePropertyNames.Type].lower() == OlivePassNames.GptqModel:
+            add("GptqModel")


def generator_qnn(id: str, recipe, folder: Path, modelList: ModelList):
@@ -30,9 +46,13 @@ def generator_qnn(id: str, recipe, folder: Path, modelList: ModelList):
if "npu" in runtime_values:
parameter.isQNNLLM = True

-    quantize = generate_quantization_config(configFile, modelList, parameter)
+    with open_ex(configFile, "r") as f:
+        content = json.load(f)
+    quantize = generate_quantization_config(content, modelList, parameter)
if quantize:
parameter.sections.append(quantize)

+    setup_features(content, parameter)

parameter.writeIfChanged()
print(f"\tGenerated QNN configuration for {file}")
6 changes: 0 additions & 6 deletions .aitk/scripts/sanitize/model_parameter.py
@@ -237,8 +237,6 @@ class ModelParameter(BaseModelClass):
# This kind of config will
# - setup runtimeOverwrite for CUDA EP and others
# + the previous EP is used for EPContextBinaryGeneator by PythonEnvironment
-    # - do not support cpu evaluation
-    # - setup executeRuntimeFeatures, pyEnvRuntimeFeatures
isQNNLLM: Optional[bool] = None
# SET AUTOMATICALLY TO TRUE WHEN CUDAExecutionProvider
# When true, it means some passes need CUDA so user could not run it without
@@ -300,9 +298,6 @@ def Check(self, templates: Dict[str, Parameter], oliveJson: Any, modelList: ModelList):
),
)

-        if self.isQNNLLM:
-            self.addCpu = False

# Add runtime
syskey, system = get_target_system(oliveJson)
currentEp: str = system[OlivePropertyNames.Accelerators][0][OlivePropertyNames.ExecutionProviders][0]
@@ -320,7 +315,6 @@ def Check(self, templates: Dict[str, Parameter], oliveJson: Any, modelList: ModelList):
executeEp=EPNames.CUDAExecutionProvider,
evaluateUsedInExecute=True,
)
-            self.executeRuntimeFeatures = ["AutoGptq"]

if self.runtimeOverwrite and not self.runtimeOverwrite.Check(oliveJson):
printError(f"{self._file} runtime overwrite has error")
2 changes: 2 additions & 0 deletions Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md
@@ -52,6 +52,8 @@ To support both efficiently, we create **two model instances**:

## **PTQ + AOT Compilation for Qualcomm NPUs using QNN EP**

+**With a Quantization Dataset Sequence Length of 1024, quantization needs about 20 GB of GPU memory, so adjust this value to fit your hardware.**

This process extends the [**QDQ Model with 4-bit Weights & 16-bit Activations**](#qdq-model-with-4-bit-weights--16-bit-activations) by compiling it specifically for **Qualcomm NPUs** using the **QNN Execution Provider**.

### **Resource Optimization Strategy**
19 changes: 0 additions & 19 deletions Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config
@@ -1,19 +1,5 @@
{
"copies": [
-    {
-      "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json",
-      "dst": "qwen2_5_qnn_config.json",
-      "replacements": [
-        {
-          "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
-          "replace": "Qwen/Qwen2.5-1.5B-Instruct"
-        },
-        {
-          "find": "model/deepseek",
-          "replace": "model/qwen2_5"
-        }
-      ]
-    },
{
"src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_trtrtx_config.json",
"dst": "qwen2_5_trtrtx_config.json",
@@ -42,11 +28,6 @@
}
]
},
-    {
-      "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json.config",
-      "dst": "qwen2_5_dml_config.json.config",
-      "replacements": []
-    },
{
"src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md",
"dst": "README.md",
4 changes: 1 addition & 3 deletions Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml
@@ -37,15 +37,13 @@ recipes:
ep: QNNExecutionProvider
aitk:
oliveFile: "QNN/config_gpu.json"
-    requirementsPatches:
-      - AutoGptq
isGPURequired: true
runtimeOverwrite:
executeEp: NvTensorRTRTXExecutionProvider
aitk:
modelInfo:
id: "huggingface/Qwen/Qwen2.5-1.5B-Instruct"
-    version: 5
+    version: 6
groupId: "huggingface/Qwen/Qwen2.5-1.5B-Instruct"
groupItemName: "1.5B"
p0: true
2 changes: 1 addition & 1 deletion Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config
@@ -31,6 +31,6 @@
],
"modelInfo": {
"id": "huggingface/Qwen/Qwen2.5-1.5B-Instruct",
"version": 5
"version": 6
}
}