microsoft · ziyuanguo1998 · Jan 26, 2026 · Jan 27, 2026 · Jan 27, 2026 · Jan 27, 2026
@@ -1,16 +1,16 @@
 {
-    "configCheck": 139,
+    "configCheck": 140,
     "copyCheck": 179,
     "extensionCheck": 1,
-    "gitignoreCheck": 38,
+    "gitignoreCheck": 39,
     "inferenceModelCheck": 25,
-    "ipynbCheck": 38,
-    "licenseCheck": 37,
-    "modelProjectCheck": 39,
+    "ipynbCheck": 39,
+    "licenseCheck": 38,
+    "modelProjectCheck": 40,
     "oliveCheck": 45,
-    "oliveJsonCheck": 139,
-    "pathCheck": 1153,
+    "oliveJsonCheck": 140,
+    "pathCheck": 1170,
     "requirementsCheck": 37,
     "templateCheck": 1,
-    "venvRequirementsCheck": 13
+    "venvRequirementsCheck": 14
 }
@@ -647,6 +647,19 @@
             "status": "Ready",
             "relativePath": "Qwen-Qwen2.5-Coder-14B-Instruct/aitk",
             "version": 3
+        },
+        {
+            "displayName": "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            "icon": "HuggingFace",
+            "modelLink": "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5",
+            "id": "huggingface/stable-diffusion-v1-5/stable-diffusion-v1-5",
+            "runtimes": [
+                "QNN"
+            ],
+            "architecture": "Transformer",
+            "status": "Hide",
+            "relativePath": "sd-legacy-stable-diffusion-v1-5/aitk",
+            "version": 1
         }
     ],
     "template_models": [
@@ -692,6 +705,7 @@
         "google-research-datasets/conceptual_captions": "https://huggingface.co/datasets/google-research-datasets/conceptual_captions",
         "AIMClab-RUC/COCO-CN": "https://huggingface.co/datasets/AIMClab-RUC/COCO-CN",
         "librispeech_asr": "https://huggingface.co/datasets/openslr/librispeech_asr",
+        "phiyodr/coco2017": "https://huggingface.co/datasets/phiyodr/coco2017",
         "pileval_for_awq_benchmark": "https://huggingface.co/datasets/mit-han-lab/pile-val-backup"
     },
     "LoginRequiredDatasets": [

@@ -0,0 +1,3 @@
+accelerate==1.12.0
+diffusers==0.35.0
+torch-fidelity==0.3.0
@@ -28,6 +28,7 @@
     "mistralai": IconEnum.mistralai,
     # TODO add
     "OFA-Sys": IconEnum.HuggingFace,
+    "stable-diffusion-v1-5": IconEnum.HuggingFace,
 }
 
 

@@ -0,0 +1,8 @@
+__pycache__
+/cache
+/history/*/*
+!/history/*/history.config
+!/history/*/olive_config.json
+/footprints
+/result_*.png
+/*data*/
@@ -0,0 +1,31 @@
+## Stable Diffusion Optimization with ONNX Runtime QNN EP
+
+### Generate data for static quantization
+
+To get better results, we need to generate real data from the original model instead of using random data for static quantization.
+
+First, generate the unoptimized ONNX model:
+
+`python stable_diffusion.py --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --script_dir .\ --provider cpu --format qdq --optimize --only_conversion`
+
+Then generate data:
+
+`python .\evaluation.py --save_data --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --num_inference_steps 25 --seed 0 --num_data 100 --guidance_scale 7.5`
+
+### Optimize
+
+Optimize the ONNX models for better performance. `vae_decoder` and `unet` are per-channel quantized, and `text_encoder` runs in fp16 precision.
+
+`python stable_diffusion.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --provider qnn --format qdq --optimize`
+
+### Test and evaluate
+
+`python .\evaluation.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --num_inference_steps 25 --seed 0 --num_data 100 --guidance_scale 7.5 --provider QNNExecutionProvider --model_dir optimized-qnn_qdq`
-`python .\evaluation.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --num_inference_steps 25 --seed 0 --num_data 100 --guidance_scale 7.5 --provider QNNExecutionProvider --model_dir optimized-qnn_qdq`
+`python .\evaluation.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --num_inference_steps 25 --seed 0 --num_data 100 --guidance_scale 7.5 --provider QNNExecutionProvider --model_dir models/optimized/stable-diffusion-v1-5/stable-diffusion-v1-5`
-`python .\evaluation.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --num_inference_steps 25 --seed 0 --num_data 100 --guidance_scale 7.5 --provider QNNExecutionProvider --model_dir optimized-qnn_qdq`
+`python .\evaluation.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --num_inference_steps 25 --seed 0 --num_data 100 --guidance_scale 7.5 --provider QNNExecutionProvider --model_dir models/optimized/stable-diffusion-v1-5/stable-diffusion-v1-5`
+
+To generate one image:
+
+`python stable_diffusion.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --provider qnn --format qdq --guidance_scale 7.5 --seed 0 --num_inference_steps 25 --prompt "A baby is laying down with a teddy bear"`
+
+### References
+
+[stable-diffusion-v1-4](https://github.com/microsoft/olive-recipes/tree/main/compvis-stable-diffusion-v1-4/olive#readme)
@@ -0,0 +1,98 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        "model_loader": "safety_checker_load",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "clip_input", "images" ],
+            "output_names": [ "out_images", "has_nsfw_concepts" ],
+            "dynamic_axes": {
+                "clip_input": { "0": "batch", "1": "channels", "2": "height", "3": "width" },
+                "images": { "0": "batch", "1": "height", "2": "width", "3": "channels" }
+            }
+        },
+        "dummy_inputs_func": "safety_checker_conversion_inputs"
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ]
+        }
+    },
+    "data_configs": [
+        {
+            "name": "latency_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "safety_checker_data_loader", "batch_size": 1 }
+        }
+    ],
+    "evaluators": {
+        "common_evaluator": {
+            "metrics": [
+                {
+                    "name": "latency",
+                    "type": "latency",
+                    "data_config": "latency_data_config",
+                    "sub_types": [ { "name": "avg" } ]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "convert": { "type": "OnnxConversion", "target_opset": 14 },
+        "ov_convert": {
+            "type": "OpenVINOConversion",
+            "user_script": "user_script.py",
+            "example_input_func": "safety_checker_conversion_inputs",
+            "output_model": "safety_checker"
+        },
+        "optimize": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "unet",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": true,
+                "use_multi_head_attention": true,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": true,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": true,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": true,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": true,
+                "enable_packed_kv": true,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            },
+            "force_fp32_ops": [ "RandomNormalLike" ],
+            "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] }
+        },
+        "optimize_cuda": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "unet",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false
+        }
+    },
+    "log_severity_level": 0,
+    "evaluator": "common_evaluator",
+    "evaluate_input_model": false,
+    "host": "local_system",
+    "target": "local_system",
+    "cache_dir": "cache",
+    "output_dir": "footprints/safety_checker"
+}
@@ -0,0 +1,147 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        "model_loader": "text_encoder_load",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "input_ids" ],
+            "output_names": [ "last_hidden_state", "pooler_output" ],
+            "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } }
+        },
+        "dummy_inputs_func": "text_encoder_conversion_inputs"
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ]
+        }
+    },
+    "data_configs": [
+        {
+            "name": "latency_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "text_encoder_data_loader", "batch_size": 1 }
+        },
+        {
+            "name": "quantize_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "text_encoder_quantize_data_loader", "data_num": 100 }
+        }
+    ],
+    "evaluators": {
+        "common_evaluator": {
+            "metrics": [
+                {
+                    "name": "latency",
+                    "type": "latency",
+                    "data_config": "latency_data_config",
+                    "sub_types": [ { "name": "avg" } ]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "convert": { "type": "OnnxConversion", "target_opset": 17 },
+        "ov_convert": {
+            "type": "OpenVINOConversion",
+            "user_script": "user_script.py",
+            "example_input_func": "text_encoder_conversion_inputs",
+            "output_model": "text_encoder"
+        },
+        "optimize": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "clip",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": true,
+                "use_multi_head_attention": true,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": true,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": true,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": true,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": true,
+                "enable_packed_kv": true,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            },
+            "force_fp32_ops": [ "RandomNormalLike" ],
+            "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] }
+        },
+        "optimize_cuda": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "clip",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [ "batch", "sequence" ],
+            "dim_value": [ 1, 77 ]
+        },
+        "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "ReplaceAttentionMaskValue", "replacement": -200.0 } ] },
+        "optimize_qdq": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "clip",
+            "opt_level": 0,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": false,
+                "use_multi_head_attention": false,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": false,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": false,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": false,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": false,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": false,
+                "enable_packed_kv": false,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            }
+        },
+        "quantization": {
+            "type": "OnnxStaticQuantization",
+            "data_config": "quantize_data_config",
+            "activation_type": "uint16",
+            "precision": "uint8",
+            "calibrate_method": "MinMax",
+            "quant_preprocess": true
+        },
+        "cb": {
+            "type": "EPContextBinaryGenerator",
+            "provider_options": {
+                "htp_graph_finalization_optimization_mode": "3"
+            }
+        }
+    },
+    "log_severity_level": 0,
+    "evaluator": "common_evaluator",
+    "evaluate_input_model": false,
+    "host": "local_system",
+    "target": "local_system",
+    "cache_dir": "cache",
+    "output_dir": "footprints/text_encoder"
+}