From 68631903c1479906e68d0b3edd23e0cfc0ad885f Mon Sep 17 00:00:00 2001 From: "Peiyao Zhao (from Dev Box)" Date: Tue, 2 Sep 2025 17:51:50 +0800 Subject: [PATCH 1/2] add cross-encoder/ms-marco-MiniLM-L-6-v2 --- .aitk/configs/checks.json | 14 +- .aitk/configs/model_list.json | 20 ++ .aitk/scripts/project_processor.py | 1 + cross-encoder-ms-marco-MiniLM-L-6-v2/LICENSE | 201 +++++++++++++ .../aitk/.gitignore | 5 + .../aitk/README.md | 21 ++ .../aitk/_copy.json.config | 18 ++ .../aitk/inference_sample.ipynb | 151 ++++++++++ .../aitk/info.yml | 27 ++ .../aitk/model_project.config | 28 ++ ...marco-MiniLM-L-6-v2_context_ov_static.json | 97 +++++++ ...iniLM-L-6-v2_context_ov_static.json.config | 182 ++++++++++++ .../aitk/ms-marco-MiniLM-L-6-v2_dml.json | 139 +++++++++ .../ms-marco-MiniLM-L-6-v2_dml.json.config | 123 ++++++++ .../aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json | 168 +++++++++++ ...ms-marco-MiniLM-L-6-v2_qdq_amd.json.config | 273 ++++++++++++++++++ .../aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json | 163 +++++++++++ ...ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config | 273 ++++++++++++++++++ .../aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json | 128 ++++++++ .../ms-marco-MiniLM-L-6-v2_trtrtx.json.config | 125 ++++++++ .../aitk/requirements.txt | 4 + .../aitk/user_script.py | 83 ++++++ 22 files changed, 2237 insertions(+), 7 deletions(-) create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/LICENSE create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/.gitignore create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/README.md create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/_copy.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/inference_sample.ipynb create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/info.yml create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/model_project.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json create mode 100644 
cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/requirements.txt create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/user_script.py diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index dd390c5a..4008640c 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,13 +1,13 @@ { - "configCheck": 86, + "configCheck": 91, "extensionCheck": 1, - "gitignoreCheck": 34, + "gitignoreCheck": 35, "inferenceModelCheck": 24, - "ipynbCheck": 34, - "modelProjectCheck": 35, + "ipynbCheck": 35, + "modelProjectCheck": 36, "oliveCheck": 0, - "oliveJsonCheck": 86, - "pathCheck": 640, - "requirementsCheck": 34, + "oliveJsonCheck": 91, + "pathCheck": 708, + "requirementsCheck": 35, "venvRequirementsCheck": 12 } diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index 7fa5917a..ed7e1886 100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -1,5 +1,25 @@ { "models": [ + { + "displayName": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "icon": "HuggingFace", + "modelLink": 
"https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2", + "id": "huggingface/cross-encoder/ms-marco-MiniLM-L-6-v2", + "runtimes": [ + "QNN", + "AMDNPU", + "NvidiaTRTRTX", + "IntelCPU", + "IntelGPU", + "IntelNPU", + "DML" + ], + "architecture": "Transformer", + "status": "Ready", + "relativePath": "cross-encoder-ms-marco-MiniLM-L-6-v2/aitk", + "version": 1, + "p0": true + }, { "displayName": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "icon": "DeepSeek", diff --git a/.aitk/scripts/project_processor.py b/.aitk/scripts/project_processor.py index b84f571f..4940bdfd 100644 --- a/.aitk/scripts/project_processor.py +++ b/.aitk/scripts/project_processor.py @@ -21,6 +21,7 @@ "Qwen": IconEnum.qwen, "meta-llama": IconEnum.Meta, "mistralai": IconEnum.mistralai, + "cross-encoder": IconEnum.HuggingFace } diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/LICENSE b/cross-encoder-ms-marco-MiniLM-L-6-v2/LICENSE new file mode 100644 index 00000000..29f81d81 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/.gitignore b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/.gitignore new file mode 100644 index 00000000..48c03882 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/.gitignore @@ -0,0 +1,5 @@ +__pycache__ +/cache +/history/*/* +!/history/*/history.config +!/history/*/olive_config.json diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/README.md b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/README.md new file mode 100644 index 00000000..46ba8a03 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/README.md @@ -0,0 +1,21 @@ +# BERT Optimization + +This folder contains examples of BERT optimization using different workflows. + +- QDQ for Qualcomm NPU / AMD NPU +- OpenVINO for Intel NPU +- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU + +## QDQ for Qualcomm NPU / AMD NPU + +This workflow quantizes the model. It performs the pipeline: +- *HF Model-> ONNX Model ->Quantized Onnx Model* + +### Latency / Throughput + +| Model Version | Latency (ms/sample) | Throughput (token per second)| Dataset | +|-----------------------|----------------------|------------------------------|---------------| +| PyTorch FP32 | 1162 | 0.81 | facebook/xnli | +| ONNX INT8 (QDQ) | 590 | 1.75 | facebook/xnli | + +*Note: Latency can vary significantly depending on the hardware and system environment. 
The values provided here are for reference only and may not reflect performance on all devices.* diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/_copy.json.config b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/_copy.json.config new file mode 100644 index 00000000..7274f6c6 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/_copy.json.config @@ -0,0 +1,18 @@ +{ + "copies": [ + { + "src": "ms-marco-MiniLM-L-6-v2_qdq_amd.json.config", + "dst": "ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config", + "replacements": [ + { + "find": "bert/google_bert_qdq_vitis_ai.json", + "replace": "bert/google_bert_qdq.json" + }, + { + "find": "Convert to AMD NPU", + "replace": "Convert to Qualcomm NPU" + } + ] + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/inference_sample.ipynb b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/inference_sample.ipynb new file mode 100644 index 00000000..194fbd3c --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/inference_sample.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "\n", + "ExecutionProvider=\"QNNExecutionProvider\"\n", + "if ExecutionProvider == \"OpenVINOExecutionProvider\":\n", + " onnx_model_path = \"./model/openvino_model_st_quant.onnx\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = \"This is an example sentence.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "from transformers import AutoModel, AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def mean_pooling(model_output, attention_mask):\n", + " token_embeddings = 
torch.tensor(model_output[0])\n", + " input_mask_expanded = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()\n", + " return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-multilingual-cased')\n", + "encoded_input = tokenizer(\n", + " inputs,\n", + " padding=\"max_length\",\n", + " max_length=128,\n", + " truncation=True,\n", + " add_special_tokens=True,\n", + " return_tensors=\"pt\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "session = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "input_ids = encoded_input[\"input_ids\"]\n", + "attention_mask = encoded_input[\"attention_mask\"]\n", + "token_type_ids = encoded_input[\"token_type_ids\"]\n", + "inputs = {\n", + " \"input_ids\": input_ids.long().cpu().numpy(),\n", + " \"attention_mask\": attention_mask.long().cpu().numpy(),\n", + " \"token_type_ids\": token_type_ids.long().cpu().numpy()\n", + "}\n", + "\n", + "outputs = session.run(None, inputs)\n", + "embeds_1 = mean_pooling(outputs, 
encoded_input['attention_mask'])\n", + "embeds_1 = F.normalize(embeds_1, p=2, dim=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get text embedding from orinal model, as ground truth.\n", + "model = AutoModel.from_pretrained('google-bert/bert-base-multilingual-cased').eval()\n", + "with torch.no_grad():\n", + " outputs = model(**encoded_input)\n", + " embeds_2 = mean_pooling(outputs, encoded_input['attention_mask'])\n", + " embeds_2 = F.normalize(embeds_2, p=2, dim=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "similarity = F.cosine_similarity(embeds_1, embeds_2).item()\n", + "print(\"Similarity: \", similarity)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/info.yml b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/info.yml new file mode 100644 index 00000000..28be7948 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/info.yml @@ -0,0 +1,27 @@ +keywords: + aitk +arch: bert +recipes: + - file: "ms-marco-MiniLM-L-6-v2_qdq_qnn.json" + device: npu + ep: QNNExecutionProvider + - file: "ms-marco-MiniLM-L-6-v2_qdq_amd.json" + device: npu + ep: VitisAIExecutionProvider + - file: "ms-marco-MiniLM-L-6-v2_context_ov_static.json" + devices: + - npu + - cpu + - gpu + ep: OpenVINOExecutionProvider + - file: "ms-marco-MiniLM-L-6-v2_trtrtx.json" + device: gpu + ep: NvTensorRTRTXExecutionProvider + - file: "ms-marco-MiniLM-L-6-v2_dml.json" + device: gpu + ep: DmlExecutionProvider +aitk: + 
modelInfo: + id: "huggingface/cross-encoder/ms-marco-MiniLM-L-6-v2" + version: 1 + p0: true diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/model_project.config b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/model_project.config new file mode 100644 index 00000000..e99faf9d --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/model_project.config @@ -0,0 +1,28 @@ +{ + "workflows": [ + { + "file": "ms-marco-MiniLM-L-6-v2_qdq_qnn.json", + "templateName": "ms-marco-MiniLM-L-6-v2_qdq_qnn" + }, + { + "file": "ms-marco-MiniLM-L-6-v2_qdq_amd.json", + "templateName": "ms-marco-MiniLM-L-6-v2_qdq_amd" + }, + { + "file": "ms-marco-MiniLM-L-6-v2_context_ov_static.json", + "templateName": "ms-marco-MiniLM-L-6-v2_context_ov_static" + }, + { + "file": "ms-marco-MiniLM-L-6-v2_trtrtx.json", + "templateName": "ms-marco-MiniLM-L-6-v2_trtrtx" + }, + { + "file": "ms-marco-MiniLM-L-6-v2_dml.json", + "templateName": "ms-marco-MiniLM-L-6-v2_dml" + } + ], + "modelInfo": { + "id": "huggingface/cross-encoder/ms-marco-MiniLM-L-6-v2", + "version": 1 + } +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json new file mode 100644 index 00000000..c371fe69 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json @@ -0,0 +1,97 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "task": "fill-mask" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "OpenVINOExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantize_data_config", + "user_script": "user_script.py", + "load_dataset_config": { + "type": "bert_base_multilingual_cased_dataset", + "data_name": "wikipedia", + "split": "train", + "max_samples": 300 + }, + "dataloader_config": { + 
"batch_size": 1, + "drop_last": true + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [ + { "name": "avg", "priority": 1, "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } } + ] + } + ] + } + }, + "passes": { + "optimum_convert": { + "type": "OpenVINOOptimumConversion", + "extra_args": { + "device": "npu", + "task": "feature-extraction" + } + }, + "io_update": { + "type": "OpenVINOIoUpdate", + "input_shapes": [ + [ + 1, + 128 + ], + [ + 1, + 128 + ], + [ + 1, + 128 + ] + ], + "static": true + }, + "ov_quantize": { + "type": "OpenVINOQuantization", + "target_device": "npu", + "data_config": "quantize_data_config", + "model_type": "TRANSFORMER", + "user_script": "user_script.py", + "transform_fn": "custom_transform_func" + }, + "encapsulation": { + "type": "OpenVINOEncapsulation", + "target_device": "npu", + "ov_version": "2025.1" + } + }, + "search_strategy": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "output_dir": "model/bert-base-multilingual-cased_context_ov_static" +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json.config b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json.config new file mode 100644 index 00000000..9ed8f88e --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json.config @@ -0,0 +1,182 @@ +{ + "name": "Convert to Intel CPU/NPU/GPU", + "oliveFile": "bert/openvino/ms-marco-MiniLM-L-6-v2/ms-marco-MiniLM-L-6-v2_context_ov_static.json", + "isIntel": true, + "debugInfo": { + "autoGenerated": true, + "useOpenVINOOptimumConversion": "optimum_convert" + }, + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": 
"Evaluate on", + "type": "enum", + "displayNames": [ + "Intel CPU", + "Intel GPU", + "Intel NPU" + ], + "path": "systems.local_system.accelerators.0.device", + "values": [ + "cpu", + "gpu", + "npu" + ], + "readOnly": false + }, + "runtimeInConversion": { + "autoGenerated": true, + "name": "Convert/Quantize to", + "type": "enum", + "displayNames": [ + "Intel CPU", + "Intel GPU", + "Intel NPU" + ], + "path": "passes.optimum_convert.extra_args.device", + "values": [ + "cpu", + "gpu", + "npu" + ], + "actions": [ + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "cpu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "cpu" + } + ], + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "gpu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "gpu" + } + ], + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "npu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "npu" + } + ] + ] + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.optimum_convert", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "wikipedia" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "wikipedia" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": 
"data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].load_dataset_config.max_samples", + "template": { + "path": "data_configs[0].load_dataset_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.optimum_convert", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json new file mode 100644 index 00000000..741cd73d --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json @@ -0,0 +1,139 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "task": "feature-extraction" + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + }, + "target_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + 
"padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + }, + "transformer_optimizer": { + "type": "OrtTransformersOptimization", + "model_type": "bert", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": false, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": false, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "enable_rotary_embeddings": true + }, + "save_as_external_data": true + } + }, + "host": "host_system", + "target": "target_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert", + "evaluate_input_model": false +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json.config 
b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json.config new file mode 100644 index 00000000..7ee69518 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json.config @@ -0,0 +1,123 @@ +{ + "name": "Convert to DirectML", + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "DirectML" + ], + "path": "systems.target_system.accelerators.0.execution_providers.0", + "values": [ + "DmlExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": 
"EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json new file mode 100644 index 00000000..e9715a38 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json @@ -0,0 +1,168 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "task": "feature-extraction" + }, + "systems": { + "qnn_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "VitisAIExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantization_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + }, + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": 
"latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "transformer_optimizer": { + "type": "orttransformersoptimization", + "model_type": "bert", + "opt_level": 1, + "optimization_options": { + "enable_gelu": true, + "enable_bias_gelu": false, + "enable_layer_norm": true, + "enable_skip_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_attention": false + }, + "save_as_external_data": true + }, + "dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ] + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue" + } + ] + }, + "OnnxQuantization": { + "type": "OnnxStaticQuantization", + "data_config": "quantization_data_config", + "activation_type": "uint16", + "precision": "uint8", + "save_as_external_data": true + }, + "addmetadata": { + "type": "VitisAIAddMetaData", + "config_meta_data_keys": [ + "architectures", + "model_type" + ], + "activation_type": "uint16", + "weight_type": "uint8", + "quant_type": "OnnxStaticQuantization" + } + }, + "host": "qnn_system", + "target": "qnn_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert", + "evaluate_input_model": false +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json.config 
b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json.config new file mode 100644 index 00000000..19476bf7 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json.config @@ -0,0 +1,273 @@ +{ + "name": "Convert to AMD NPU", + "oliveFile": "bert/google_bert_qdq_vitis_ai.json", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "AMD NPU", + "CPU" + ], + "path": "systems.qnn_system.accelerators.0.execution_providers.0", + "values": [ + "VitisAIExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Activation Type", + "tags": [ + "ActivationType" + ], + "description": "Quantization data type of activation. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.activation_type", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.activation_type", + "template": "ActivationType" + } + }, + { + "name": "Weight Type", + "tags": [ + "WeightType" + ], + "description": "Data type for quantizing weights. 
‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.precision", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.precision", + "template": "WeightType" + } + }, + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Subset", + "tags": [ + "QuantizationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "QuantizationDatasetSubset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.OnnxQuantization", + 
"actions": [ + [], + [ + { + "type": "update", + "path": "passes", + "value": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + } + } + } + ] + ] + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[1].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json 
b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json new file mode 100644 index 00000000..c2000412 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json @@ -0,0 +1,163 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "task": "feature-extraction" + }, + "systems": { + "qnn_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantization_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + }, + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + }, + "to_fixed_shape": { 
+ "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ] + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue", + "replacement": -100.0 + }, + { + "surgeon": "MatMulAddToGemm" + } + ] + }, + "transformer_optimizer": { + "type": "OrtTransformersOptimization", + "model_type": "bert", + "opt_level": 1, + "optimization_options": { + "enable_gelu": true, + "enable_bias_gelu": false, + "enable_layer_norm": true, + "enable_skip_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_attention": false + }, + "save_as_external_data": true + }, + "OnnxQuantization": { + "type": "OnnxStaticQuantization", + "data_config": "quantization_data_config", + "quant_preprocess": true, + "activation_type": "uint16", + "precision": "uint8", + "save_as_external_data": true + } + }, + "host": "qnn_system", + "target": "qnn_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert", + "evaluate_input_model": false +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config new file mode 100644 index 00000000..45b6868c --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config @@ -0,0 +1,273 @@ +{ + "name": "Convert to Qualcomm NPU", + "oliveFile": "bert/google_bert_qdq.json", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "Qualcomm NPU", + "CPU" + ], + "path": "systems.qnn_system.accelerators.0.execution_providers.0", + "values": [ + "QNNExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX 
format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Activation Type", + "tags": [ + "ActivationType" + ], + "description": "Quantization data type of activation. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.activation_type", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.activation_type", + "template": "ActivationType" + } + }, + { + "name": "Weight Type", + "tags": [ + "WeightType" + ], + "description": "Data type for quantizing weights. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.precision", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.precision", + "template": "WeightType" + } + }, + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Subset", + "tags": [ + "QuantizationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + 
"en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "QuantizationDatasetSubset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.OnnxQuantization", + "actions": [ + [], + [ + { + "type": "update", + "path": "passes", + "value": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + } + } + } + ] + ] + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", 
+ "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[1].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json new file mode 100644 index 00000000..b4714fe2 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json @@ -0,0 +1,128 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "task": "feature-extraction" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "NvTensorRTRTXExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "xnli", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "xnli", + "sub_types": [ + { + "name": "avg", + "priority": 
1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "xnli", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "onnx_float_to_float16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true + }, + "dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ] + }, + "surgery": { + "type": "GraphSurgeries", + "save_as_external_data": true, + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue" + } + ] + }, + "session_params_tuning": { + "type": "OrtSessionParamsTuning", + "io_bind": false, + "data_config": "xnli" + } + }, + "host": "local_system", + "target": "local_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert_trtrtx", + "log_severity_level": 0, + "evaluate_input_model": false +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json.config b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json.config new file mode 100644 index 00000000..90a60833 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json.config @@ -0,0 +1,125 @@ +{ + "name": "Convert to NVIDIA TRT for RTX", + "oliveFile": "bert/google_bert_trtrtx.json", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "NVIDIA TensorRT for RTX", + "CPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "NvTensorRTRTXExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + 
"name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/requirements.txt 
# -------------------------------------------------------------------------
# Copyright (c) Intel Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""Olive user script for the cross-encoder/ms-marco-MiniLM-L-6-v2 example.

Provides the registered calibration dataset plus the custom transform /
example-input helpers referenced by the workflow JSON configs in this folder.
"""
import datasets
import numpy as np
import torch
from transformers import BertTokenizer

from olive.data.registry import Registry

# -------------------------------------------------------------------------
# Common Dataset
# -------------------------------------------------------------------------

# Seed torch for reproducibility, https://pytorch.org/docs/stable/notes/randomness.html
# random.seed / np.random.seed are deliberately NOT set: doing so caused
# AML job name conflicts in the AML tests.
seed = 0
torch.manual_seed(seed)
# the following are needed only for GPU
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Max sequence length used for padding/truncation throughout this script;
# must stay in sync with the fixed [1, 128] shapes in the workflow JSONs.
MAX_SEQ_LENGTH = 128

# Tokenizer used to build calibration data.
# BUGFIX: the original loaded "google-bert/bert-base-multilingual-cased"
# (vocab ~119k), copied from the google_bert example; those token ids exceed
# this model's 30522-entry embedding table and would index out of range
# during calibration. Load the target model's own tokenizer instead.
tokenizer = BertTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2")
VOCAB_SIZE = len(tokenizer)

# Default input: fixed all-ones [1, MAX_SEQ_LENGTH] int64 tensor.
default_input = torch.ones(1, MAX_SEQ_LENGTH, dtype=torch.int64)

# Model inputs in the order expected by the exported ONNX graph.
model_inputs = {
    "input_ids": default_input,
    "attention_mask": default_input,
    "token_type_ids": default_input,
}

# Input names, preserving model_inputs insertion order.
INPUT_NAMES = list(model_inputs)


@Registry.register_dataset()
def bert_base_multilingual_cased_dataset(data_name, split, max_samples):
    """Load and tokenize a slice of a Wikipedia-style dataset for calibration.

    The registered name is kept as-is for backward compatibility: the
    workflow JSON configs reference the dataset by this name.

    Args:
        data_name: Hugging Face dataset id (e.g. "wikipedia").
        split: dataset split name; only the first ``max_samples`` rows are
            loaded (via split slicing) to keep calibration fast.
        max_samples: number of examples to take from the head of the split.

    Returns:
        The tokenized dataset, padded/truncated to MAX_SEQ_LENGTH.
    """
    # "20220301.en" is the Wikipedia dump configuration name.
    raw_dataset = datasets.load_dataset(
        data_name, "20220301.en", split=f"{split}[:{max_samples}]", trust_remote_code=True
    )

    def _preprocess_fn(examples):
        # Tokenize a batch of raw text rows to fixed-length model inputs.
        return tokenizer(
            examples["text"],
            padding="max_length",
            max_length=MAX_SEQ_LENGTH,
            truncation=True,
        )

    # preprocess the dataset
    return raw_dataset.map(_preprocess_fn, batched=True, batch_size=1)


def custom_transform_func(data_item):
    """Flatten each model input in *data_item* into a [1, N] int64 numpy array."""
    return {
        name: np.asarray([np.array([g.flatten() for g in data_item[name]]).flatten()], dtype=np.int64)
        for name in INPUT_NAMES
    }


def custom_example_func():
    """Build one example batch: [input_ids, attention_mask, token_type_ids].

    input_ids are random token ids (seeded at module import); attention_mask
    and token_type_ids are the fixed all-ones ``default_input`` — they are NOT
    random, so every position is treated as a real token of segment 1.
    """
    # Random token ids in [0, VOCAB_SIZE) with shape [1, MAX_SEQ_LENGTH].
    input_ids = torch.randint(0, VOCAB_SIZE, (1, MAX_SEQ_LENGTH))

    # Fixed all-ones mask and segment ids (shared default_input tensor).
    attention_mask = default_input
    token_type_ids = default_input

    return [input_ids, attention_mask, token_type_ids]
+This folder contains examples of optimization for `cross-encoder/ms-marco-MiniLM-L-6-v2` across multiple runtimes. -- QDQ for Qualcomm NPU / AMD NPU +- QDQ for AMD NPU +- QDQ for Qualcomm NPU - OpenVINO for Intel NPU -- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +- Float downcasting for NVIDIA TRT (RTX) / DML for general GPU -## QDQ for Qualcomm NPU / AMD NPU +## QDQ for AMD NPU -This workflow quantizes the model. It performs the pipeline: -- *HF Model-> ONNX Model ->Quantized Onnx Model* +Quantize and package the model for AMD NPU. -### Latency / Throughput +- Pipeline: *HuggingFace Model -> ONNX -> Quantized ONNX* +- Configuration File: `ms-marco-MiniLM-L-6-v2_qdq_amd.json` ([aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json](cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json)) -| Model Version | Latency (ms/sample) | Throughput (token per second)| Dataset | -|-----------------------|----------------------|------------------------------|---------------| -| PyTorch FP32 | 1162 | 0.81 | facebook/xnli | -| ONNX INT8 (QDQ) | 590 | 1.75 | facebook/xnli | +Key features: +- Produce VitisAI-ready artifacts for AMD NPU deployment. +- Uses `cross-encoder/ms-marco-MiniLM-L-6-v2` (pairwise scoring); inputs padded to 128 tokens. +- Calibration/evaluation with XNLI (en) "premise" (≤10 samples). +- ONNX export + transformer optimizations; static quantization to reduce latency. -*Note: Latency can vary significantly depending on the hardware and system environment. The values provided here are for reference only and may not reflect performance on all devices.* +## QDQ for Qualcomm NPU + +Quantize the model for Qualcomm NPU runtimes. + +- Pipeline: *HuggingFace Model -> ONNX -> Quantized ONNX* +- Configuration File: `ms-marco-MiniLM-L-6-v2_qdq_qnn.json` ([aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json](cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json)) + +Key features: +- Target: Qualcomm NPU (QNN). 
+- Calibration/evaluation uses XNLI (en) "premise" (≤10 samples, pad to 128). +- ONNX export (opset 20), fix dynamic shapes to [1,128], apply light graph fusions, then static quantization (activations/weights tuned for QNN). + +## OpenVINO (Intel NPU) + +Convert and quantize the model for OpenVINO. + +- Pipeline: *HuggingFace Model -> ONNX -> OpenVINO quantized model* +- Configuration File: `ms-marco-MiniLM-L-6-v2_context_ov_static.json` ([aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json](cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json)) + +Key features: +- OpenVINO Optimum conversion and encapsulation for Intel NPU. +- Calibration uses Wikipedia (train, 300 samples) with a custom transform for transformer inputs. +- Enforces static I/O (three inputs of [1,128]) and applies transformer-specific quantization. + +## Float downcasting for NVIDIA TRT / DML + +FP16 export for GPU backends to improve throughput. + +- Pipeline: *HuggingFace Model -> ONNX -> FP16 ONNX* +- Configuration Files: + - TRT (RTX): `ms-marco-MiniLM-L-6-v2_trtrtx.json` ([aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json](cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json)) + - DML: `ms-marco-MiniLM-L-6-v2_dml.json` ([aitk/ms-marco-MiniLM-L-6-v2_dml.json](cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json)) + +Key features: +- FP16 conversion for TensorRT and DML-optimized exports for GPU inference. +- Fixed-shape [1,128] inputs and transformer optimizations for stable latency/throughput. +- Evaluate latency and throughput on XNLI example inputs. + +## Dataset Information + +### Quantization Datasets +- **QNN/AMD NPU**: XNLI (English validation) "premise" (≤10 samples, padded to 128). +- **Intel NPU (OpenVINO)**: Wikipedia train (300 samples) with transformer-specific preprocessing. + +### Evaluation Datasets +- **Primary**: XNLI (English) validation for quick latency/throughput checks. 
+- **Metric**: Latency and throughput on XNLI example inputs; pairwise ranking scores where a ranking evaluation script is used.
+- **Benchmark**: Use task-appropriate benchmarks (MS MARCO for ranking quality, XNLI for the latency/throughput examples here).
+
+## Performance Evaluation Results
+The following results are based on comprehensive evaluation using standard latency and throughput benchmarks and performance metrics.
+
+### Qualcomm NPU (QNN) Performance
+
+| Metric | Value |
+|--------|-------|
+| **Latency (avg)** | 5.59 ms |
+| **Latency (min)** | 4.87 ms |
+| **Latency (max)** | 7.13 ms |
+| **Throughput (avg)** | 190.41 tokens/sec |
+| **Throughput (max)** | 206.31 tokens/sec |
+| **Throughput (min)** | 138.28 tokens/sec |
+
+### AMD NPU Performance
+
+| Metric | Value |
+|--------|-------|
+| **Latency (avg)** | 5.78 ms |
+| **Latency (min)** | 4.92 ms |
+| **Latency (max)** | 7.77 ms |
+| **Throughput (avg)** | 186.78 tokens/sec |
+| **Throughput (max)** | 229.40 tokens/sec |
+| **Throughput (min)** | 137.03 tokens/sec |
+
+### Intel NPU Performance
+
+| Metric | Value |
+|--------|-------|
+| **Latency (avg)** | 2.59 ms |
+| **Latency (p90)** | 3.26 ms |
+| **Similarity** | 0.9830 |
+
+### TRT Performance
+
+| Metric | Value |
+|--------|-------|
+| **Latency (avg)** | 0.63 ms |
+| **Latency (min)** | 0.59 ms |
+| **Latency (max)** | 0.72 ms |
+| **Throughput (avg)** | 780.33 tokens/sec |
+| **Throughput (max)** | 1557.39 tokens/sec |
+| **Throughput (min)** | 171.56 tokens/sec |
+
+### DML Performance
+
+| Metric | Value |
+|--------|-------|
+| **Latency (max)** | 2.02 ms |
+| **Latency (min)** | 1.34 ms |
+| **Throughput (avg)** | 684.74 tokens/sec |
+| **Throughput (max)** | 721.03 tokens/sec |
+| **Throughput (min)** | 
560.07 tokens/sec | + +## Notes +- Model: `cross-encoder/ms-marco-MiniLM-L-6-v2` (pairwise cross-encoder for ranking). +- Use the listed config files to reproduce conversion, quantization, and benchmark runs. \ No newline at end of file