From 68631903c1479906e68d0b3edd23e0cfc0ad885f Mon Sep 17 00:00:00 2001 From: "Peiyao Zhao (from Dev Box)" Date: Tue, 2 Sep 2025 17:51:50 +0800 Subject: [PATCH 1/2] add cross-encoder/ms-marco-MiniLM-L-6-v2 --- .aitk/configs/checks.json | 14 +- .aitk/configs/model_list.json | 20 ++ .aitk/scripts/project_processor.py | 1 + cross-encoder-ms-marco-MiniLM-L-6-v2/LICENSE | 201 +++++++++++++ .../aitk/.gitignore | 5 + .../aitk/README.md | 21 ++ .../aitk/_copy.json.config | 18 ++ .../aitk/inference_sample.ipynb | 151 ++++++++++ .../aitk/info.yml | 27 ++ .../aitk/model_project.config | 28 ++ ...marco-MiniLM-L-6-v2_context_ov_static.json | 97 +++++++ ...iniLM-L-6-v2_context_ov_static.json.config | 182 ++++++++++++ .../aitk/ms-marco-MiniLM-L-6-v2_dml.json | 139 +++++++++ .../ms-marco-MiniLM-L-6-v2_dml.json.config | 123 ++++++++ .../aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json | 168 +++++++++++ ...ms-marco-MiniLM-L-6-v2_qdq_amd.json.config | 273 ++++++++++++++++++ .../aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json | 163 +++++++++++ ...ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config | 273 ++++++++++++++++++ .../aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json | 128 ++++++++ .../ms-marco-MiniLM-L-6-v2_trtrtx.json.config | 125 ++++++++ .../aitk/requirements.txt | 4 + .../aitk/user_script.py | 83 ++++++ 22 files changed, 2237 insertions(+), 7 deletions(-) create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/LICENSE create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/.gitignore create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/README.md create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/_copy.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/inference_sample.ipynb create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/info.yml create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/model_project.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json create mode 100644 
cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json.config create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/requirements.txt create mode 100644 cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/user_script.py diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index dd390c5a..4008640c 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,13 +1,13 @@ { - "configCheck": 86, + "configCheck": 91, "extensionCheck": 1, - "gitignoreCheck": 34, + "gitignoreCheck": 35, "inferenceModelCheck": 24, - "ipynbCheck": 34, - "modelProjectCheck": 35, + "ipynbCheck": 35, + "modelProjectCheck": 36, "oliveCheck": 0, - "oliveJsonCheck": 86, - "pathCheck": 640, - "requirementsCheck": 34, + "oliveJsonCheck": 91, + "pathCheck": 708, + "requirementsCheck": 35, "venvRequirementsCheck": 12 } diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index 7fa5917a..ed7e1886 100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -1,5 +1,25 @@ { "models": [ + { + "displayName": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "icon": "HuggingFace", + "modelLink": 
"https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2", + "id": "huggingface/cross-encoder/ms-marco-MiniLM-L-6-v2", + "runtimes": [ + "QNN", + "AMDNPU", + "NvidiaTRTRTX", + "IntelCPU", + "IntelGPU", + "IntelNPU", + "DML" + ], + "architecture": "Transformer", + "status": "Ready", + "relativePath": "cross-encoder-ms-marco-MiniLM-L-6-v2/aitk", + "version": 1, + "p0": true + }, { "displayName": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "icon": "DeepSeek", diff --git a/.aitk/scripts/project_processor.py b/.aitk/scripts/project_processor.py index b84f571f..4940bdfd 100644 --- a/.aitk/scripts/project_processor.py +++ b/.aitk/scripts/project_processor.py @@ -21,6 +21,7 @@ "Qwen": IconEnum.qwen, "meta-llama": IconEnum.Meta, "mistralai": IconEnum.mistralai, + "cross-encoder": IconEnum.HuggingFace } diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/LICENSE b/cross-encoder-ms-marco-MiniLM-L-6-v2/LICENSE new file mode 100644 index 00000000..29f81d81 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/.gitignore b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/.gitignore new file mode 100644 index 00000000..48c03882 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/.gitignore @@ -0,0 +1,5 @@ +__pycache__ +/cache +/history/*/* +!/history/*/history.config +!/history/*/olive_config.json diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/README.md b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/README.md new file mode 100644 index 00000000..46ba8a03 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/README.md @@ -0,0 +1,21 @@ +# BERT Optimization + +This folder contains examples of BERT optimization using different workflows. + +- QDQ for Qualcomm NPU / AMD NPU +- OpenVINO for Intel NPU +- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU + +## QDQ for Qualcomm NPU / AMD NPU + +This workflow quantizes the model. It performs the pipeline: +- *HF Model-> ONNX Model ->Quantized Onnx Model* + +### Latency / Throughput + +| Model Version | Latency (ms/sample) | Throughput (token per second)| Dataset | +|-----------------------|----------------------|------------------------------|---------------| +| PyTorch FP32 | 1162 | 0.81 | facebook/xnli | +| ONNX INT8 (QDQ) | 590 | 1.75 | facebook/xnli | + +*Note: Latency can vary significantly depending on the hardware and system environment. 
The values provided here are for reference only and may not reflect performance on all devices.* diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/_copy.json.config b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/_copy.json.config new file mode 100644 index 00000000..7274f6c6 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/_copy.json.config @@ -0,0 +1,18 @@ +{ + "copies": [ + { + "src": "ms-marco-MiniLM-L-6-v2_qdq_amd.json.config", + "dst": "ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config", + "replacements": [ + { + "find": "bert/google_bert_qdq_vitis_ai.json", + "replace": "bert/google_bert_qdq.json" + }, + { + "find": "Convert to AMD NPU", + "replace": "Convert to Qualcomm NPU" + } + ] + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/inference_sample.ipynb b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/inference_sample.ipynb new file mode 100644 index 00000000..194fbd3c --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/inference_sample.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "\n", + "ExecutionProvider=\"QNNExecutionProvider\"\n", + "if ExecutionProvider == \"OpenVINOExecutionProvider\":\n", + " onnx_model_path = \"./model/openvino_model_st_quant.onnx\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = \"This is an example sentence.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "from transformers import AutoModel, AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def mean_pooling(model_output, attention_mask):\n", + " token_embeddings = 
torch.tensor(model_output[0])\n", + " input_mask_expanded = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()\n", + " return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-multilingual-cased')\n", + "encoded_input = tokenizer(\n", + " inputs,\n", + " padding=\"max_length\",\n", + " max_length=128,\n", + " truncation=True,\n", + " add_special_tokens=True,\n", + " return_tensors=\"pt\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "session = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "input_ids = encoded_input[\"input_ids\"]\n", + "attention_mask = encoded_input[\"attention_mask\"]\n", + "token_type_ids = encoded_input[\"token_type_ids\"]\n", + "inputs = {\n", + " \"input_ids\": input_ids.long().cpu().numpy(),\n", + " \"attention_mask\": attention_mask.long().cpu().numpy(),\n", + " \"token_type_ids\": token_type_ids.long().cpu().numpy()\n", + "}\n", + "\n", + "outputs = session.run(None, inputs)\n", + "embeds_1 = mean_pooling(outputs, 
encoded_input['attention_mask'])\n", + "embeds_1 = F.normalize(embeds_1, p=2, dim=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get text embedding from orinal model, as ground truth.\n", + "model = AutoModel.from_pretrained('google-bert/bert-base-multilingual-cased').eval()\n", + "with torch.no_grad():\n", + " outputs = model(**encoded_input)\n", + " embeds_2 = mean_pooling(outputs, encoded_input['attention_mask'])\n", + " embeds_2 = F.normalize(embeds_2, p=2, dim=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "similarity = F.cosine_similarity(embeds_1, embeds_2).item()\n", + "print(\"Similarity: \", similarity)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/info.yml b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/info.yml new file mode 100644 index 00000000..28be7948 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/info.yml @@ -0,0 +1,27 @@ +keywords: + aitk +arch: bert +recipes: + - file: "ms-marco-MiniLM-L-6-v2_qdq_qnn.json" + device: npu + ep: QNNExecutionProvider + - file: "ms-marco-MiniLM-L-6-v2_qdq_amd.json" + device: npu + ep: VitisAIExecutionProvider + - file: "ms-marco-MiniLM-L-6-v2_context_ov_static.json" + devices: + - npu + - cpu + - gpu + ep: OpenVINOExecutionProvider + - file: "ms-marco-MiniLM-L-6-v2_trtrtx.json" + device: gpu + ep: NvTensorRTRTXExecutionProvider + - file: "ms-marco-MiniLM-L-6-v2_dml.json" + device: gpu + ep: DmlExecutionProvider +aitk: + 
modelInfo: + id: "huggingface/cross-encoder/ms-marco-MiniLM-L-6-v2" + version: 1 + p0: true diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/model_project.config b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/model_project.config new file mode 100644 index 00000000..e99faf9d --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/model_project.config @@ -0,0 +1,28 @@ +{ + "workflows": [ + { + "file": "ms-marco-MiniLM-L-6-v2_qdq_qnn.json", + "templateName": "ms-marco-MiniLM-L-6-v2_qdq_qnn" + }, + { + "file": "ms-marco-MiniLM-L-6-v2_qdq_amd.json", + "templateName": "ms-marco-MiniLM-L-6-v2_qdq_amd" + }, + { + "file": "ms-marco-MiniLM-L-6-v2_context_ov_static.json", + "templateName": "ms-marco-MiniLM-L-6-v2_context_ov_static" + }, + { + "file": "ms-marco-MiniLM-L-6-v2_trtrtx.json", + "templateName": "ms-marco-MiniLM-L-6-v2_trtrtx" + }, + { + "file": "ms-marco-MiniLM-L-6-v2_dml.json", + "templateName": "ms-marco-MiniLM-L-6-v2_dml" + } + ], + "modelInfo": { + "id": "huggingface/cross-encoder/ms-marco-MiniLM-L-6-v2", + "version": 1 + } +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json new file mode 100644 index 00000000..c371fe69 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json @@ -0,0 +1,97 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "task": "fill-mask" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "OpenVINOExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantize_data_config", + "user_script": "user_script.py", + "load_dataset_config": { + "type": "bert_base_multilingual_cased_dataset", + "data_name": "wikipedia", + "split": "train", + "max_samples": 300 + }, + "dataloader_config": { + 
"batch_size": 1, + "drop_last": true + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [ + { "name": "avg", "priority": 1, "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } } + ] + } + ] + } + }, + "passes": { + "optimum_convert": { + "type": "OpenVINOOptimumConversion", + "extra_args": { + "device": "npu", + "task": "feature-extraction" + } + }, + "io_update": { + "type": "OpenVINOIoUpdate", + "input_shapes": [ + [ + 1, + 128 + ], + [ + 1, + 128 + ], + [ + 1, + 128 + ] + ], + "static": true + }, + "ov_quantize": { + "type": "OpenVINOQuantization", + "target_device": "npu", + "data_config": "quantize_data_config", + "model_type": "TRANSFORMER", + "user_script": "user_script.py", + "transform_fn": "custom_transform_func" + }, + "encapsulation": { + "type": "OpenVINOEncapsulation", + "target_device": "npu", + "ov_version": "2025.1" + } + }, + "search_strategy": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "output_dir": "model/bert-base-multilingual-cased_context_ov_static" +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json.config b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json.config new file mode 100644 index 00000000..9ed8f88e --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json.config @@ -0,0 +1,182 @@ +{ + "name": "Convert to Intel CPU/NPU/GPU", + "oliveFile": "bert/openvino/ms-marco-MiniLM-L-6-v2/ms-marco-MiniLM-L-6-v2_context_ov_static.json", + "isIntel": true, + "debugInfo": { + "autoGenerated": true, + "useOpenVINOOptimumConversion": "optimum_convert" + }, + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": 
"Evaluate on", + "type": "enum", + "displayNames": [ + "Intel CPU", + "Intel GPU", + "Intel NPU" + ], + "path": "systems.local_system.accelerators.0.device", + "values": [ + "cpu", + "gpu", + "npu" + ], + "readOnly": false + }, + "runtimeInConversion": { + "autoGenerated": true, + "name": "Convert/Quantize to", + "type": "enum", + "displayNames": [ + "Intel CPU", + "Intel GPU", + "Intel NPU" + ], + "path": "passes.optimum_convert.extra_args.device", + "values": [ + "cpu", + "gpu", + "npu" + ], + "actions": [ + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "cpu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "cpu" + } + ], + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "gpu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "gpu" + } + ], + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "npu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "npu" + } + ] + ] + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.optimum_convert", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "wikipedia" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "wikipedia" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": 
"data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].load_dataset_config.max_samples", + "template": { + "path": "data_configs[0].load_dataset_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.optimum_convert", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json new file mode 100644 index 00000000..741cd73d --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json @@ -0,0 +1,139 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "task": "feature-extraction" + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + }, + "target_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + 
"padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + }, + "transformer_optimizer": { + "type": "OrtTransformersOptimization", + "model_type": "bert", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": false, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": false, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "enable_rotary_embeddings": true + }, + "save_as_external_data": true + } + }, + "host": "host_system", + "target": "target_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert", + "evaluate_input_model": false +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json.config 
b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json.config new file mode 100644 index 00000000..7ee69518 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json.config @@ -0,0 +1,123 @@ +{ + "name": "Convert to DirectML", + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "DirectML" + ], + "path": "systems.target_system.accelerators.0.execution_providers.0", + "values": [ + "DmlExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": 
"EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json new file mode 100644 index 00000000..e9715a38 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json @@ -0,0 +1,168 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "task": "feature-extraction" + }, + "systems": { + "qnn_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "VitisAIExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantization_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + }, + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": 
"latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "transformer_optimizer": { + "type": "orttransformersoptimization", + "model_type": "bert", + "opt_level": 1, + "optimization_options": { + "enable_gelu": true, + "enable_bias_gelu": false, + "enable_layer_norm": true, + "enable_skip_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_attention": false + }, + "save_as_external_data": true + }, + "dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ] + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue" + } + ] + }, + "OnnxQuantization": { + "type": "OnnxStaticQuantization", + "data_config": "quantization_data_config", + "activation_type": "uint16", + "precision": "uint8", + "save_as_external_data": true + }, + "addmetadata": { + "type": "VitisAIAddMetaData", + "config_meta_data_keys": [ + "architectures", + "model_type" + ], + "activation_type": "uint16", + "weight_type": "uint8", + "quant_type": "OnnxStaticQuantization" + } + }, + "host": "qnn_system", + "target": "qnn_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert", + "evaluate_input_model": false +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json.config 
b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json.config new file mode 100644 index 00000000..19476bf7 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json.config @@ -0,0 +1,273 @@ +{ + "name": "Convert to AMD NPU", + "oliveFile": "bert/google_bert_qdq_vitis_ai.json", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "AMD NPU", + "CPU" + ], + "path": "systems.qnn_system.accelerators.0.execution_providers.0", + "values": [ + "VitisAIExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Activation Type", + "tags": [ + "ActivationType" + ], + "description": "Quantization data type of activation. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.activation_type", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.activation_type", + "template": "ActivationType" + } + }, + { + "name": "Weight Type", + "tags": [ + "WeightType" + ], + "description": "Data type for quantizing weights. 
‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.precision", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.precision", + "template": "WeightType" + } + }, + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Subset", + "tags": [ + "QuantizationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "QuantizationDatasetSubset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.OnnxQuantization", + 
"actions": [ + [], + [ + { + "type": "update", + "path": "passes", + "value": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + } + } + } + ] + ] + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[1].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json 
b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json new file mode 100644 index 00000000..c2000412 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json @@ -0,0 +1,163 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "task": "feature-extraction" + }, + "systems": { + "qnn_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantization_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + }, + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + }, + "to_fixed_shape": { 
+ "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ] + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue", + "replacement": -100.0 + }, + { + "surgeon": "MatMulAddToGemm" + } + ] + }, + "transformer_optimizer": { + "type": "OrtTransformersOptimization", + "model_type": "bert", + "opt_level": 1, + "optimization_options": { + "enable_gelu": true, + "enable_bias_gelu": false, + "enable_layer_norm": true, + "enable_skip_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_attention": false + }, + "save_as_external_data": true + }, + "OnnxQuantization": { + "type": "OnnxStaticQuantization", + "data_config": "quantization_data_config", + "quant_preprocess": true, + "activation_type": "uint16", + "precision": "uint8", + "save_as_external_data": true + } + }, + "host": "qnn_system", + "target": "qnn_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert", + "evaluate_input_model": false +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config new file mode 100644 index 00000000..45b6868c --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json.config @@ -0,0 +1,273 @@ +{ + "name": "Convert to Qualcomm NPU", + "oliveFile": "bert/google_bert_qdq.json", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "Qualcomm NPU", + "CPU" + ], + "path": "systems.qnn_system.accelerators.0.execution_providers.0", + "values": [ + "QNNExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX 
format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Activation Type", + "tags": [ + "ActivationType" + ], + "description": "Quantization data type of activation. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.activation_type", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.activation_type", + "template": "ActivationType" + } + }, + { + "name": "Weight Type", + "tags": [ + "WeightType" + ], + "description": "Data type for quantizing weights. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.precision", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.precision", + "template": "WeightType" + } + }, + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Subset", + "tags": [ + "QuantizationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + 
"en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "QuantizationDatasetSubset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.OnnxQuantization", + "actions": [ + [], + [ + { + "type": "update", + "path": "passes", + "value": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + } + } + } + ] + ] + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", 
+ "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[1].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json new file mode 100644 index 00000000..b4714fe2 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json @@ -0,0 +1,128 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "task": "feature-extraction" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "NvTensorRTRTXExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "xnli", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "xnli", + "sub_types": [ + { + "name": "avg", + "priority": 
1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "xnli", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "onnx_float_to_float16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true + }, + "dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ] + }, + "surgery": { + "type": "GraphSurgeries", + "save_as_external_data": true, + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue" + } + ] + }, + "session_params_tuning": { + "type": "OrtSessionParamsTuning", + "io_bind": false, + "data_config": "xnli" + } + }, + "host": "local_system", + "target": "local_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert_trtrtx", + "log_severity_level": 0, + "evaluate_input_model": false +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json.config b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json.config new file mode 100644 index 00000000..90a60833 --- /dev/null +++ b/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json.config @@ -0,0 +1,125 @@ +{ + "name": "Convert to NVIDIA TRT for RTX", + "oliveFile": "bert/google_bert_trtrtx.json", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "NVIDIA TensorRT for RTX", + "CPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "NvTensorRTRTXExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + 
"name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/requirements.txt 
# -------------------------------------------------------------------------
# Copyright (c) Intel Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""Olive user script for the cross-encoder/ms-marco-MiniLM-L-6-v2 example.

Provides the registered calibration dataset plus the custom transform /
example-input helpers referenced by the workflow JSON configs in this folder.
"""
import datasets
import numpy as np
import torch
from transformers import BertTokenizer

from olive.data.registry import Registry

# -------------------------------------------------------------------------
# Common Dataset
# -------------------------------------------------------------------------

# Seed torch for reproducibility, https://pytorch.org/docs/stable/notes/randomness.html
# random.seed / np.random.seed are deliberately NOT set: doing so caused
# AML job name conflicts in the AML tests.
seed = 0
torch.manual_seed(seed)
# the following are needed only for GPU
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Max sequence length used for padding/truncation throughout this script;
# must stay in sync with the fixed [1, 128] shapes in the workflow JSONs.
MAX_SEQ_LENGTH = 128

# Tokenizer used to build calibration data.
# BUGFIX: the original loaded "google-bert/bert-base-multilingual-cased"
# (vocab ~119k), copied from the google_bert example; those token ids exceed
# this model's 30522-entry embedding table and would index out of range
# during calibration. Load the target model's own tokenizer instead.
tokenizer = BertTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2")
VOCAB_SIZE = len(tokenizer)

# Default input: fixed all-ones [1, MAX_SEQ_LENGTH] int64 tensor.
default_input = torch.ones(1, MAX_SEQ_LENGTH, dtype=torch.int64)

# Model inputs in the order expected by the exported ONNX graph.
model_inputs = {
    "input_ids": default_input,
    "attention_mask": default_input,
    "token_type_ids": default_input,
}

# Input names, preserving model_inputs insertion order.
INPUT_NAMES = list(model_inputs)


@Registry.register_dataset()
def bert_base_multilingual_cased_dataset(data_name, split, max_samples):
    """Load and tokenize a slice of a Wikipedia-style dataset for calibration.

    The registered name is kept as-is for backward compatibility: the
    workflow JSON configs reference the dataset by this name.

    Args:
        data_name: Hugging Face dataset id (e.g. "wikipedia").
        split: dataset split name; only the first ``max_samples`` rows are
            loaded (via split slicing) to keep calibration fast.
        max_samples: number of examples to take from the head of the split.

    Returns:
        The tokenized dataset, padded/truncated to MAX_SEQ_LENGTH.
    """
    # "20220301.en" is the Wikipedia dump configuration name.
    raw_dataset = datasets.load_dataset(
        data_name, "20220301.en", split=f"{split}[:{max_samples}]", trust_remote_code=True
    )

    def _preprocess_fn(examples):
        # Tokenize a batch of raw text rows to fixed-length model inputs.
        return tokenizer(
            examples["text"],
            padding="max_length",
            max_length=MAX_SEQ_LENGTH,
            truncation=True,
        )

    # preprocess the dataset
    return raw_dataset.map(_preprocess_fn, batched=True, batch_size=1)


def custom_transform_func(data_item):
    """Flatten each model input in *data_item* into a [1, N] int64 numpy array."""
    return {
        name: np.asarray([np.array([g.flatten() for g in data_item[name]]).flatten()], dtype=np.int64)
        for name in INPUT_NAMES
    }


def custom_example_func():
    """Build one example batch: [input_ids, attention_mask, token_type_ids].

    input_ids are random token ids (seeded at module import); attention_mask
    and token_type_ids are the fixed all-ones ``default_input`` — they are NOT
    random, so every position is treated as a real token of segment 1.
    """
    # Random token ids in [0, VOCAB_SIZE) with shape [1, MAX_SEQ_LENGTH].
    input_ids = torch.randint(0, VOCAB_SIZE, (1, MAX_SEQ_LENGTH))

    # Fixed all-ones mask and segment ids (shared default_input tensor).
    attention_mask = default_input
    token_type_ids = default_input

    return [input_ids, attention_mask, token_type_ids]
+This folder contains examples of optimization for `cross-encoder/ms-marco-MiniLM-L-6-v2` across multiple runtimes. -- QDQ for Qualcomm NPU / AMD NPU +- QDQ for AMD NPU +- QDQ for Qualcomm NPU - OpenVINO for Intel NPU -- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +- Float downcasting for NVIDIA TRT (RTX) / DML for general GPU -## QDQ for Qualcomm NPU / AMD NPU +## QDQ for AMD NPU -This workflow quantizes the model. It performs the pipeline: -- *HF Model-> ONNX Model ->Quantized Onnx Model* +Quantize and package the model for AMD NPU. -### Latency / Throughput +- Pipeline: *HuggingFace Model -> ONNX -> Quantized ONNX* +- Configuration File: `ms-marco-MiniLM-L-6-v2_qdq_amd.json` ([aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json](cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_amd.json)) -| Model Version | Latency (ms/sample) | Throughput (token per second)| Dataset | -|-----------------------|----------------------|------------------------------|---------------| -| PyTorch FP32 | 1162 | 0.81 | facebook/xnli | -| ONNX INT8 (QDQ) | 590 | 1.75 | facebook/xnli | +Key features: +- Produce VitisAI-ready artifacts for AMD NPU deployment. +- Uses `cross-encoder/ms-marco-MiniLM-L-6-v2` (pairwise scoring); inputs padded to 128 tokens. +- Calibration/evaluation with XNLI (en) "premise" (≤10 samples). +- ONNX export + transformer optimizations; static quantization to reduce latency. -*Note: Latency can vary significantly depending on the hardware and system environment. The values provided here are for reference only and may not reflect performance on all devices.* +## QDQ for Qualcomm NPU + +Quantize the model for Qualcomm NPU runtimes. + +- Pipeline: *HuggingFace Model -> ONNX -> Quantized ONNX* +- Configuration File: `ms-marco-MiniLM-L-6-v2_qdq_qnn.json` ([aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json](cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_qdq_qnn.json)) + +Key features: +- Target: Qualcomm NPU (QNN). 
+- Calibration/evaluation uses XNLI (en) "premise" (≤10 samples, pad to 128). +- ONNX export (opset 20), fix dynamic shapes to [1,128], apply light graph fusions, then static quantization (activations/weights tuned for QNN). + +## OpenVINO (Intel NPU) + +Convert and quantize the model for OpenVINO. + +- Pipeline: *HuggingFace Model -> ONNX -> OpenVINO quantized model* +- Configuration File: `ms-marco-MiniLM-L-6-v2_context_ov_static.json` ([aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json](cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_context_ov_static.json)) + +Key features: +- OpenVINO Optimum conversion and encapsulation for Intel NPU. +- Calibration uses Wikipedia (train, 300 samples) with a custom transform for transformer inputs. +- Enforces static I/O (three inputs of [1,128]) and applies transformer-specific quantization. + +## Float downcasting for NVIDIA TRT / DML + +FP16 export for GPU backends to improve throughput. + +- Pipeline: *HuggingFace Model -> ONNX -> FP16 ONNX* +- Configuration Files: + - TRT (RTX): `ms-marco-MiniLM-L-6-v2_trtrtx.json` ([aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json](cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_trtrtx.json)) + - DML: `ms-marco-MiniLM-L-6-v2_dml.json` ([aitk/ms-marco-MiniLM-L-6-v2_dml.json](cross-encoder-ms-marco-MiniLM-L-6-v2/aitk/ms-marco-MiniLM-L-6-v2_dml.json)) + +Key features: +- FP16 conversion for TensorRT and DML-optimized exports for GPU inference. +- Fixed-shape [1,128] inputs and transformer optimizations for stable latency/throughput. +- Evaluate latency and throughput on XNLI example inputs. + +## Dataset Information + +### Quantization Datasets +- **QNN/AMD NPU**: XNLI (English validation) "premise" (≤10 samples, padded to 128). +- **Intel NPU (OpenVINO)**: Wikipedia train (300 samples) with transformer-specific preprocessing. + +### Evaluation Datasets +- **Primary**: XNLI (English) validation for quick latency/throughput checks. 
+- **Metric**: Latency and throughput on XNLI example inputs; pairwise ranking scores where a ranking evaluation script is used.
+- **Benchmark**: Use task-appropriate benchmarks (MS MARCO for ranking quality, XNLI for the latency/throughput examples here).
+
+## Performance Evaluation Results
+The following results are based on comprehensive evaluation using standard latency and throughput benchmarks and performance metrics.
+
+### Qualcomm NPU (QNN) Performance
+
+| Metric | Value |
+|--------|-------|
+| **Latency (avg)** | 5.59 ms |
+| **Latency (min)** | 4.87 ms |
+| **Latency (max)** | 7.13 ms |
+| **Throughput (avg)** | 190.41 tokens/sec |
+| **Throughput (max)** | 206.31 tokens/sec |
+| **Throughput (min)** | 138.28 tokens/sec |
+
+### AMD NPU Performance
+
+| Metric | Value |
+|--------|-------|
+| **Latency (avg)** | 5.78 ms |
+| **Latency (min)** | 4.92 ms |
+| **Latency (max)** | 7.77 ms |
+| **Throughput (avg)** | 186.78 tokens/sec |
+| **Throughput (max)** | 229.40 tokens/sec |
+| **Throughput (min)** | 137.03 tokens/sec |
+
+### Intel NPU Performance
+
+| Metric | Value |
+|--------|-------|
+| **Latency (avg)** | 2.59 ms |
+| **Latency (p90)** | 3.26 ms |
+| **Similarity** | 0.9830 |
+
+### TRT Performance
+
+| Metric | Value |
+|--------|-------|
+| **Latency (avg)** | 0.63 ms |
+| **Latency (min)** | 0.59 ms |
+| **Latency (max)** | 0.72 ms |
+| **Throughput (avg)** | 780.33 tokens/sec |
+| **Throughput (max)** | 1557.39 tokens/sec |
+| **Throughput (min)** | 171.56 tokens/sec |
+
+### DML Performance
+
+| Metric | Value |
+|--------|-------|
+| **Latency (max)** | 2.02 ms |
+| **Latency (min)** | 1.34 ms |
+| **Throughput (avg)** | 684.74 tokens/sec |
+| **Throughput (max)** | 721.03 tokens/sec |
+| **Throughput (min)** | 
560.07 tokens/sec | + +## Notes +- Model: `cross-encoder/ms-marco-MiniLM-L-6-v2` (pairwise cross-encoder for ranking). +- Use the listed config files to reproduce conversion, quantization, and benchmark runs. \ No newline at end of file