diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index 76dc4e99..ca6ffeda 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,13 +1,13 @@ { - "configCheck": 76, + "configCheck": 81, "extensionCheck": 1, - "gitignoreCheck": 32, + "gitignoreCheck": 33, "inferenceModelCheck": 22, - "ipynbCheck": 51, - "modelProjectCheck": 33, + "ipynbCheck": 52, + "modelProjectCheck": 34, "oliveCheck": 0, - "oliveJsonCheck": 76, - "pathCheck": 748, - "requirementsCheck": 32, + "oliveJsonCheck": 81, + "pathCheck": 816, + "requirementsCheck": 33, "venvRequirementsCheck": 12 } diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index 60be707d..d1b89e24 100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -82,6 +82,25 @@ "relativePath": "google-bert-bert-base-multilingual-cased/aitk", "version": 1 }, + { + "displayName": "google-bert/bert-base-multilingual-uncased", + "icon": "gemini", + "modelLink": "https://huggingface.co/google-bert/bert-base-multilingual-uncased", + "id": "huggingface/google-bert/bert-base-multilingual-uncased", + "runtimes": [ + "QNN", + "AMDNPU", + "NvidiaTRTRTX", + "DML", + "IntelCPU", + "IntelGPU", + "IntelNPU" + ], + "architecture": "Transformer", + "status": "Ready", + "relativePath": "google-bert-bert-base-multilingual-uncased/aitk", + "version": 1 + }, { "displayName": "google/vit-base-patch16-224", "icon": "gemini", diff --git a/.gitignore b/.gitignore index 0a197900..8d9bb37e 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,6 @@ cython_debug/ # PyPI configuration file .pypirc + +#VS +/.vs diff --git a/google-bert-bert-base-multilingual-uncased/LICENSE b/google-bert-bert-base-multilingual-uncased/LICENSE new file mode 100644 index 00000000..29f81d81 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, 
REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/google-bert-bert-base-multilingual-uncased/aitk/.gitignore b/google-bert-bert-base-multilingual-uncased/aitk/.gitignore new file mode 100644 index 00000000..48c03882 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/.gitignore @@ -0,0 +1,5 @@ +__pycache__ +/cache +/history/*/* +!/history/*/history.config +!/history/*/olive_config.json diff --git a/google-bert-bert-base-multilingual-uncased/aitk/README.md b/google-bert-bert-base-multilingual-uncased/aitk/README.md new file mode 100644 index 00000000..66eaad31 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/README.md @@ -0,0 +1,27 @@ +# BERT Optimization + +This folder contains examples of BERT optimization using different workflows. 
+ +- QDQ for Qualcomm NPU / AMD NPU +- OpenVINO for Intel NPU +- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU + +## QDQ for Qualcomm NPU / AMD NPU + +This workflow quantizes the model. It performs the pipeline: +- *HF Model -> ONNX Model -> Quantized ONNX Model* + +### Latency / Throughput + +| EP | Latency (ms/sample) | Throughput (tokens per second)| Dataset | +|-----------------------|----------------------|------------------------------|---------------| +| QNN | 11.17 | 58.51 | facebook/xnli | +| Intel NPU | 4.80 | | wikipedia | +| Intel GPU | 3.00 | | wikipedia | +| Intel CPU | 4.80 | | wikipedia | +| AMD NPU | 11.98 | 87.37 | facebook/xnli | +| NVIDIA TRT | 2.34 | 507.45 | facebook/xnli | +| DirectML | 13.73 | 149.38 | facebook/xnli | + + +*Note: Latency can vary significantly depending on the hardware and system environment. The values provided here are for reference only and may not reflect performance on all devices.* diff --git a/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_context_ov_static.json b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_context_ov_static.json new file mode 100644 index 00000000..1c3c1677 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_context_ov_static.json @@ -0,0 +1,97 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google-bert/bert-base-multilingual-uncased", + "task": "fill-mask" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "OpenVINOExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantize_data_config", + "user_script": "user_script.py", + "load_dataset_config": { + "type": "bert_base_multilingual_uncased_dataset", + "data_name": "wikipedia", + "split": "train", + 
"max_samples": 300 + }, + "dataloader_config": { + "batch_size": 1, + "drop_last": true + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [ + { "name": "avg", "priority": 1, "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } } + ] + } + ] + } + }, + "passes": { + "optimum_convert": { + "type": "OpenVINOOptimumConversion", + "extra_args": { + "device": "npu", + "task": "feature-extraction" + } + }, + "io_update": { + "type": "OpenVINOIoUpdate", + "input_shapes": [ + [ + 1, + 128 + ], + [ + 1, + 128 + ], + [ + 1, + 128 + ] + ], + "static": true + }, + "ov_quantize": { + "type": "OpenVINOQuantization", + "target_device": "npu", + "data_config": "quantize_data_config", + "model_type": "TRANSFORMER", + "user_script": "user_script.py", + "transform_fn": "custom_transform_func" + }, + "encapsulation": { + "type": "OpenVINOEncapsulation", + "target_device": "npu", + "ov_version": "2025.1" + } + }, + "search_strategy": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "output_dir": "model/bert-base-multilingual-uncased_context_ov_static" +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_context_ov_static.json.config b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_context_ov_static.json.config new file mode 100644 index 00000000..8eaf4ef5 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_context_ov_static.json.config @@ -0,0 +1,181 @@ +{ + "name": "Convert to Intel CPU/NPU/GPU", + "isIntel": true, + "debugInfo": { + "autoGenerated": true, + "useOpenVINOOptimumConversion": "optimum_convert" + }, + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate 
on", + "type": "enum", + "displayNames": [ + "Intel CPU", + "Intel GPU", + "Intel NPU" + ], + "path": "systems.local_system.accelerators.0.device", + "values": [ + "cpu", + "gpu", + "npu" + ], + "readOnly": false + }, + "runtimeInConversion": { + "autoGenerated": true, + "name": "Convert/Quantize to", + "type": "enum", + "displayNames": [ + "Intel CPU", + "Intel GPU", + "Intel NPU" + ], + "path": "passes.optimum_convert.extra_args.device", + "values": [ + "cpu", + "gpu", + "npu" + ], + "actions": [ + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "cpu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "cpu" + } + ], + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "gpu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "gpu" + } + ], + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "npu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "npu" + } + ] + ] + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.optimum_convert", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "wikipedia" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "wikipedia" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + 
"values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].load_dataset_config.max_samples", + "template": { + "path": "data_configs[0].load_dataset_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.optimum_convert", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_dml.json b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_dml.json new file mode 100644 index 00000000..10985a12 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_dml.json @@ -0,0 +1,139 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google-bert/bert-base-multilingual-uncased", + "task": "feature-extraction" + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + }, + "target_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + 
"padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + }, + "transformer_optimizer": { + "type": "OrtTransformersOptimization", + "model_type": "bert", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": false, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": false, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "enable_rotary_embeddings": true + }, + "save_as_external_data": true + } + }, + "host": "host_system", + "target": "target_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert", + "evaluate_input_model": false +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_dml.json.config 
b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_dml.json.config new file mode 100644 index 00000000..7ee69518 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_dml.json.config @@ -0,0 +1,123 @@ +{ + "name": "Convert to DirectML", + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "DirectML" + ], + "path": "systems.target_system.accelerators.0.execution_providers.0", + "values": [ + "DmlExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": 
"data_configs[0].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_amd.json b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_amd.json new file mode 100644 index 00000000..237b540c --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_amd.json @@ -0,0 +1,168 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google-bert/bert-base-multilingual-uncased", + "task": "feature-extraction" + }, + "systems": { + "qnn_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "VitisAIExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantization_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + }, + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": 
{ + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "transformer_optimizer": { + "type": "orttransformersoptimization", + "model_type": "bert", + "opt_level": 1, + "optimization_options": { + "enable_gelu": true, + "enable_bias_gelu": false, + "enable_layer_norm": true, + "enable_skip_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_attention": false + }, + "save_as_external_data": true + }, + "dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ] + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue" + } + ] + }, + "OnnxQuantization": { + "type": "OnnxStaticQuantization", + "data_config": "quantization_data_config", + "activation_type": "uint16", + "precision": "uint8", + "save_as_external_data": true + }, + "addmetadata": { + "type": "VitisAIAddMetaData", + "config_meta_data_keys": [ + "architectures", + "model_type" + ], + "activation_type": "uint16", + "weight_type": "uint8", + "quant_type": "OnnxStaticQuantization" + } + }, + "host": "qnn_system", + "target": "qnn_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert", + "evaluate_input_model": false +} diff --git 
a/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_amd.json.config b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_amd.json.config new file mode 100644 index 00000000..1bb2fe11 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_amd.json.config @@ -0,0 +1,272 @@ +{ + "name": "Convert to AMD NPU", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "AMD NPU", + "CPU" + ], + "path": "systems.qnn_system.accelerators.0.execution_providers.0", + "values": [ + "VitisAIExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Activation Type", + "tags": [ + "ActivationType" + ], + "description": "Quantization data type of activation. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.activation_type", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.activation_type", + "template": "ActivationType" + } + }, + { + "name": "Weight Type", + "tags": [ + "WeightType" + ], + "description": "Data type for quantizing weights. 
‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.precision", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.precision", + "template": "WeightType" + } + }, + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Subset", + "tags": [ + "QuantizationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "QuantizationDatasetSubset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.OnnxQuantization", + 
"actions": [ + [], + [ + { + "type": "update", + "path": "passes", + "value": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + } + } + } + ] + ] + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[1].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_qnn.json 
b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_qnn.json new file mode 100644 index 00000000..3fe6a3a6 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_qnn.json @@ -0,0 +1,163 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google-bert/bert-base-multilingual-uncased", + "task": "feature-extraction" + }, + "systems": { + "qnn_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantization_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + }, + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + 
"save_as_external_data": true + }, + "to_fixed_shape": { + "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ] + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue", + "replacement": -100.0 + }, + { + "surgeon": "MatMulAddToGemm" + } + ] + }, + "transformer_optimizer": { + "type": "OrtTransformersOptimization", + "model_type": "bert", + "opt_level": 1, + "optimization_options": { + "enable_gelu": true, + "enable_bias_gelu": false, + "enable_layer_norm": true, + "enable_skip_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_attention": false + }, + "save_as_external_data": true + }, + "OnnxQuantization": { + "type": "OnnxStaticQuantization", + "data_config": "quantization_data_config", + "quant_preprocess": true, + "activation_type": "uint16", + "precision": "uint8", + "save_as_external_data": true + } + }, + "host": "qnn_system", + "target": "qnn_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert", + "evaluate_input_model": false +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_qnn.json.config b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_qnn.json.config new file mode 100644 index 00000000..42122d31 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_qdq_qnn.json.config @@ -0,0 +1,272 @@ +{ + "name": "Convert to Qualcomm NPU", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "Qualcomm NPU", + "CPU" + ], + "path": "systems.qnn_system.accelerators.0.execution_providers.0", + "values": [ + "QNNExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + 
"toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Activation Type", + "tags": [ + "ActivationType" + ], + "description": "Quantization data type of activation. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.activation_type", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.activation_type", + "template": "ActivationType" + } + }, + { + "name": "Weight Type", + "tags": [ + "WeightType" + ], + "description": "Data type for quantizing weights. 
‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.precision", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.precision", + "template": "WeightType" + } + }, + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Subset", + "tags": [ + "QuantizationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "QuantizationDatasetSubset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.OnnxQuantization", + 
"actions": [ + [], + [ + { + "type": "update", + "path": "passes", + "value": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + } + } + } + ] + ] + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[1].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_trtrtx.json 
b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_trtrtx.json new file mode 100644 index 00000000..220fd8c2 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_trtrtx.json @@ -0,0 +1,128 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google-bert/bert-base-multilingual-uncased", + "task": "feature-extraction" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "NvTensorRTRTXExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "xnli", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "xnli", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "xnli", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "onnx_float_to_float16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true + }, + "dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ] + }, + "surgery": { + "type": "GraphSurgeries", + "save_as_external_data": true, + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue" + } + ] + 
}, + "session_params_tuning": { + "type": "OrtSessionParamsTuning", + "io_bind": false, + "data_config": "xnli" + } + }, + "host": "local_system", + "target": "local_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert_trtrtx", + "log_severity_level": 0, + "evaluate_input_model": false +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_trtrtx.json.config b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_trtrtx.json.config new file mode 100644 index 00000000..48dcc460 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/bert-base-multilingual-uncased_trtrtx.json.config @@ -0,0 +1,124 @@ +{ + "name": "Convert to NVIDIA TRT for RTX", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "NVIDIA TensorRT for RTX", + "CPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "NvTensorRTRTXExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + 
"values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/inference_sample.ipynb b/google-bert-bert-base-multilingual-uncased/aitk/inference_sample.ipynb new file mode 100644 index 00000000..fdb385a6 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/inference_sample.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "\n", + "ExecutionProvider=\"QNNExecutionProvider\"\n", + "if ExecutionProvider == \"OpenVINOExecutionProvider\":\n", + " onnx_model_path = \"./model/openvino_model_st_quant.onnx\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = \"This is an example sentence.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as 
ort\n", + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "from transformers import AutoModel, AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def mean_pooling(model_output, attention_mask):\n", + " token_embeddings = torch.tensor(model_output[0])\n", + " input_mask_expanded = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()\n", + " return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-multilingual-uncased')\n", + "encoded_input = tokenizer(\n", + " inputs,\n", + " padding=\"max_length\",\n", + " max_length=128,\n", + " truncation=True,\n", + " add_special_tokens=True,\n", + " return_tensors=\"pt\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "session = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "input_ids = encoded_input[\"input_ids\"]\n", + "attention_mask = encoded_input[\"attention_mask\"]\n", + "token_type_ids = encoded_input[\"token_type_ids\"]\n", + 
"inputs = {\n", + " \"input_ids\": input_ids.long().cpu().numpy(),\n", + " \"attention_mask\": attention_mask.long().cpu().numpy(),\n", + " \"token_type_ids\": token_type_ids.long().cpu().numpy()\n", + "}\n", + "\n", + "outputs = session.run(None, inputs)\n", + "embeds_1 = mean_pooling(outputs, encoded_input['attention_mask'])\n", + "embeds_1 = F.normalize(embeds_1, p=2, dim=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get text embedding from orinal model, as ground truth.\n", + "model = AutoModel.from_pretrained('google-bert/bert-base-multilingual-uncased').eval()\n", + "with torch.no_grad():\n", + " outputs = model(**encoded_input)\n", + " embeds_2 = mean_pooling(outputs, encoded_input['attention_mask'])\n", + " embeds_2 = F.normalize(embeds_2, p=2, dim=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "similarity = F.cosine_similarity(embeds_1, embeds_2).item()\n", + "print(\"Similarity: \", similarity)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/info.yml b/google-bert-bert-base-multilingual-uncased/aitk/info.yml new file mode 100644 index 00000000..5745414f --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/info.yml @@ -0,0 +1,26 @@ +keywords: + aitk +arch: bert +recipes: + - file: "bert-base-multilingual-uncased_qdq_qnn.json" + device: npu + ep: QNNExecutionProvider + - file: "bert-base-multilingual-uncased_qdq_amd.json" + device: npu + ep: 
VitisAIExecutionProvider + - file: "bert-base-multilingual-uncased_context_ov_static.json" + devices: + - npu + - cpu + - gpu + ep: OpenVINOExecutionProvider + - file: "bert-base-multilingual-uncased_trtrtx.json" + device: gpu + ep: NvTensorRTRTXExecutionProvider + - file: "bert-base-multilingual-uncased_dml.json" + device: gpu + ep: DmlExecutionProvider +aitk: + modelInfo: + id: "huggingface/google-bert/bert-base-multilingual-uncased" + version: 1 diff --git a/google-bert-bert-base-multilingual-uncased/aitk/model_project.config b/google-bert-bert-base-multilingual-uncased/aitk/model_project.config new file mode 100644 index 00000000..582ddb31 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/model_project.config @@ -0,0 +1,28 @@ +{ + "workflows": [ + { + "file": "bert-base-multilingual-uncased_qdq_qnn.json", + "templateName": "bert-base-multilingual-uncased_qdq_qnn" + }, + { + "file": "bert-base-multilingual-uncased_qdq_amd.json", + "templateName": "bert-base-multilingual-uncased_qdq_amd" + }, + { + "file": "bert-base-multilingual-uncased_context_ov_static.json", + "templateName": "bert-base-multilingual-uncased_context_ov_static" + }, + { + "file": "bert-base-multilingual-uncased_trtrtx.json", + "templateName": "bert-base-multilingual-uncased_trtrtx" + }, + { + "file": "bert-base-multilingual-uncased_dml.json", + "templateName": "bert-base-multilingual-uncased_dml" + } + ], + "modelInfo": { + "id": "huggingface/google-bert/bert-base-multilingual-uncased", + "version": 1 + } +} diff --git a/google-bert-bert-base-multilingual-uncased/aitk/requirements.txt b/google-bert-bert-base-multilingual-uncased/aitk/requirements.txt new file mode 100644 index 00000000..db86d4b4 --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/requirements.txt @@ -0,0 +1,4 @@ +# This file will be installed together with AITK runtime requirements +# For the full requirements, see AITK +olive-ai +datasets diff --git 
a/google-bert-bert-base-multilingual-uncased/aitk/user_script.py b/google-bert-bert-base-multilingual-uncased/aitk/user_script.py new file mode 100644 index 00000000..0c55c0ff --- /dev/null +++ b/google-bert-bert-base-multilingual-uncased/aitk/user_script.py @@ -0,0 +1,83 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Intel Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import datasets +import numpy as np +import torch +from transformers import BertTokenizer + +from olive.data.registry import Registry + +# ------------------------------------------------------------------------- +# Common Dataset +# ------------------------------------------------------------------------- + +seed = 0 +# seed everything to 0 for reproducibility, https://pytorch.org/docs/stable/notes/randomness.html +# do not set random seed and np.random.seed for aml test, since it will cause aml job name conflict +torch.manual_seed(seed) +# the following are needed only for GPU +torch.cuda.manual_seed(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +# set max sequence length +MAX_SEQ_LENGTH = 128 + +# define the tokenizer +tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-multilingual-uncased") +VOCAB_SIZE = len(tokenizer) + +# set default input +default_input = torch.ones(1, MAX_SEQ_LENGTH, dtype=torch.int64) + +# define model inputs +model_inputs = { + "input_ids": default_input, + "attention_mask": default_input, + "token_type_ids": default_input, +} + +# capture input names +INPUT_NAMES = list(model_inputs) + + +@Registry.register_dataset() +def bert_base_multilingual_uncased_dataset(data_name, split, max_samples): + # load the raw wikipedia dataset for tuning. Load just max_samples examples for speed. 
+ raw_dataset = datasets.load_dataset(data_name, "20220301.en", split=f"{split}[:{max_samples}]", trust_remote_code=True) + + def _preprocess_fn(examples): + return tokenizer( + examples["text"], + padding="max_length", + max_length=MAX_SEQ_LENGTH, + truncation=True, + ) + + # preprocess the dataset + return raw_dataset.map(_preprocess_fn, batched=True, batch_size=1) + + +def custom_transform_func(data_item): + return { + name: np.asarray([np.array([g.flatten() for g in data_item[name]]).flatten()], dtype=np.int64) + for name in INPUT_NAMES + } + + +def custom_example_func(): + vocab_size = VOCAB_SIZE + batch_size = 1 + sequence_length = MAX_SEQ_LENGTH + + input_ids = torch.randint(0, vocab_size, (batch_size, sequence_length)) + + # Use the all-ones default attention_mask (every position treated as a real token, no padding) + attention_mask = default_input + + # Use the all-ones default token_type_ids (token type id 1 for every position) + token_type_ids = default_input + + return [input_ids, attention_mask, token_type_ids]