From 7d7be8ad09407ad19e2845696f014a458924ebaf Mon Sep 17 00:00:00 2001 From: "Wang, Zhitao" Date: Thu, 26 Dec 2024 11:20:45 +0000 Subject: [PATCH] benchdnn: inputs: graph: add compressed sdpa with per-channel quant --- .../graph/complex_fusion/harness_mha_all | 1 + .../sdpa-compressed-k-int8-per-channel.json | 428 ++++++++++++++++++ 2 files changed, 429 insertions(+) create mode 100644 tests/benchdnn/inputs/graph/complex_fusion/mha/sdpa-compressed-k-int8-per-channel.json diff --git a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all index 9cb4c804799..d8af9799e24 100644 --- a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all +++ b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all @@ -45,6 +45,7 @@ --reset --dt=f32,bf16,f16 --in-shapes=0:acbd+1:acbd+8:acbd --case=complex_fusion/mha/sdpa-plain-simplified-f16.json --reset --dt=f32,bf16,f16 --in-shapes=3:384,3:384x384,3:1x16x384x384 --case=complex_fusion/mha/sdpa-plain-scale-by-mul-f16.json --reset --op-attrs=34107656704:group_shape:1x1x1x32+34107654464:transpose_b:1 --in-shapes=0:1x32x32x128+1:1x32x32x4+2:1x32x32x4 --case=complex_fusion/mha/sdpa-compressed-k-int8-gs32.json +--reset --op-attrs=34107656704:axis:2+34107654464:transpose_b:1 --in-shapes=0:1x32x32x128 --case=complex_fusion/mha/sdpa-compressed-k-int8-per-channel.json # Re-written int8 graphs --reset --in-shapes=5:4x16x32x256+4:4x16x256x33+0:4x16x33x256+1:4x1x1x33+3:4x1x32x33 --case=complex_fusion/mha/MHA-GPT-inf-int8-bs1.json diff --git a/tests/benchdnn/inputs/graph/complex_fusion/mha/sdpa-compressed-k-int8-per-channel.json b/tests/benchdnn/inputs/graph/complex_fusion/mha/sdpa-compressed-k-int8-per-channel.json new file mode 100644 index 00000000000..f83fc99733e --- /dev/null +++ b/tests/benchdnn/inputs/graph/complex_fusion/mha/sdpa-compressed-k-int8-per-channel.json @@ -0,0 +1,428 @@ +{ + "version": "3.7.0", + "engine_kind": "cpu", + "fpmath_mode": "f16", + 
"fpmath_mode_apply_to_int": "true", + "input_ports": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8 + ], + "output_ports": [ + 50 + ], + "graph": [ + { + "id": 34107656704, + "name": "aten::dequantize", + "kind": "DynamicDequantize", + "attrs": { + "qtype": { + "type": "string", + "value": "per_channel" + }, + "axis": { + "type": "s64", + "value": 3 + } + }, + "inputs": [ + { + "id": 0, + "dtype": "s8", + "shape": [ + 1, + 32, + 128, + 32 + ], + "stride": [ + 131072, + 4096, + 1, + 128 + ], + "layout_type": "strided", + "property_type": "variable" + }, + { + "id": 1, + "dtype": "f16", + "shape": [ + 32 + ], + "stride": [ + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 2, + "dtype": "s8", + "shape": [ + 1 + ], + "stride": [ + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 10, + "dtype": "f16", + "shape": [ + 1, + 32, + 128, + 32 + ], + "stride": [ + 131072, + 4096, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + } + ] + }, + { + "id": 34107654464, + "name": "aten::matmul", + "kind": "MatMul", + "attrs": { + "transpose_a": { + "type": "bool", + "value": 0 + }, + "transpose_b": { + "type": "bool", + "value": 0 + } + }, + "inputs": [ + { + "id": 3, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 128 + ], + "stride": [ + 131072, + 4096, + 128, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + }, + { + "id": 10, + "dtype": "f16", + "shape": [ + 1, + 32, + 128, + 32 + ], + "stride": [ + 131072, + 4096, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + } + ], + "outputs": [ + { + "id": 15, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 32 + ], + "stride": [ + 32768, + 1024, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + } + ] + }, + { + "id": 34107661824, + "name": "aten::div", + "kind": "Divide", + "attrs": { + "auto_broadcast": { + "type": "string", + "value": "numpy" + } + }, + 
"inputs": [ + { + "id": 15, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 32 + ], + "stride": [ + 32768, + 1024, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + }, + { + "id": 4, + "dtype": "f16", + "shape": [], + "stride": [], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 16, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 32 + ], + "stride": [ + 32768, + 1024, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + } + ] + }, + { + "id": 34106997632, + "name": "aten::add", + "kind": "Add", + "attrs": { + "auto_broadcast": { + "type": "string", + "value": "numpy" + } + }, + "inputs": [ + { + "id": 16, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 32 + ], + "stride": [ + 32768, + 1024, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + }, + { + "id": 5, + "dtype": "f16", + "shape": [ + 1, + 1, + 32, + 32 + ], + "stride": [ + 1024, + 1024, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + } + ], + "outputs": [ + { + "id": 18, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 32 + ], + "stride": [ + 32768, + 1024, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + } + ] + }, + { + "id": 34426356992, + "name": "aten::softmax", + "kind": "SoftMax", + "attrs": { + "axis": { + "type": "s64", + "value": 3 + } + }, + "inputs": [ + { + "id": 18, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 32 + ], + "stride": [ + 32768, + 1024, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + } + ], + "outputs": [ + { + "id": 27, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 32 + ], + "stride": [ + 32768, + 1024, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + } + ] + }, + { + "id": 34105676800, + "name": "aten::matmul", + "kind": "MatMul", + "attrs": { + "transpose_a": { + "type": "bool", + "value": 0 + }, + "transpose_b": { + "type": "bool", + "value": 0 + 
} + }, + "inputs": [ + { + "id": 27, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 32 + ], + "stride": [ + 32768, + 1024, + 32, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + }, + { + "id": 45, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 128 + ], + "stride": [ + 131072, + 4096, + 128, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + } + ], + "outputs": [ + { + "id": 50, + "dtype": "f16", + "shape": [ + 1, + 32, + 32, + 128 + ], + "stride": [ + 131072, + 4096, + 128, + 1 + ], + "layout_type": "strided", + "property_type": "variable" + } + ] + } + ] + } + + \ No newline at end of file