 #include "graph/unit/utils.hpp"
 
 #include "backend/dnnl/dnnl_constant_tensor_cache.hpp"
+#include "oneapi/dnnl/dnnl_graph.hpp"
 
 namespace graph = dnnl::impl::graph;
 namespace utils = dnnl::graph::tests::unit::utils;
@@ -7968,6 +7969,19 @@ TEST(test_matmul_execute_subgraph_int8, ShareCachedWeight) {
     std::vector<std::vector<int64_t>> dst_shapes {{4, 4096}, {32, 4096}};
     std::vector<std::shared_ptr<graph::compiled_partition_t>> cps;
     size_t prv_cache_size = 0;
+
+    // Randomly generate the weight data (random seed = 7).
+    // The weight tensor is kept the same for the different src shapes.
+    std::default_random_engine generator(7);
+    std::uniform_real_distribution<float> s8_distribution(-127.0f, 128.0f);
+    std::vector<int8_t> weight_data(product(weight_shape));
+    std::generate(weight_data.begin(), weight_data.end(),
+            [&]() { return static_cast<int8_t>(s8_distribution(generator)); });
+    test_tensor weight_s8_ts(weight_s8, engine, weight_data);
+
+    // Set the constant tensor cache capacity to 1GB.
+    dnnl::graph::set_constant_tensor_cache_capacity(
+            static_cast<engine::kind>(engine->kind()), 1024);
     for (size_t i = 0; i < src_shapes.size(); ++i) {
         std::vector<int64_t> src_shape = src_shapes[i];
         std::vector<int64_t> dst_shape = dst_shapes[i];
@@ -7988,23 +8002,162 @@ TEST(test_matmul_execute_subgraph_int8, ShareCachedWeight) {
         cp.query_logical_tensor(dst_s8.id, &compiled_output);
 
         std::vector<uint8_t> src_data(product(src_shape));
-        std::vector<int8_t> weight_data(product(weight_shape));
 
-        // random generate src, weight and bias data random seed = 7
-        std::default_random_engine generator(7);
         std::uniform_real_distribution<float> u8_distribution(0.0f, 255.0f);
-        std::uniform_real_distribution<float> s8_distribution(-127.0f, 128.0f);
         std::generate(src_data.begin(), src_data.end(), [&]() {
             return static_cast<uint8_t>(u8_distribution(generator));
         });
+
+        test_tensor src_u8_ts(src_u8, engine, src_data);
+        test_tensor dst_s8_ts(compiled_output, engine);
+        ASSERT_EQ(cp.execute(strm, {src_u8_ts.get(), weight_s8_ts.get()},
+                          {dst_s8_ts.get()}),
+                graph::status::success);
+
+        size_t curr_cache_size = graph::get_constant_tensor_cache(
+                engine->kind(), engine->index())
+                        ->get_size();
+        if (i != 0) {
+            // The cache size should not change since no new weight is cached.
+            ASSERT_EQ(prv_cache_size, curr_cache_size);
+        }
+        prv_cache_size = curr_cache_size;
+
+        strm->wait();
+    }
+    // Reset the constant tensor cache capacity to 0.
+    dnnl::graph::set_constant_tensor_cache_capacity(
+            static_cast<engine::kind>(engine->kind()), 0);
+}
+
+TEST(test_matmul_execute_subgraph_int8, NoShareCachedWeight) {
+    graph::engine_t *engine = get_engine();
+    graph::stream_t *strm = get_stream();
+    std::string qtype = "per_channel";
+
+    std::vector<int64_t> weight_shape = {1024, 1024};
+
+    float scale_src = 1 / 255.f; // map to 0~255
+    float scale_out = 1;
+    int64_t zp_src = 0;
+    int64_t zp_out = engine->kind() == graph::engine_kind::gpu ? 0 : 78;
+
+    size_t scales_wei_sizes = weight_shape.back();
+    std::vector<float> scale_wei(scales_wei_sizes, 1 / 127.f);
+    std::vector<int64_t> zp_wei(scales_wei_sizes, 0);
+
+    graph::op_t dqdata_op(1, graph::op_kind::Dequantize, "dqdata_op");
+    dqdata_op.set_attr<std::string>(graph::op_attr::qtype, "per_tensor");
+    dqdata_op.set_attr<std::vector<int64_t>>(graph::op_attr::zps, {zp_src});
+    dqdata_op.set_attr<std::vector<float>>(graph::op_attr::scales, {scale_src});
+    dqdata_op.set_attr<int64_t>(graph::op_attr::axis, 0);
+
+    graph::op_t dqweight_op(2, graph::op_kind::Dequantize, "dqweight_op");
+    dqweight_op.set_attr<std::string>(graph::op_attr::qtype, "per_channel");
+    dqweight_op.set_attr<std::vector<int64_t>>(graph::op_attr::zps, zp_wei);
+    dqweight_op.set_attr<std::vector<float>>(graph::op_attr::scales, scale_wei);
+    dqweight_op.set_attr<int64_t>(graph::op_attr::axis, 1);
+
+    graph::op_t matmul_op(3, graph::op_kind::MatMul, "matmul_op");
+    matmul_op.set_attr<bool>(graph::op_attr::transpose_a, false);
+    matmul_op.set_attr<bool>(graph::op_attr::transpose_b, false);
+
+    graph::op_t qout_op(4, graph::op_kind::Quantize, "qout_op");
+    qout_op.set_attr<std::string>(graph::op_attr::qtype, "per_tensor");
+    qout_op.set_attr<std::vector<int64_t>>(graph::op_attr::zps, {zp_out});
+    qout_op.set_attr<std::vector<float>>(graph::op_attr::scales, {scale_out});
+    qout_op.set_attr<int64_t>(graph::op_attr::axis, 0);
+
+    // Prepare logical tensors.
+    auto src_u8 = utils::logical_tensor_init(1, graph::data_type::u8);
+    auto src_f32_dq = utils::logical_tensor_init(2, graph::data_type::f32);
+    auto weight_s8
+            = utils::logical_tensor_init(4, weight_shape, graph::data_type::s8);
+    weight_s8.property = graph::property_type::constant;
+    auto weight_f32_dq = utils::logical_tensor_init(
+            5, weight_shape, graph::data_type::f32);
+    auto dst_f32 = utils::logical_tensor_init(7, graph::data_type::f32);
+    auto dst_s8 = utils::logical_tensor_init(8, graph::data_type::s8);
+
+    dqdata_op.add_input(src_u8);
+    dqdata_op.add_output(src_f32_dq);
+
+    dqweight_op.add_input(weight_s8);
+    dqweight_op.add_output(weight_f32_dq);
+
+    matmul_op.add_input(src_f32_dq);
+    matmul_op.add_input(weight_f32_dq);
+    matmul_op.add_output(dst_f32);
+
+    qout_op.add_input(dst_f32);
+    qout_op.add_output(dst_s8);
+
+    graph::graph_t g(engine->kind());
+    g.add_op(&dqdata_op);
+    g.add_op(&dqweight_op);
+    g.add_op(&matmul_op);
+    g.add_op(&qout_op);
+    g.finalize();
+
+    graph::pass::pass_base_ptr apass = get_pass("x8x8x_matmul_post_ops");
+    apass->run(g);
+    ASSERT_EQ(g.get_num_partitions(), 1U);
+    auto part = g.get_partitions()[0];
+    ASSERT_EQ(part->get_ops().size(), 4U);
+
+    graph::partition_t p;
+    p.init(part);
+
+    std::vector<std::vector<int64_t>> src_shapes {{4, 1024}, {32, 1024}};
+    std::vector<std::vector<int64_t>> dst_shapes {{4, 1024}, {32, 1024}};
+    std::vector<std::shared_ptr<graph::compiled_partition_t>> cps;
+    size_t prv_cache_size = 0;
+
+    // Set the constant tensor cache capacity to 1GB.
+    dnnl::graph::set_constant_tensor_cache_capacity(
+            static_cast<engine::kind>(engine->kind()), 1024);
+
+    std::default_random_engine generator(7);
+    std::uniform_real_distribution<float> s8_distribution(-127.0f, 128.0f);
+    std::vector<int8_t> weight_data(product(weight_shape));
+    // Construct weight tensor objects with different memory addresses.
+    std::vector<test_tensor> weight_s8_ts_vec;
+    for (size_t i = 0; i < src_shapes.size(); i++) {
         std::generate(weight_data.begin(), weight_data.end(), [&]() {
             return static_cast<int8_t>(s8_distribution(generator));
         });
+        weight_s8_ts_vec.emplace_back(
+                test_tensor(weight_s8, engine, weight_data));
+    }
+
+    for (size_t i = 0; i < src_shapes.size(); ++i) {
+        std::vector<int64_t> src_shape = src_shapes[i];
+        std::vector<int64_t> dst_shape = dst_shapes[i];
+
+        src_u8 = utils::logical_tensor_init(1, src_shape, graph::data_type::u8);
+        dst_s8 = utils::logical_tensor_init(8, dst_shape, graph::data_type::s8);
+        std::vector<const graph::logical_tensor_t *> lt_ins {
+                &src_u8, &weight_s8};
+        std::vector<const graph::logical_tensor_t *> lt_outs {&dst_s8};
+
+        cps.push_back(std::make_shared<graph::compiled_partition_t>(p));
+        auto &cp = *cps.back();
+
+        ASSERT_EQ(p.compile(&cp, lt_ins, lt_outs, engine),
+                graph::status::success);
+
+        graph::logical_tensor_t compiled_output;
+        cp.query_logical_tensor(dst_s8.id, &compiled_output);
+
+        std::vector<uint8_t> src_data(product(src_shape));
+        std::uniform_real_distribution<float> u8_distribution(0.0f, 255.0f);
+        std::generate(src_data.begin(), src_data.end(), [&]() {
+            return static_cast<uint8_t>(u8_distribution(generator));
+        });
 
         test_tensor src_u8_ts(src_u8, engine, src_data);
-        test_tensor weight_s8_ts(weight_s8, engine, weight_data);
         test_tensor dst_s8_ts(compiled_output, engine);
-        ASSERT_EQ(cp.execute(strm, {src_u8_ts.get(), weight_s8_ts.get()},
+        ASSERT_EQ(cp.execute(strm, {src_u8_ts.get(), weight_s8_ts_vec[i].get()},
                           {dst_s8_ts.get()}),
                 graph::status::success);
 
@@ -8013,11 +8166,16 @@ TEST(test_matmul_execute_subgraph_int8, ShareCachedWeight) {
                         ->get_size();
 
         if (i != 0) {
-            // cache size should not change since no new weight cached
-            ASSERT_EQ(prv_cache_size, curr_cache_size);
+            // The cache size changes because each weight tensor has a
+            // different address and is cached as a new entry.
+            ASSERT_NE(prv_cache_size, curr_cache_size);
         }
         prv_cache_size = curr_cache_size;
 
         strm->wait();
     }
+
+    // Reset the constant tensor cache capacity to 0.
+    dnnl::graph::set_constant_tensor_cache_capacity(
+            static_cast<engine::kind>(engine->kind()), 0);
 }
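
For reference, the tests above drive the constant tensor cache through the public dnnl::graph::set_constant_tensor_cache_capacity() API pulled in by the new oneapi/dnnl/dnnl_graph.hpp include, and inspect the cache size through the internal graph::get_constant_tensor_cache() helper. Below is a minimal standalone sketch of the public knob only; it assumes a CPU engine, assumes the capacity argument is in megabytes (the tests' comments treat 1024 as 1GB), and elides the graph/partition setup the tests perform.

#include "oneapi/dnnl/dnnl.hpp"
#include "oneapi/dnnl/dnnl_graph.hpp"

int main() {
    // Assumption: CPU engine; the tests derive the kind from the test engine.
    const dnnl::engine::kind ekind = dnnl::engine::kind::cpu;

    // Allow cached constant tensors (e.g. pre-processed weights of compiled
    // partitions whose weight logical tensors are marked constant) to occupy
    // up to 1024 capacity units, mirroring the 1GB used by the tests.
    dnnl::graph::set_constant_tensor_cache_capacity(ekind, 1024);

    // ... build a graph, compile partitions, and execute them here; repeated
    // executions that reuse the same constant weight buffer can hit the
    // cache, while distinct weight buffers are cached as separate entries ...

    // Setting the capacity back to 0 disables caching and releases the
    // cached constant tensors, as both tests do before returning.
    dnnl::graph::set_constant_tensor_cache_capacity(ekind, 0);
    return 0;
}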