@@ -1023,6 +1023,272 @@ pub(crate) fn memcopy(
     Ok(inc_pc(pc)?)
 }
 
+#[cfg(feature = "experimental")]
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+fn slices_equal_neon(a: &[u8], b: &[u8]) -> bool {
+    use std::arch::aarch64::*;
+
+    if a.len() != b.len() {
+        return false;
+    }
+
+    let len = a.len();
+    let mut i = 0;
+    const CHUNK: usize = 96;
+
+    // For small slices the SIMD setup overhead outweighs any gain,
+    // so use the scalar comparison instead.
+    if a.len() < CHUNK {
+        return slices_equal_fallback(a, b);
+    }
+
+    unsafe {
+        while i + CHUNK <= len {
+            // Compare six 16-byte blocks and AND the per-byte equality
+            // masks together: a lane stays 0xFF only if every compared
+            // byte at that position matched.
+            let mut cmp =
+                vceqq_u8(vld1q_u8(a.as_ptr().add(i)), vld1q_u8(b.as_ptr().add(i)));
+
+            cmp = vandq_u8(
+                cmp,
+                vceqq_u8(
+                    vld1q_u8(a.as_ptr().add(i + 16)),
+                    vld1q_u8(b.as_ptr().add(i + 16)),
+                ),
+            );
+            cmp = vandq_u8(
+                cmp,
+                vceqq_u8(
+                    vld1q_u8(a.as_ptr().add(i + 32)),
+                    vld1q_u8(b.as_ptr().add(i + 32)),
+                ),
+            );
+            cmp = vandq_u8(
+                cmp,
+                vceqq_u8(
+                    vld1q_u8(a.as_ptr().add(i + 48)),
+                    vld1q_u8(b.as_ptr().add(i + 48)),
+                ),
+            );
+            cmp = vandq_u8(
+                cmp,
+                vceqq_u8(
+                    vld1q_u8(a.as_ptr().add(i + 64)),
+                    vld1q_u8(b.as_ptr().add(i + 64)),
+                ),
+            );
+            cmp = vandq_u8(
+                cmp,
+                vceqq_u8(
+                    vld1q_u8(a.as_ptr().add(i + 80)),
+                    vld1q_u8(b.as_ptr().add(i + 80)),
+                ),
+            );
+
+            // The chunk is equal only if every lane is still 0xFF,
+            // i.e. the minimum lane value is 0xFF.
+            if vminvq_u8(cmp) != 0xFF {
+                return false;
+            }
+
+            i += CHUNK;
+        }
+
+        // Scalar comparison for the remainder
+        a[i..] == b[i..]
+    }
+}
+
+#[cfg(feature = "experimental")]
+#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
+fn slices_equal_avx2(a: &[u8], b: &[u8]) -> bool {
+    use std::arch::x86_64::*;
+
+    if a.len() != b.len() {
+        return false;
+    }
+
+    let len = a.len();
+    let mut i = 0;
+    const CHUNK: usize = 256;
+
+    // For small slices the SIMD setup overhead outweighs any gain,
+    // so use the scalar comparison instead.
+    if a.len() < CHUNK {
+        return slices_equal_fallback(a, b);
+    }
+
+    unsafe {
+        let mut aggregate_mask_a = -1i32;
+        let mut aggregate_mask_b = -1i32;
+        let mut aggregate_mask_c = -1i32;
+        let mut aggregate_mask_d = -1i32;
+        let mut aggregate_mask_a_b = -1i32;
+        let mut aggregate_mask_c_d = -1i32;
+
+        while i + CHUNK <= len {
+            let simd_a1 = _mm256_loadu_si256(a.as_ptr().add(i) as *const _);
+            let simd_b1 = _mm256_loadu_si256(b.as_ptr().add(i) as *const _);
+
+            let simd_a2 = _mm256_loadu_si256(a.as_ptr().add(i + 32) as *const _);
+            let simd_b2 = _mm256_loadu_si256(b.as_ptr().add(i + 32) as *const _);
+
+            let simd_a3 = _mm256_loadu_si256(a.as_ptr().add(i + 64) as *const _);
+            let simd_b3 = _mm256_loadu_si256(b.as_ptr().add(i + 64) as *const _);
+
+            let simd_a4 = _mm256_loadu_si256(a.as_ptr().add(i + 96) as *const _);
+            let simd_b4 = _mm256_loadu_si256(b.as_ptr().add(i + 96) as *const _);
+
+            let simd_a5 = _mm256_loadu_si256(a.as_ptr().add(i + 128) as *const _);
+            let simd_b5 = _mm256_loadu_si256(b.as_ptr().add(i + 128) as *const _);
+
+            let simd_a6 = _mm256_loadu_si256(a.as_ptr().add(i + 160) as *const _);
+            let simd_b6 = _mm256_loadu_si256(b.as_ptr().add(i + 160) as *const _);
+
+            let simd_a7 = _mm256_loadu_si256(a.as_ptr().add(i + 192) as *const _);
+            let simd_b7 = _mm256_loadu_si256(b.as_ptr().add(i + 192) as *const _);
+
+            let simd_a8 = _mm256_loadu_si256(a.as_ptr().add(i + 224) as *const _);
+            let simd_b8 = _mm256_loadu_si256(b.as_ptr().add(i + 224) as *const _);
+
+            // Each movemask packs the 32 per-byte equality flags into an i32;
+            // a fully matching 32-byte block yields -1 (all bits set).
+            let cmp1 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a1, simd_b1));
+            let cmp2 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a2, simd_b2));
+            let cmp3 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a3, simd_b3));
+            let cmp4 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a4, simd_b4));
+            let cmp5 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a5, simd_b5));
+            let cmp6 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a6, simd_b6));
+            let cmp7 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a7, simd_b7));
+            let cmp8 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a8, simd_b8));
+
+            aggregate_mask_a &= cmp1 & cmp2;
+            aggregate_mask_b &= cmp3 & cmp4;
+            aggregate_mask_c &= cmp5 & cmp6;
+            aggregate_mask_d &= cmp7 & cmp8;
+
+            aggregate_mask_a_b &= aggregate_mask_a & aggregate_mask_b;
+            aggregate_mask_c_d &= aggregate_mask_c & aggregate_mask_d;
+
+            if aggregate_mask_a_b & aggregate_mask_c_d != -1i32 {
+                return false;
+            }
+
+            i += CHUNK;
+        }
+
+        // Scalar comparison for the remainder
+        a[i..] == b[i..]
+    }
+}
+
+#[cfg(feature = "experimental")]
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+fn slices_equal_avx512(a: &[u8], b: &[u8]) -> bool {
+    use std::arch::x86_64::*;
+
+    if a.len() != b.len() {
+        return false;
+    }
+
+    let len = a.len();
+    let mut i = 0;
+    const CHUNK: usize = 512;
+
+    // For small slices the SIMD setup overhead outweighs any gain,
+    // so use the scalar comparison instead.
+    if a.len() < CHUNK {
+        return slices_equal_fallback(a, b);
+    }
+
+    unsafe {
+        while i + CHUNK <= len {
+            let simd_a1 = _mm512_loadu_si512(a.as_ptr().add(i) as *const _);
+            let simd_b1 = _mm512_loadu_si512(b.as_ptr().add(i) as *const _);
+
+            let simd_a2 = _mm512_loadu_si512(a.as_ptr().add(i + 64) as *const _);
+            let simd_b2 = _mm512_loadu_si512(b.as_ptr().add(i + 64) as *const _);
+
+            let simd_a3 = _mm512_loadu_si512(a.as_ptr().add(i + 128) as *const _);
+            let simd_b3 = _mm512_loadu_si512(b.as_ptr().add(i + 128) as *const _);
+
+            let simd_a4 = _mm512_loadu_si512(a.as_ptr().add(i + 192) as *const _);
+            let simd_b4 = _mm512_loadu_si512(b.as_ptr().add(i + 192) as *const _);
+
+            let simd_a5 = _mm512_loadu_si512(a.as_ptr().add(i + 256) as *const _);
+            let simd_b5 = _mm512_loadu_si512(b.as_ptr().add(i + 256) as *const _);
+
+            let simd_a6 = _mm512_loadu_si512(a.as_ptr().add(i + 320) as *const _);
+            let simd_b6 = _mm512_loadu_si512(b.as_ptr().add(i + 320) as *const _);
+
+            let simd_a7 = _mm512_loadu_si512(a.as_ptr().add(i + 384) as *const _);
+            let simd_b7 = _mm512_loadu_si512(b.as_ptr().add(i + 384) as *const _);
+
+            let simd_a8 = _mm512_loadu_si512(a.as_ptr().add(i + 448) as *const _);
+            let simd_b8 = _mm512_loadu_si512(b.as_ptr().add(i + 448) as *const _);
+
+            // Each mask is a u64 with one bit per byte; a fully matching
+            // 64-byte block yields u64::MAX.
+            let cmp1 = _mm512_cmpeq_epi8_mask(simd_a1, simd_b1);
+            let cmp2 = _mm512_cmpeq_epi8_mask(simd_a2, simd_b2);
+            let cmp3 = _mm512_cmpeq_epi8_mask(simd_a3, simd_b3);
+            let cmp4 = _mm512_cmpeq_epi8_mask(simd_a4, simd_b4);
+            let cmp5 = _mm512_cmpeq_epi8_mask(simd_a5, simd_b5);
+            let cmp6 = _mm512_cmpeq_epi8_mask(simd_a6, simd_b6);
+            let cmp7 = _mm512_cmpeq_epi8_mask(simd_a7, simd_b7);
+            let cmp8 = _mm512_cmpeq_epi8_mask(simd_a8, simd_b8);
+
+            let cmp1_2 = cmp1 & cmp2;
+            let cmp3_4 = cmp3 & cmp4;
+            let cmp5_6 = cmp5 & cmp6;
+            let cmp7_8 = cmp7 & cmp8;
+
+            let cmp1_4 = cmp1_2 & cmp3_4;
+            let cmp5_8 = cmp5_6 & cmp7_8;
+
+            let full_cmp = cmp1_4 & cmp5_8;
+
+            if full_cmp != u64::MAX {
+                return false;
+            }
+
+            i += CHUNK;
+        }
+
+        // Scalar comparison for the remainder
+        a[i..] == b[i..]
+    }
+}
+
+#[inline]
+fn slices_equal_fallback(a: &[u8], b: &[u8]) -> bool {
+    a == b
+}
+
+#[inline]
+fn slice_eq(a: &[u8], b: &[u8]) -> bool {
+    #[cfg(feature = "experimental")]
+    {
+        #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+        {
+            return slices_equal_avx512(a, b);
+        }
+        #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
+        {
+            return slices_equal_avx2(a, b);
+        }
+        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+        {
+            return slices_equal_neon(a, b);
+        }
+
+        #[allow(unreachable_code)]
+        slices_equal_fallback(a, b)
+    }
+    #[cfg(not(feature = "experimental"))]
+    {
+        slices_equal_fallback(a, b)
+    }
+}
+
+#[test]
+fn slice_eq_test() {
+    let a = [1u8; 20000];
+    let b = [1u8; 20000];
+
+    assert!(slice_eq(&a, &b));
+}
+
 pub(crate) fn memeq(
     memory: &mut MemoryInstance,
     result: &mut Word,
@@ -1031,7 +1297,9 @@ pub(crate) fn memeq(
     c: Word,
     d: Word,
 ) -> SimpleResult<()> {
-    *result = (memory.read(b, d)? == memory.read(c, d)?) as Word;
+    let range_a = memory.read(b, d)?;
+    let range_b = memory.read(c, d)?;
+    *result = slice_eq(range_a, range_b) as Word;
     Ok(inc_pc(pc)?)
 }
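
The committed `slice_eq_test` only covers two identical buffers. A complementary test, sketched below, would also exercise the length-mismatch, in-loop, and scalar-remainder paths; the test name, buffer size, and byte positions are illustrative and not part of the change above.

#[test]
fn slice_eq_mismatch_and_tail() {
    // 1000 bytes is not a multiple of 96, 256, or 512, so every SIMD
    // variant must finish with the scalar remainder comparison.
    let a = [7u8; 1000];
    let mut b = [7u8; 1000];
    assert!(slice_eq(&a, &b));

    // Slices of different lengths are never equal.
    assert!(!slice_eq(&a[..999], &b));

    // A differing byte in the tail is caught by the remainder comparison.
    b[999] = 0;
    assert!(!slice_eq(&a, &b));

    // A differing byte early in the buffer is caught by whichever
    // comparison path is compiled in.
    b[999] = 7;
    b[1] = 0;
    assert!(!slice_eq(&a, &b));
}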
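
Note that gating the fast paths on `cfg(target_feature = ...)` means they are only compiled in when the crate is built with those features enabled (for example via `-C target-cpu=native`); a default build always ends up in `slices_equal_fallback`. A possible alternative is runtime dispatch, sketched below under that assumption; the `_rt` names are hypothetical and the AVX2 body is elided.

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn slices_equal_avx2_rt(a: &[u8], b: &[u8]) -> bool {
    // Same body as `slices_equal_avx2` above; elided in this sketch.
    slices_equal_fallback(a, b)
}

#[inline]
fn slice_eq_rt(a: &[u8], b: &[u8]) -> bool {
    #[cfg(target_arch = "x86_64")]
    {
        if std::is_x86_feature_detected!("avx2") {
            // SAFETY: the runtime check above guarantees AVX2 is available.
            return unsafe { slices_equal_avx2_rt(a, b) };
        }
    }
    slices_equal_fallback(a, b)
}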