Skip to content

Commit f1b162f

Browse files
committed
feat(meq): use portable_simd to improve performance of meq
1 parent 9e3a4ea commit f1b162f

File tree

4 files changed

+371
-2
lines changed

4 files changed

+371
-2
lines changed

fuel-vm/Cargo.toml

+6
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ name = "execution"
1515
harness = false
1616
required-features = ["std"]
1717

18+
[[bench]]
19+
name = "meq_performance"
20+
harness = false
21+
required-features = ["std"]
22+
1823
[dependencies]
1924
anyhow = { version = "1.0", optional = true }
2025
async-trait = "0.1"
@@ -110,6 +115,7 @@ test-helpers = [
110115
"tai64",
111116
"fuel-crypto/test-helpers",
112117
]
118+
experimental = []
113119

114120
[lints.rust]
115121
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(fuzzing)'] }

fuel-vm/benches/meq_performance.rs

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
use criterion::{
2+
criterion_group,
3+
criterion_main,
4+
Criterion,
5+
};
6+
use fuel_asm::{
7+
op,
8+
Instruction,
9+
RegId,
10+
};
11+
use fuel_tx::{
12+
Finalizable,
13+
GasCosts,
14+
Script,
15+
TransactionBuilder,
16+
};
17+
use fuel_types::{
18+
Immediate12,
19+
Word,
20+
};
21+
use fuel_vm::{
22+
interpreter::{
23+
Interpreter,
24+
InterpreterParams,
25+
},
26+
prelude::{
27+
IntoChecked,
28+
MemoryInstance,
29+
MemoryStorage,
30+
},
31+
};
32+
33+
/// from; fuel-vm/src/tests/test_helpers.rs
34+
/// Set a register `r` to a Word-sized number value using left-shifts
35+
pub fn set_full_word(r: RegId, v: Word) -> Vec<Instruction> {
36+
let r = r.to_u8();
37+
let mut ops = vec![op::movi(r, 0)];
38+
for byte in v.to_be_bytes() {
39+
ops.push(op::ori(r, r, byte as Immediate12));
40+
ops.push(op::slli(r, r, 8));
41+
}
42+
ops.pop().unwrap(); // Remove last shift
43+
ops
44+
}
45+
46+
fn meq_performance(c: &mut Criterion) {
47+
let benchmark_matrix = [
48+
1, 10, 100, 1000, 10_000, 50_000, 100_000, 500_000, 1_000_000, 2_000_000,
49+
2_500_000, 5_000_000, 10_000_000, 15_000_000, 20_000_000,
50+
// some exact multiples of 8 to verify alignment perf
51+
8, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072,
52+
262144, 524288, 1048576, 2097152, 4194304, 8388608,
53+
];
54+
55+
for size in benchmark_matrix.iter() {
56+
let mut interpreter = Interpreter::<_, _, Script>::with_storage(
57+
MemoryInstance::new(),
58+
MemoryStorage::default(),
59+
InterpreterParams {
60+
gas_costs: GasCosts::free(),
61+
..Default::default()
62+
},
63+
);
64+
65+
let reg_len = RegId::new_checked(0x13).unwrap();
66+
67+
let mut script = set_full_word(reg_len, *size as Word);
68+
script.extend(vec![
69+
op::cfe(0x13),
70+
op::meq(RegId::WRITABLE, RegId::ZERO, RegId::ZERO, reg_len),
71+
op::jmpb(RegId::ZERO, 0),
72+
]);
73+
74+
let tx_builder_script =
75+
TransactionBuilder::script(script.into_iter().collect(), vec![])
76+
.max_fee_limit(0)
77+
.add_fee_input()
78+
.finalize();
79+
let script = tx_builder_script
80+
.into_checked_basic(Default::default(), &Default::default())
81+
.unwrap();
82+
let script = script.test_into_ready();
83+
84+
interpreter.init_script(script).unwrap();
85+
86+
c.bench_function(&format!("meq_performance_{}", size), |b| {
87+
b.iter(|| {
88+
interpreter.execute().unwrap();
89+
});
90+
});
91+
}
92+
}
93+
94+
criterion_group!(benches, meq_performance);
95+
criterion_main!(benches);

fuel-vm/src/interpreter/memory.rs

+269-1
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,272 @@ pub(crate) fn memcopy(
10231023
Ok(inc_pc(pc)?)
10241024
}
10251025

1026+
#[cfg(feature = "experimental")]
1027+
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
1028+
fn slices_equal_neon(a: &[u8], b: &[u8]) -> bool {
1029+
use std::arch::aarch64::*;
1030+
1031+
if a.len() != b.len() {
1032+
return false;
1033+
}
1034+
1035+
let len = a.len();
1036+
let mut i = 0;
1037+
const CHUNK: usize = 96;
1038+
1039+
// if the slices are small, we don't need to
1040+
// use SIMD instructions due to overhead
1041+
if a.len() < CHUNK {
1042+
return slices_equal_fallback(a, b);
1043+
}
1044+
1045+
unsafe {
1046+
while i + CHUNK <= len {
1047+
let mut cmp =
1048+
vceqq_u8(vld1q_u8(a.as_ptr().add(i)), vld1q_u8(b.as_ptr().add(i)));
1049+
1050+
cmp = vandq_u8(
1051+
cmp,
1052+
vceqq_u8(
1053+
vld1q_u8(a.as_ptr().add(i + 16)),
1054+
vld1q_u8(b.as_ptr().add(i + 16)),
1055+
),
1056+
);
1057+
cmp = vandq_u8(
1058+
cmp,
1059+
vceqq_u8(
1060+
vld1q_u8(a.as_ptr().add(i + 32)),
1061+
vld1q_u8(b.as_ptr().add(i + 32)),
1062+
),
1063+
);
1064+
cmp = vandq_u8(
1065+
cmp,
1066+
vceqq_u8(
1067+
vld1q_u8(a.as_ptr().add(i + 48)),
1068+
vld1q_u8(b.as_ptr().add(i + 48)),
1069+
),
1070+
);
1071+
cmp = vandq_u8(
1072+
cmp,
1073+
vceqq_u8(
1074+
vld1q_u8(a.as_ptr().add(i + 64)),
1075+
vld1q_u8(b.as_ptr().add(i + 64)),
1076+
),
1077+
);
1078+
cmp = vandq_u8(
1079+
cmp,
1080+
vceqq_u8(
1081+
vld1q_u8(a.as_ptr().add(i + 80)),
1082+
vld1q_u8(b.as_ptr().add(i + 80)),
1083+
),
1084+
);
1085+
1086+
if vmaxvq_u8(cmp) != 0xFF {
1087+
return false;
1088+
}
1089+
1090+
i += CHUNK;
1091+
}
1092+
1093+
// Scalar comparison for the remainder
1094+
a[i..] == b[i..]
1095+
}
1096+
}
1097+
1098+
#[cfg(feature = "experimental")]
1099+
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
1100+
fn slices_equal_avx2(a: &[u8], b: &[u8]) -> bool {
1101+
use std::arch::x86_64::*;
1102+
1103+
if a.len() != b.len() {
1104+
return false;
1105+
}
1106+
1107+
let len = a.len();
1108+
let mut i = 0;
1109+
const CHUNK: usize = 256;
1110+
1111+
// if the slices are small, we don't need to
1112+
// use SIMD instructions due to overhead
1113+
if a.len() < CHUNK {
1114+
return slices_equal_fallback(a, b);
1115+
}
1116+
1117+
unsafe {
1118+
let mut aggregate_mask_a = -1i32;
1119+
let mut aggregate_mask_b = -1i32;
1120+
let mut aggregate_mask_c = -1i32;
1121+
let mut aggregate_mask_d = -1i32;
1122+
let mut aggregate_mask_a_b = -1i32;
1123+
let mut aggregate_mask_c_d = -1i32;
1124+
1125+
while i + CHUNK <= len {
1126+
let simd_a1 = _mm256_loadu_si256(a.as_ptr().add(i) as *const _);
1127+
let simd_b1 = _mm256_loadu_si256(b.as_ptr().add(i) as *const _);
1128+
1129+
let simd_a2 = _mm256_loadu_si256(a.as_ptr().add(i + 32) as *const _);
1130+
let simd_b2 = _mm256_loadu_si256(b.as_ptr().add(i + 32) as *const _);
1131+
1132+
let simd_a3 = _mm256_loadu_si256(a.as_ptr().add(i + 64) as *const _);
1133+
let simd_b3 = _mm256_loadu_si256(b.as_ptr().add(i + 64) as *const _);
1134+
1135+
let simd_a4 = _mm256_loadu_si256(a.as_ptr().add(i + 96) as *const _);
1136+
let simd_b4 = _mm256_loadu_si256(b.as_ptr().add(i + 96) as *const _);
1137+
1138+
let simd_a5 = _mm256_loadu_si256(a.as_ptr().add(i + 128) as *const _);
1139+
let simd_b5 = _mm256_loadu_si256(b.as_ptr().add(i + 128) as *const _);
1140+
1141+
let simd_a6 = _mm256_loadu_si256(a.as_ptr().add(i + 160) as *const _);
1142+
let simd_b6 = _mm256_loadu_si256(b.as_ptr().add(i + 160) as *const _);
1143+
1144+
let simd_a7 = _mm256_loadu_si256(a.as_ptr().add(i + 192) as *const _);
1145+
let simd_b7 = _mm256_loadu_si256(b.as_ptr().add(i + 192) as *const _);
1146+
1147+
let simd_a8 = _mm256_loadu_si256(a.as_ptr().add(i + 224) as *const _);
1148+
let simd_b8 = _mm256_loadu_si256(b.as_ptr().add(i + 224) as *const _);
1149+
1150+
let cmp1 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a1, simd_b1));
1151+
let cmp2 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a2, simd_b2));
1152+
let cmp3 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a3, simd_b3));
1153+
let cmp4 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a4, simd_b4));
1154+
let cmp5 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a5, simd_b5));
1155+
let cmp6 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a6, simd_b6));
1156+
let cmp7 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a7, simd_b7));
1157+
let cmp8 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a8, simd_b8));
1158+
1159+
aggregate_mask_a &= cmp1 & cmp2;
1160+
aggregate_mask_b &= cmp3 & cmp4;
1161+
aggregate_mask_c &= cmp5 & cmp6;
1162+
aggregate_mask_d &= cmp7 & cmp8;
1163+
1164+
aggregate_mask_a_b &= aggregate_mask_a & aggregate_mask_b;
1165+
aggregate_mask_c_d &= aggregate_mask_c & aggregate_mask_d;
1166+
1167+
if aggregate_mask_a_b & aggregate_mask_c_d != -1i32 {
1168+
return false;
1169+
}
1170+
1171+
i += CHUNK;
1172+
}
1173+
1174+
a[i..] == b[i..]
1175+
}
1176+
}
1177+
1178+
#[cfg(feature = "experimental")]
1179+
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
1180+
fn slices_equal_avx512(a: &[u8], b: &[u8]) -> bool {
1181+
use std::arch::x86_64::*;
1182+
1183+
if a.len() != b.len() {
1184+
return false;
1185+
}
1186+
1187+
let len = a.len();
1188+
let mut i = 0;
1189+
const CHUNK: usize = 512;
1190+
1191+
// if the slices are small, we don't need to
1192+
// use SIMD instructions due to overhead
1193+
if a.len() < CHUNK {
1194+
return slices_equal_fallback(a, b);
1195+
}
1196+
1197+
unsafe {
1198+
while i + CHUNK <= len {
1199+
let simd_a1 = _mm512_loadu_si512(a.as_ptr().add(i) as *const _);
1200+
let simd_b1 = _mm512_loadu_si512(b.as_ptr().add(i) as *const _);
1201+
1202+
let simd_a2 = _mm512_loadu_si512(a.as_ptr().add(i + 64) as *const _);
1203+
let simd_b2 = _mm512_loadu_si512(b.as_ptr().add(i + 64) as *const _);
1204+
1205+
let simd_a3 = _mm512_loadu_si512(a.as_ptr().add(i + 128) as *const _);
1206+
let simd_b3 = _mm512_loadu_si512(b.as_ptr().add(i + 128) as *const _);
1207+
1208+
let simd_a4 = _mm512_loadu_si512(a.as_ptr().add(i + 192) as *const _);
1209+
let simd_b4 = _mm512_loadu_si512(b.as_ptr().add(i + 192) as *const _);
1210+
1211+
let simd_a5 = _mm512_loadu_si512(a.as_ptr().add(i + 256) as *const _);
1212+
let simd_b5 = _mm512_loadu_si512(b.as_ptr().add(i + 256) as *const _);
1213+
1214+
let simd_a6 = _mm512_loadu_si512(a.as_ptr().add(i + 320) as *const _);
1215+
let simd_b6 = _mm512_loadu_si512(b.as_ptr().add(i + 320) as *const _);
1216+
1217+
let simd_a7 = _mm512_loadu_si512(a.as_ptr().add(i + 384) as *const _);
1218+
let simd_b7 = _mm512_loadu_si512(b.as_ptr().add(i + 384) as *const _);
1219+
1220+
let simd_a8 = _mm512_loadu_si512(a.as_ptr().add(i + 448) as *const _);
1221+
let simd_b8 = _mm512_loadu_si512(b.as_ptr().add(i + 448) as *const _);
1222+
1223+
let cmp1 = _mm512_cmpeq_epi8_mask(simd_a1, simd_b1);
1224+
let cmp2 = _mm512_cmpeq_epi8_mask(simd_a2, simd_b2);
1225+
let cmp3 = _mm512_cmpeq_epi8_mask(simd_a3, simd_b3);
1226+
let cmp4 = _mm512_cmpeq_epi8_mask(simd_a4, simd_b4);
1227+
let cmp5 = _mm512_cmpeq_epi8_mask(simd_a5, simd_b5);
1228+
let cmp6 = _mm512_cmpeq_epi8_mask(simd_a6, simd_b6);
1229+
let cmp7 = _mm512_cmpeq_epi8_mask(simd_a7, simd_b7);
1230+
let cmp8 = _mm512_cmpeq_epi8_mask(simd_a8, simd_b8);
1231+
1232+
let cmp1_2 = cmp1 & cmp2;
1233+
let cmp3_4 = cmp3 & cmp4;
1234+
let cmp5_6 = cmp5 & cmp6;
1235+
let cmp7_8 = cmp7 & cmp8;
1236+
1237+
let cmp1_4 = cmp1_2 & cmp3_4;
1238+
let cmp5_8 = cmp5_6 & cmp7_8;
1239+
1240+
let full_cmp = cmp1_4 & cmp5_8;
1241+
1242+
if full_cmp != u64::MAX {
1243+
return false;
1244+
}
1245+
1246+
i += CHUNK_SIZE;
1247+
}
1248+
1249+
a[i..] == b[i..]
1250+
}
1251+
}
1252+
1253+
#[inline]
1254+
fn slices_equal_fallback(a: &[u8], b: &[u8]) -> bool {
1255+
a == b
1256+
}
1257+
1258+
#[inline]
1259+
fn slice_eq(a: &[u8], b: &[u8]) -> bool {
1260+
#[cfg(feature = "experimental")]
1261+
{
1262+
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
1263+
{
1264+
return slices_equal_avx512(a, b);
1265+
}
1266+
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
1267+
{
1268+
return slices_equal_avx2(a, b);
1269+
}
1270+
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
1271+
{
1272+
return slices_equal_neon(a, b);
1273+
}
1274+
1275+
#[allow(unreachable_code)]
1276+
slices_equal_fallback(a, b)
1277+
}
1278+
#[cfg(not(feature = "experimental"))]
1279+
{
1280+
slices_equal_fallback(a, b)
1281+
}
1282+
}
1283+
1284+
#[test]
1285+
fn slice_eq_test() {
1286+
let a = [1u8; 20000];
1287+
let b = [1u8; 20000];
1288+
1289+
assert!(slice_eq(&a, &b));
1290+
}
1291+
10261292
pub(crate) fn memeq(
10271293
memory: &mut MemoryInstance,
10281294
result: &mut Word,
@@ -1031,7 +1297,9 @@ pub(crate) fn memeq(
10311297
c: Word,
10321298
d: Word,
10331299
) -> SimpleResult<()> {
1034-
*result = (memory.read(b, d)? == memory.read(c, d)?) as Word;
1300+
let range_a = memory.read(b, d)?;
1301+
let range_b = memory.read(c, d)?;
1302+
*result = slice_eq(range_a, range_b) as Word;
10351303
Ok(inc_pc(pc)?)
10361304
}
10371305

0 commit comments

Comments
 (0)