diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 2aa4d49..d7039fe 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -1,6 +1,8 @@ name: Benchmark on: + pull_request: + branches: [ "main" ] workflow_dispatch: env: @@ -17,6 +19,9 @@ jobs: - name: Update rust run: rustup update + - name: Switch to nightly rust + run: rustup default nightly + - name: Benchmark run: cargo bench --bench throughput --features bench-plot @@ -32,6 +37,9 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Update rust + run: rustup update + - name: Switch to nightly rust run: rustup default nightly @@ -53,6 +61,9 @@ jobs: - name: Update rust run: rustup update + - name: Switch to nightly rust + run: rustup default nightly + - name: Benchmark run: cargo bench --bench throughput --features bench-plot
diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index da4a907..768d37c 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -17,6 +17,9 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Switch to nightly rust + run: rustup default nightly + - name: Rust version run: cargo rustc -- --version @@ -52,6 +55,9 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Switch to nightly rust + run: rustup default nightly + - name: Rust version run: cargo rustc -- --version
diff --git a/Cargo.toml b/Cargo.toml index e410fa6..d503dc4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ itertools = "0.12.0" # Benchmarks criterion = { version = "0.5.1" } # Other hash algorithms, for comparison. -ahash = "0.8.6" +ahash = "0.8.11" t1ha = "0.1.0" twox-hash = "1.6.3" highway = "1.1.0" @@ -62,4 +62,8 @@ harness = false [[bench]] name = "quality" +harness = false + +[[bench]] +name = "read_beyond" harness = false \ No newline at end of file
diff --git a/README.md b/README.md index 12379a4..4645698 100644 --- a/README.md +++ b/README.md @@ -109,11 +109,15 @@ cargo bench --bench throughput cargo bench --bench hashset ``` +Note: The `throughput` benchmark does not rely on criterion for timing measurements. In an attempt to reduce bias in this microbenchmark as much as possible, it shuffles seeds, input data, and alignment. It also has the benefit of being less of a "black box" compared to criterion. There is, however, a criterion-based throughput benchmark named `throughput_criterion` if you prefer. Results vary slightly between the two benchmarks; don't hesitate to submit an issue if you suspect bias and want to suggest improvements. + +Most importantly: if performance is a critical feature for your application, don't forget to benchmark the cost of hashing in your own context. Numbers shared here may be radically different in your environment and with your hardware. + ### Throughput Throughput is measured as the number of bytes hashed per second. -*Some prefer talking **latency** (time for generating a hash) or **hashrate** (the number of hashes generated per second) for measuring hash function performance, but those are all equivalent in the end as they all boil down to measuring the time it takes to hash some input and then apply different scalar transformation. For instance, if latency for a `4 bytes` hash is `1 ms`, then the throughput is `1 / 0.001 * 4 = 4000 bytes per second`. Throughput allows us to conveniently compare the performance of a hash function for any input size on a single graph.* +*Some prefer talking of **latency** (time for generating a hash) or **hashrate** (the number of hashes generated per second) for measuring hash function performance, but those are all equivalent in the end, as they all boil down to measuring the time it takes to hash some input and then applying a different scalar transformation. For instance, if the latency for a `4 bytes` hash is `1 ms`, then the throughput is `1 / 0.001 * 4 = 4000 bytes per second`. Throughput allows us to conveniently compare the performance of a hash function for any input size on a single graph.* **Latest Benchmark Results:** ![aarch64](./benches/throughput/aarch64.svg)
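The README paragraph above describes the latency-to-throughput conversion in prose; a minimal Rust sketch of that arithmetic follows (the function name is illustrative only and not part of the crate or of this patch):

```rust
/// Convert a measured hashing latency into a throughput figure.
/// Example from the paragraph above: 4 bytes hashed in 1 ms -> 4000 bytes per second.
fn throughput_bytes_per_sec(input_len_bytes: usize, latency_secs: f64) -> f64 {
    input_len_bytes as f64 / latency_secs
}

fn main() {
    assert_eq!(throughput_bytes_per_sec(4, 0.001), 4000.0);
}
```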
diff --git a/benches/read_beyond.rs b/benches/read_beyond.rs new file mode 100644 index 0000000..b8a2a58 --- /dev/null +++ b/benches/read_beyond.rs @@ -0,0 +1,188 @@ +#![feature(portable_simd)] +#![feature(core_intrinsics)] + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use std::simd::*; +use std::mem::transmute; + +#[cfg(target_arch = "aarch64")] +mod arch { + + // MacBook Pro M1 + // get_partial_safe/copy (4) + // time: [7.5658 ns 7.6379 ns 7.7465 ns] + // get_partial_safe/urbd (4) + // time: [1.2707 ns 1.2803 ns 1.2944 ns] + // get_partial_safe/simd_masked_load (4) + // time: [2.9972 ns 3.0029 ns 3.0107 ns] + // get_partial_safe/portable_simd (4) + // time: [3.8087 ns 3.8305 ns 3.8581 ns] + + // AMD Ryzen 5 5625U + // get_partial_safe/copy (4) + // time: [9.0579 ns 9.0854 ns 9.1167 ns] + // get_partial_safe/urbd (4) + // time: [4.6165 ns 4.6203 ns 4.6244 ns] + // get_partial_safe/simd_masked_load (4) + // time: [3.2439 ns 3.2556 ns 3.2746 ns] + // get_partial_safe/portable_simd (4) + // time: [3.3122 ns 3.3192 ns 3.3280 ns] + + use super::*; + use core::arch::aarch64::*; + + pub type State = int8x16_t; + + #[inline(always)] + pub unsafe fn copy(data: *const State, len: usize) -> State { + // Temporary buffer filled with zeros + let mut buffer = [0i8; 16]; + // Copy data into the buffer + core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len); + // Load the buffer into a 128-bit NEON vector + let partial_vector = vld1q_s8(buffer.as_ptr()); + vaddq_s8(partial_vector, vdupq_n_s8(len as i8)) + } + + #[inline(always)] + pub unsafe fn urbd(data: *const State, len: usize) -> State { + // Stripped of page check for simplicity, might crash program + let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr()); + let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices); + vandq_s8(vld1q_s8(data as *const i8), vreinterpretq_s8_u8(mask)) + } + + #[inline(always)] + pub unsafe fn urbd_asm(data: *const State, len: usize) -> State { + // Stripped of page check for simplicity, might crash program + let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr()); + let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices); + let oob_vector = vld1q_s8(data as *const i8); // asm to do + vandq_s8(oob_vector, vreinterpretq_s8_u8(mask)) + } + + #[inline(always)] + pub unsafe fn simd_masked_load(data: *const State, len: usize) -> State { + let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr()); + let mask = vreinterpretq_s8_u8(vcgtq_s8(vdupq_n_s8(len as i8), indices)); + std::intrinsics::simd::simd_masked_load(mask, data as *const i8, vdupq_n_s8(len as i8)) + } + + #[inline(always)] + pub unsafe fn portable_simd(data: *const State, len: usize) -> State { + let
slice = std::slice::from_raw_parts(data as *const i8, len); + let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice); + transmute(data) + } +} + +#[cfg(target_arch = "x86_64")] +mod arch { + use super::*; + use core::arch::x86_64::*; + + pub type State = __m128i; + + #[inline(always)] + pub unsafe fn copy(data: *const State, len: usize) -> State { + // Temporary buffer filled with zeros + let mut buffer = [0i8; 16]; + // Copy data into the buffer + core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len); + // Load the buffer into a __m128i vector + let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State); + _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8)) + } + + #[inline(always)] + pub unsafe fn urbd(data: *const State, len: usize) -> State { + // Stripped of page check for simplicity, might crash program + let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); + _mm_and_si128(_mm_loadu_si128(data), mask) + } + + #[inline(always)] + pub unsafe fn urbd_asm(data: *const State, len: usize) -> State { + use std::arch::asm; + // Stripped of page check for simplicity, might crash program + let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); + let mut oob_vector: State; + asm!("movdqu {0}, [{1}]", out(xmm_reg) oob_vector, in(reg) data, options(pure, nomem, nostack)); + _mm_and_si128(oob_vector, mask) + } + + #[inline(always)] + pub unsafe fn simd_masked_load(data: *const State, len: usize) -> State { + let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); + State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(_mm_set1_epi8(len as i8)))) + } + + #[inline(always)] + pub unsafe fn portable_simd(data: *const State, len: usize) -> State { + let slice = std::slice::from_raw_parts(data as *const i8, len); + let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice); + transmute(data) + } +} + +fn benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("get_partial_safe"); + + // Prepare test data + let test_data: arch::State = unsafe { std::mem::zeroed() }; + + // Benchmark with different lengths + for &len in &[4, 8, 12, 16] { + group.bench_function(format!("copy ({})", len), |b| { + b.iter(|| unsafe { + black_box(arch::copy( + black_box(&test_data as *const arch::State), + black_box(len), + )) + }) + }); + + group.bench_function(format!("urbd ({})", len), |b| { + b.iter(|| unsafe { + black_box(arch::urbd( + black_box(&test_data as *const arch::State), + black_box(len), + )) + }) + }); + + group.bench_function(format!("urbd_asm ({})", len), |b| { + b.iter(|| unsafe { + black_box(arch::urbd_asm( + black_box(&test_data as *const arch::State), + black_box(len), + )) + }) + }); + + group.bench_function(format!("simd_masked_load ({})", len), |b| { + b.iter(|| unsafe { + black_box(arch::simd_masked_load( + black_box(&test_data as *const arch::State), + black_box(len), + )) + }) + }); + + group.bench_function(format!("portable_simd ({})", len), |b| { + b.iter(|| unsafe { + black_box(arch::portable_simd( + black_box(&test_data as *const arch::State), + black_box(len), + )) + }) + }); + } + + group.finish(); +} +criterion_group!(benches, benchmark); +criterion_main!(benches); \ No newline at end of file
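The `urbd*` and `simd_masked_load` variants in the new benchmark all rely on the same trick: compare the lane indices 0..15 against `len` to build a byte mask, then AND it with a full 16-byte load so that lanes at or past `len` are zeroed. For reference, here is a scalar sketch of what the `vcgtq_s8` / `_mm_cmpgt_epi8` comparison computes; it is illustrative only, the benchmark itself uses the intrinsics shown above:

```rust
/// Scalar equivalent of the "len > index" comparison used to build the keep-mask.
/// Lane i is kept (all bits set, i.e. -1) when i < len, and zeroed otherwise.
fn keep_mask(len: usize) -> [i8; 16] {
    let mut mask = [0i8; 16];
    for (i, lane) in mask.iter_mut().enumerate() {
        if (i as i8) < (len as i8) {
            *lane = -1; // all bits set: the loaded byte survives the AND
        }
    }
    mask
}

fn main() {
    assert_eq!(&keep_mask(4)[..6], &[-1, -1, -1, -1, 0, 0]);
}
```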
diff --git a/benches/throughput/aarch64.svg b/benches/throughput/aarch64.svg index bebea7a..3bcf061 100644 [Plot diff not reproduced: the "Throughput (aarch64)" chart is regenerated with the y-axis "Throughput (MiB/s)" switched from linear ticks (0 to 35000) to log-scale ticks (100, 1000, 10000); the x-axis "Input Size (bytes)" ticks (4 to 32768) are unchanged.]
diff --git a/benches/throughput/main.rs b/benches/throughput/main.rs index c84a9cc..cc4566d 100644 --- a/benches/throughput/main.rs +++ b/benches/throughput/main.rs @@ -2,8 +2,8 @@ mod result_processor; use result_processor::*; -use std::hash::Hasher; use std::hint::black_box; +use std::hash::Hasher; use std::time::{Instant, Duration}; use std::alloc::{alloc, dealloc, Layout}; use std::slice; @@ -14,7 +14,6 @@ use gxhash::*; const ITERATIONS: u32 = 1000; const MAX_RUN_DURATION: Duration = Duration::from_millis(1000); -const FORCE_NO_INLINING: bool = false; fn main() { let mut rng = rand::thread_rng(); @@ -49,7 +48,7 @@ fn main() { }); // AHash - let ahash_hasher = ahash::RandomState::with_seeds(0, 0, 0, 0); + let ahash_hasher = ahash::RandomState::with_seed(42); benchmark(processor.as_mut(), slice, "AHash", |data: &[u8], _: i32| -> u64 { ahash_hasher.hash_one(data) }); @@ -91,7 +90,7 @@ fn main() { } fn benchmark<F, S>(processor: &mut dyn ResultProcessor, data: &[u8], name: &str, delegate: F) - where F: Fn(&[u8], S) -> u64, S: Default + TryFrom<u128> + TryInto<usize> + where F: Fn(&[u8], S) -> u64, S: Default + TryFrom<u128> + TryInto<usize> + Clone + Copy { processor.on_start(name); for i in 2.. { @@ -101,22 +100,20 @@ fn benchmark<F, S>(processor: &mut dyn ResultProcessor, data: &[u8], name: &str, } // Warmup - black_box(time(ITERATIONS, &|| delegate(&data[..len], S::default()))); + time(ITERATIONS, &delegate, &data[..len], S::default()); let mut durations_s = vec![]; let now = Instant::now(); while now.elapsed() < MAX_RUN_DURATION { // Make seed unpredictable to prevent optimizations - let seed = S::try_from(now.elapsed().as_nanos()) - .unwrap_or_else(|_| panic!("Something went horribly wrong!")); + let seed = S::try_from(now.elapsed().as_nanos()).unwrap_or_else(|_| panic!()); // Offset slice by an unpredictable amount to prevent optimization (pre caching) // and make the benchmark use both aligned and unaligned data - let start = S::try_into(seed) - .unwrap_or_else(|_| panic!("Something went horribly wrong!")) & 0xFF; + let start = S::try_into(seed).unwrap_or_else(|_| panic!()) & 0xFF; let end = start + len; let slice = &data[start..end]; // Execute method for a new iterations - let duration = time(ITERATIONS, &|| delegate(slice, S::default())); + let duration = time(ITERATIONS, &delegate, slice, seed); durations_s.push(duration.as_secs_f64()); } let average_duration_s = calculate_average_without_outliers(&mut durations_s); @@ -128,31 +125,21 @@ fn benchmark<F, S>(processor: &mut dyn ResultProcessor, data: &[u8], name: &str, } #[inline(never)] -fn time<F>(iterations: u32, delegate: &F) -> Duration - where F: Fn() -> u64 +fn time<F, S>(iterations: u32, delegate: F, slice: &[u8], seed: S) -> Duration + where F: Fn(&[u8], S) -> u64, S: Default + TryFrom<u128> + TryInto<usize> + Clone + Copy { let now = Instant::now(); - // Bench the same way to what is done in criterion.rs + // Bench a similar way to what is done in criterion.rs // https://github.com/bheisler/criterion.rs/blob/e1a8c9ab2104fbf2d15f700d0038b2675054a2c8/src/bencher.rs#L87 - for _ in 0..iterations { - if FORCE_NO_INLINING { - black_box(execute_noinlining(delegate)); - } else { - black_box(delegate()); - } + for _ in 0..iterations { + // Black box the result to prevent the compiler from optimizing the operation away + // Black box the slice to prevent the compiler from assuming the slice is constant + // We don't black box the seed because it's likely to be constant in most real-world usage scenarios + black_box(delegate(black_box(slice), seed)); } now.elapsed() } -// Some algorithm are more likely to be inlined than others. -// This puts then all at the same level. But is it fair? -#[inline(never)] -fn execute_noinlining<F>(delegate: &F) -> u64 - where F: Fn() -> u64 -{ - delegate() -} - // Outliers are inevitable, especially on a low number of iterations // To avoid computing a huge number of iterations we can use the interquartile range fn calculate_average_without_outliers(timings: &mut Vec<f64>) -> f64 {
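The body of `calculate_average_without_outliers` is untouched by this patch and only its doc comment appears as context above. For readers who want the idea behind the interquartile-range filtering it mentions, here is a rough sketch; the actual implementation in the repository may differ in its details:

```rust
/// Assumed shape of IQR-based outlier filtering: drop timings outside
/// [Q1 - 1.5*IQR, Q3 + 1.5*IQR], then average whatever remains.
fn average_without_outliers(timings: &mut Vec<f64>) -> f64 {
    timings.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let q1 = timings[timings.len() / 4];
    let q3 = timings[(timings.len() * 3) / 4];
    let iqr = q3 - q1;
    let (low, high) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr);
    let kept: Vec<f64> = timings.iter().copied().filter(|t| *t >= low && *t <= high).collect();
    kept.iter().sum::<f64>() / kept.len() as f64
}
```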
diff --git a/benches/throughput/result_processor.rs b/benches/throughput/result_processor.rs index c7dbbeb..8963cbe 100644 --- a/benches/throughput/result_processor.rs +++ b/benches/throughput/result_processor.rs @@ -132,7 +132,7 @@ impl ResultProcessor for OutputPlot { let x_min = self.series.iter().next().unwrap().1.iter().map(|(x, _)| *x as u32).min().unwrap(); let x_max = self.series.iter().next().unwrap().1.iter().map(|(x, _)| *x as u32).max().unwrap(); - let y_min = 0u32; + let y_min = self.series.iter().flat_map(|inner_map| inner_map.1.iter()).map(|(_, y)| (0.95 * *y) as u32).min().unwrap(); let y_max = self.series.iter().flat_map(|inner_map| inner_map.1.iter()).map(|(_, y)| (1.05 * *y) as u32).max().unwrap(); let mut chart = ChartBuilder::on(&canvas) @@ -144,8 +144,8 @@ impl ResultProcessor for OutputPlot { (x_min..x_max) .log_scale() .with_key_points(self.series.iter().next().unwrap().1.iter().map(|(x, _)| *x as u32).collect::<Vec<u32>>()), - y_min..y_max - //.log_scale(), + (y_min..y_max) + .log_scale(), ).unwrap(); chart
diff --git a/benches/throughput/x86_64-hybrid.svg b/benches/throughput/x86_64-hybrid.svg index ea05a09..8d20c49 100644 [Plot diff not reproduced: the "Throughput (x86_64-hybrid)" chart is regenerated with the y-axis "Throughput (MiB/s)" switched from linear ticks (0 to 160000) to log-scale ticks (100, 1000, 10000, 100000); the x-axis "Input Size (bytes)" ticks (4 to 32768) are unchanged.]
diff --git a/benches/throughput/x86_64.svg b/benches/throughput/x86_64.svg index e306731..5a2fccb 100644 [Plot diff not reproduced: the "Throughput (x86_64)" chart is regenerated with the y-axis "Throughput (MiB/s)" switched from linear ticks (0 to 100000) to log-scale ticks (1000, 10000, 100000); the x-axis "Input Size (bytes)" ticks (4 to 32768) are unchanged.]
diff --git a/benches/throughput_criterion.rs b/benches/throughput_criterion.rs index 5fb7ce7..515610e 100644 --- a/benches/throughput_criterion.rs +++ b/benches/throughput_criterion.rs @@ -4,7 +4,7 @@ use std::slice; use std::hash::Hasher; use criterion::measurement::WallTime; -use criterion::{criterion_group, criterion_main, Criterion, Throughput, PlotConfiguration, AxisScale, BenchmarkGroup, BenchmarkId}; +use criterion::{criterion_group, criterion_main, Criterion, Throughput, PlotConfiguration, AxisScale, BenchmarkGroup, BenchmarkId, black_box}; use
rand::Rng; use gxhash::*; @@ -21,9 +21,9 @@ fn benchmark<F>(c: &mut BenchmarkGroup<WallTime>, data: &[u8], name: &str, deleg c.throughput(Throughput::Bytes(len as u64)); let slice = &data[0..len]; // Aligned - // let slice = &data[1..len]; // Unaligned + //let slice = &data[1..len]; // Unaligned c.bench_with_input(BenchmarkId::new(name, len), slice, |bencher, input| { - bencher.iter(|| delegate(criterion::black_box(input), criterion::black_box(42))) + bencher.iter(|| black_box(delegate(black_box(input), black_box(42)))) }); } }
diff --git a/src/gxhash/platform/arm.rs b/src/gxhash/platform/arm.rs index fc40b92..0aba171 100644 --- a/src/gxhash/platform/arm.rs +++ b/src/gxhash/platform/arm.rs @@ -25,7 +25,7 @@ pub unsafe fn load_unaligned(p: *const State) -> State { vld1q_s8(p as *const i8) } -#[inline(always)] +#[inline(never)] pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State { // Temporary buffer filled with zeros let mut buffer = [0i8; VECTOR_SIZE]; @@ -34,6 +34,37 @@ pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State { // Load the buffer into a __m256i vector let partial_vector = vld1q_s8(buffer.as_ptr()); vaddq_s8(partial_vector, vdupq_n_s8(len as i8)) + + //let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr()); + //let mask = vreinterpretq_s8_u8(vcgtq_s8(vdupq_n_s8(len as i8), indices)); + + // Using simd_masked_load + // State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(vdupq_n_s8(len as i8)))) + // std::intrinsics::simd::simd_masked_load(mask, data as *const i8, vdupq_n_s8(len as i8)) + + // Using std::simd + // use std::simd::*; + // use std::mem::transmute; + // let slice = std::slice::from_raw_parts(data as *const i8, len); + // let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice); + // let vector: State = transmute(data); + // return vector; +} + +#[inline(always)] +pub unsafe fn get_partial_unsafe_no_ub(data: *const State, len: usize) -> State { + let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr()); + let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices); + use std::arch::asm; + let mut result: State; + asm!( + "ld1 {{v2.16b}}, [{src}]", + src = in(reg) data, out("v2") result, + options(nomem, nostack) + ); + //let result = load_unaligned(data); + let partial_vector = vandq_s8(result, vreinterpretq_s8_u8(mask)); + vaddq_s8(partial_vector, vdupq_n_s8(len as i8)) } #[inline(always)]
diff --git a/src/gxhash/platform/mod.rs b/src/gxhash/platform/mod.rs index f40d676..1c7a185 100644 --- a/src/gxhash/platform/mod.rs +++ b/src/gxhash/platform/mod.rs @@ -18,10 +18,12 @@ const PAGE_SIZE: usize = 0x1000; pub unsafe fn get_partial(p: *const State, len: usize) -> State { // Safety check if check_same_page(p) { - get_partial_unsafe(p, len) + get_partial_unsafe_no_ub(p, len) } else { get_partial_safe(p, len) } + + //get_partial_safe(p, len) } #[inline(always)]
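`check_same_page` itself is not part of this hunk; the point of the branch above is that an unaligned 16-byte load starting at `p` cannot fault as long as it does not cross a 4 KiB page boundary, so the fast out-of-bounds read is only taken in that case. A sketch of such a check, assuming the `PAGE_SIZE` constant visible in this file and a 16-byte vector width (the crate's actual implementation may differ):

```rust
const PAGE_SIZE: usize = 0x1000; // 4 KiB, as declared above this hunk
const VECTOR_SIZE: usize = 16;   // one 128-bit SIMD register

#[inline(always)]
fn check_same_page<T>(p: *const T) -> bool {
    // Offset of the pointer within its page; a full 16-byte read stays on the
    // same page only if it starts early enough before the page boundary.
    let offset_within_page = (p as usize) & (PAGE_SIZE - 1);
    offset_within_page <= PAGE_SIZE - VECTOR_SIZE
}
```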
diff --git a/src/gxhash/platform/x86.rs b/src/gxhash/platform/x86.rs index a5735f1..84eba63 100644 --- a/src/gxhash/platform/x86.rs +++ b/src/gxhash/platform/x86.rs @@ -1,8 +1,8 @@ #[cfg(not(any(all(target_feature = "aes", target_feature = "sse2"), docsrs)))] // docs.rs bypasses the target_feature check compile_error!{"Gxhash requires aes and sse2 intrinsics. Make sure the processor supports it and build with RUSTFLAGS=\"-C target-cpu=native\" or RUSTFLAGS=\"-C target-feature=+aes,+sse2\"."} -#[cfg(all(feature = "hybrid", not(any(target_feature = "aes", target_feature = "vaes", target_feature = "avx2"))))] -compile_error!{"Hybrid feature is only available on x86 processors with avx2 and vaes intrinsics."} +#[cfg(all(feature = "hybrid", not(all(target_feature = "aes", target_feature = "sse2", target_feature = "avx2"))))] +compile_error!{"Hybrid feature is only available on x86 processors with avx2 intrinsics."} #[cfg(target_arch = "x86")] use core::arch::x86::*; @@ -28,22 +28,73 @@ pub unsafe fn load_unaligned(p: *const State) -> State { _mm_loadu_si128(p) } -#[inline(always)] +#[inline(never)] pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State { // Temporary buffer filled with zeros let mut buffer = [0i8; VECTOR_SIZE]; - // Copy data into the buffer core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len); - // Load the buffer into a __m256i vector let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State); _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8)) + + // Using URBD + //get_partial_unsafe(data, len) + + // Using simd_masked_load + // let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + // let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); + // State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(_mm_set1_epi8(len as i8)))) + + // Using std::simd + // use std::simd::*; + // use std::mem::transmute; + // let slice = std::slice::from_raw_parts(data as *const i8, len); + // let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice); + // let vector: State = transmute(data); + // return vector; + + // Using inline assembly to load out-of-bounds + // use std::arch::asm; + // let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + // let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); + // let mut result: State; + // asm!("movdqu {0}, [{1}]", out(xmm_reg) result, in(reg) data, options(pure, nomem, nostack)); + // let partial_vector = _mm_and_si128(result, mask); + // _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8)) +} + +#[inline(always)] +pub unsafe fn get_partial_unsafe_no_ub(data: *const State, len: usize) -> State { + // Using inline assembly to load out-of-bounds + use std::arch::asm; + let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); + let mut result: State; + asm!("movdqu {0}, [{1}]", out(xmm_reg) result, in(reg) data, options(pure, nomem, nostack)); + let partial_vector = _mm_and_si128(result, mask); + _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8)) + + // Using simd_masked_load + // let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + // let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); + // State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(_mm_set1_epi8(len as i8)))) + + // Using std::simd + // use std::simd::*; + // use std::mem::transmute; + // let slice = std::slice::from_raw_parts(data as *const i8, len); + // let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice); + // let vector: State = transmute(data); + // return vector; + + //return get_partial_safe(data, len); } #[inline(always)] pub unsafe fn get_partial_unsafe(data: *const
State, len: usize) -> State { let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); - let partial_vector = _mm_and_si128(_mm_loadu_si128(data), mask); + let d: __m128i = _mm_loadu_si128(data); + let partial_vector = _mm_and_si128(d, mask); _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8)) } diff --git a/src/lib.rs b/src/lib.rs index 705a0bf..d675cf9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +#![feature(core_intrinsics)] +#![feature(portable_simd)] #![cfg_attr(not(feature = "std"), no_std)] // Hybrid SIMD width usage currently requires unstable 'stdsimd' #![cfg_attr(feature = "hybrid", feature(stdarch_x86_avx512))]