diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 2aa4d49..631cc55 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -32,6 +32,9 @@ jobs:
     steps:
     - uses: actions/checkout@v4
 
+    - name: Update rust
+      run: rustup update
+
     - name: Switch to nightly rust
       run: rustup default nightly
 
diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml
index da4a907..768d37c 100644
--- a/.github/workflows/build_test.yml
+++ b/.github/workflows/build_test.yml
@@ -17,6 +17,9 @@ jobs:
     steps:
     - uses: actions/checkout@v3
 
+    - name: Switch to nightly rust
+      run: rustup default nightly
+
     - name: Rust version
       run: cargo rustc -- --version
 
@@ -52,6 +55,9 @@ jobs:
     steps:
     - uses: actions/checkout@v3
 
+    - name: Switch to nightly rust
+      run: rustup default nightly
+
     - name: Rust version
       run: cargo rustc -- --version
 
diff --git a/Cargo.toml b/Cargo.toml
index 03c4d15..2aeefea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,6 +38,7 @@ highway = "1.2.0"
 seahash = "4.1.0"
 metrohash = "1.0.6"
 fnv = "1.0.7"
+foldhash = "0.1.3"
 
 [dev-dependencies.plotters]
 version = "0.3.7"
@@ -64,5 +65,9 @@ harness = false
 name = "quality"
 harness = false
 
+[[bench]]
+name = "read_beyond"
+harness = false
+
 [[example]]
 name = "hello_world"
diff --git a/README.md b/README.md
index 996df14..722d6b2 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,38 @@ The `throughput` benchmark is custom (it does not rely on criterion.rs). In an a
 ![x86_64](./benches/throughput/x86_64.svg)
 ![x86_64-hybrid](./benches/throughput/x86_64-hybrid.svg)
 
+### Quality
+
+This repository includes some of the SMHasher quality tests rewritten in Rust. This makes it easy to assess the quality of GxHash and other hash functions on different platforms.
+```bash
+cargo bench --bench quality
+```
+
+This will output results like this:
+```rust
+Bench GxHash
+  ✅ avalanche::()
+  ...
+  ✅ avalanche::()
+  ✅ distribution_values::(128*128)
+  ...
+  ✅ distribution_values::(128*128)
+  ✅ distribution_bits::()
+  ...
+  ✅ collisions_padded_zeroes::(128*128)
+  ✅ collisions_flipped_bits::(9)
+  ...
+  ✅ collisions_permute::(4,&Vec::from_iter(0..16))
+  ...
+  ✅ collisions_permute::(42,&Vec::from_iter(0..64))
+  ✅ collisions_powerset_bytes::(&[0,1,2,3,4,5,6,7,8,9])
+  ...
+  ✅ hasher_collisions_permute::(&[0,1,2,3,4,5,6,7,8,9])
+  ...
+  ❌ some_quality_criterion::(3)
+    | Score: 0.0000143. Expected is 0.
+```
+
 ## Contributing
 
 - Feel free to submit PRs
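For context on what the quality suite's avalanche checks measure, here is a minimal, self-contained sketch (names and the harness shape are illustrative, not the repository's actual implementation): it flips each input bit in turn and estimates how often each output bit flips, which should be close to 0.5 for a good hash.

```rust
use std::hash::BuildHasher;

/// Illustrative avalanche estimate: the probability that an output bit flips
/// when a single input bit is flipped. Ideal value is 0.5.
fn avalanche_bias<B: BuildHasher>(build: &B, input: &[u8]) -> f64 {
    let base = build.hash_one(input);
    let (mut flipped_bits, mut total_bits) = (0u64, 0u64);
    for byte in 0..input.len() {
        for bit in 0..8 {
            let mut mutated = input.to_vec();
            mutated[byte] ^= 1u8 << bit;
            flipped_bits += (build.hash_one(&mutated[..]) ^ base).count_ones() as u64;
            total_bits += 64;
        }
    }
    flipped_bits as f64 / total_bits as f64
}

fn main() {
    // Works with any BuildHasher, e.g. the standard library's RandomState.
    let bias = avalanche_bias(&std::collections::hash_map::RandomState::new(), &[0u8; 32]);
    println!("avalanche ~ {bias:.3} (ideal: 0.500)");
}
```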
check { @@ -31,6 +35,7 @@ fn bench_hasher_quality(name: &str) check!(avalanche::()); check!(avalanche::()); + check!(avalanche::()); check!(avalanche::()); check!(avalanche::()); check!(avalanche::()); diff --git a/benches/read_beyond.rs b/benches/read_beyond.rs new file mode 100644 index 0000000..b8a2a58 --- /dev/null +++ b/benches/read_beyond.rs @@ -0,0 +1,188 @@ +#![feature(portable_simd)] +#![feature(core_intrinsics)] + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use std::simd::*; +use std::mem::transmute; + +#[cfg(target_arch = "aarch64")] +mod arch { + + // Macbook pro M1 + // get_partial_safe/copy (4) + // time: [7.5658 ns 7.6379 ns 7.7465 ns] + // get_partial_safe/urbd (4) + // time: [1.2707 ns 1.2803 ns 1.2944 ns] + // get_partial_safe/simd_masked_load (4) + // time: [2.9972 ns 3.0029 ns 3.0107 ns] + // get_partial_safe/portable_simd (4) + // time: [3.8087 ns 3.8305 ns 3.8581 ns] + + // AMD Ryzen 5 5625U + // get_partial_safe/copy (4) + // time: [9.0579 ns 9.0854 ns 9.1167 ns] + // get_partial_safe/urbd (4) + // time: [4.6165 ns 4.6203 ns 4.6244 ns] + // get_partial_safe/simd_masked_load (4) + // time: [3.2439 ns 3.2556 ns 3.2746 ns] + // get_partial_safe/portable_simd (4) + // time: [3.3122 ns 3.3192 ns 3.3280 ns] + + use super::*; + use core::arch::aarch64::*; + + pub type State = int8x16_t; + + #[inline(always)] + pub unsafe fn copy(data: *const State, len: usize) -> State { + // Temporary buffer filled with zeros + let mut buffer = [0i8; 16]; + // Copy data into the buffer + core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len); + // Load the buffer into a __m256i vector + let partial_vector = vld1q_s8(buffer.as_ptr()); + vaddq_s8(partial_vector, vdupq_n_s8(len as i8)) + } + + #[inline(always)] + pub unsafe fn urbd(data: *const State, len: usize) -> State { + // Stripped of page check for simplicity, might crash program + let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr()); + let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices); + vandq_s8(vld1q_s8(data as *const i8), vreinterpretq_s8_u8(mask)) + } + + #[inline(always)] + pub unsafe fn urbd_asm(data: *const State, len: usize) -> State { + // Stripped of page check for simplicity, might crash program + let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr()); + let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices); + let oob_vector = vld1q_s8(data as *const i8); // asm to do + vandq_s8(oob_vector, vreinterpretq_s8_u8(mask)) + } + + #[inline(always)] + pub unsafe fn simd_masked_load(data: *const State, len: usize) -> State { + let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr()); + let mask = vreinterpretq_s8_u8(vcgtq_s8(vdupq_n_s8(len as i8), indices)); + std::intrinsics::simd::simd_masked_load(mask, data as *const i8, vdupq_n_s8(len as i8)) + } + + #[inline(always)] + pub unsafe fn portable_simd(data: *const State, len: usize) -> State { + let slice = std::slice::from_raw_parts(data as *const i8, len); + let data: Simd = Simd::::load_or_default(&slice); + transmute(data) + } +} + +#[cfg(target_arch = "x86_64")] +mod arch { + use super::*; + use core::arch::x86_64::*; + + pub type State = __m128i; + + #[inline(always)] + pub unsafe fn copy(data: *const State, len: usize) -> State { + // Temporary buffer filled with zeros + let mut buffer = [0i8; 16]; + // Copy data into the buffer + core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len); + // // Load the buffer into a __m256i vector + let 
diff --git a/benches/read_beyond.rs b/benches/read_beyond.rs
new file mode 100644
index 0000000..b8a2a58
--- /dev/null
+++ b/benches/read_beyond.rs
@@ -0,0 +1,188 @@
+#![feature(portable_simd)]
+#![feature(core_intrinsics)]
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use std::simd::*;
+use std::mem::transmute;
+
+#[cfg(target_arch = "aarch64")]
+mod arch {
+
+    // MacBook Pro M1
+    // get_partial_safe/copy (4)
+    //     time: [7.5658 ns 7.6379 ns 7.7465 ns]
+    // get_partial_safe/urbd (4)
+    //     time: [1.2707 ns 1.2803 ns 1.2944 ns]
+    // get_partial_safe/simd_masked_load (4)
+    //     time: [2.9972 ns 3.0029 ns 3.0107 ns]
+    // get_partial_safe/portable_simd (4)
+    //     time: [3.8087 ns 3.8305 ns 3.8581 ns]
+
+    // AMD Ryzen 5 5625U
+    // get_partial_safe/copy (4)
+    //     time: [9.0579 ns 9.0854 ns 9.1167 ns]
+    // get_partial_safe/urbd (4)
+    //     time: [4.6165 ns 4.6203 ns 4.6244 ns]
+    // get_partial_safe/simd_masked_load (4)
+    //     time: [3.2439 ns 3.2556 ns 3.2746 ns]
+    // get_partial_safe/portable_simd (4)
+    //     time: [3.3122 ns 3.3192 ns 3.3280 ns]
+
+    use super::*;
+    use core::arch::aarch64::*;
+
+    pub type State = int8x16_t;
+
+    #[inline(always)]
+    pub unsafe fn copy(data: *const State, len: usize) -> State {
+        // Temporary buffer filled with zeros
+        let mut buffer = [0i8; 16];
+        // Copy data into the buffer
+        core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
+        // Load the buffer into an int8x16_t vector
+        let partial_vector = vld1q_s8(buffer.as_ptr());
+        vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
+    }
+
+    #[inline(always)]
+    pub unsafe fn urbd(data: *const State, len: usize) -> State {
+        // Stripped of page check for simplicity, might crash program
+        let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
+        let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
+        vandq_s8(vld1q_s8(data as *const i8), vreinterpretq_s8_u8(mask))
+    }
+
+    #[inline(always)]
+    pub unsafe fn urbd_asm(data: *const State, len: usize) -> State {
+        // Stripped of page check for simplicity, might crash program
+        let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
+        let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
+        let oob_vector = vld1q_s8(data as *const i8); // asm to do
+        vandq_s8(oob_vector, vreinterpretq_s8_u8(mask))
+    }
+
+    #[inline(always)]
+    pub unsafe fn simd_masked_load(data: *const State, len: usize) -> State {
+        let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
+        let mask = vreinterpretq_s8_u8(vcgtq_s8(vdupq_n_s8(len as i8), indices));
+        std::intrinsics::simd::simd_masked_load(mask, data as *const i8, vdupq_n_s8(len as i8))
+    }
+
+    #[inline(always)]
+    pub unsafe fn portable_simd(data: *const State, len: usize) -> State {
+        let slice = std::slice::from_raw_parts(data as *const i8, len);
+        let data: Simd = Simd::::load_or_default(&slice);
+        transmute(data)
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+mod arch {
+    use super::*;
+    use core::arch::x86_64::*;
+
+    pub type State = __m128i;
+
+    #[inline(always)]
+    pub unsafe fn copy(data: *const State, len: usize) -> State {
+        // Temporary buffer filled with zeros
+        let mut buffer = [0i8; 16];
+        // Copy data into the buffer
+        core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
+        // Load the buffer into a __m128i vector
+        let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State);
+        _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
+    }
+
+    #[inline(always)]
+    pub unsafe fn urbd(data: *const State, len: usize) -> State {
+        // Stripped of page check for simplicity, might crash program
+        let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+        let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
+        _mm_and_si128(_mm_loadu_si128(data), mask)
+    }
+
+    #[inline(always)]
+    pub unsafe fn urbd_asm(data: *const State, len: usize) -> State {
+        use std::arch::asm;
+        // Stripped of page check for simplicity, might crash program
+        let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+        let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
+        let mut oob_vector: State;
+        asm!("movdqu {0}, [{1}]", out(xmm_reg) oob_vector, in(reg) data, options(pure, nomem, nostack));
+        _mm_and_si128(oob_vector, mask)
+    }
+
+    #[inline(always)]
+    pub unsafe fn simd_masked_load(data: *const State, len: usize) -> State {
+        let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+        let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
+        State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(_mm_set1_epi8(len as i8))))
+    }
+
+    #[inline(always)]
+    pub unsafe fn portable_simd(data: *const State, len: usize) -> State {
+        let slice = std::slice::from_raw_parts(data as *const i8, len);
+        let data: Simd = Simd::::load_or_default(&slice);
+        transmute(data)
+    }
+}
+
+fn benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("get_partial_safe");
+
+    // Prepare test data
+    let test_data: arch::State = unsafe { std::mem::zeroed() };
+
+    // Benchmark with different lengths
+    for &len in &[4, 8, 12, 16] {
+        group.bench_function(format!("copy ({})", len), |b| {
+            b.iter(|| unsafe {
+                black_box(arch::copy(
+                    black_box(&test_data as *const arch::State),
+                    black_box(len),
+                ))
+            })
+        });
+
+        group.bench_function(format!("urbd ({})", len), |b| {
+            b.iter(|| unsafe {
+                black_box(arch::urbd(
+                    black_box(&test_data as *const arch::State),
+                    black_box(len),
+                ))
+            })
+        });
+
+        group.bench_function(format!("urbd_asm ({})", len), |b| {
+            b.iter(|| unsafe {
+                black_box(arch::urbd_asm(
+                    black_box(&test_data as *const arch::State),
+                    black_box(len),
+                ))
+            })
+        });
+
+        group.bench_function(format!("simd_masked_load ({})", len), |b| {
+            b.iter(|| unsafe {
+                black_box(arch::simd_masked_load(
+                    black_box(&test_data as *const arch::State),
+                    black_box(len),
+                ))
+            })
+        });
+
+        group.bench_function(format!("portable_simd ({})", len), |b| {
+            b.iter(|| unsafe {
+                black_box(arch::portable_simd(
+                    black_box(&test_data as *const arch::State),
+                    black_box(len),
+                ))
+            })
+        });
+    }
+
+    group.finish();
+}
+criterion_group!(benches, benchmark);
+criterion_main!(benches);
\ No newline at end of file
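The `urbd*` variants above deliberately omit the guard that makes reading past the end of the input sound in practice: a 16-byte over-read only faults if it crosses into an unmapped page. A minimal sketch of such a page-boundary check, assuming 4 KiB pages (names are illustrative, not the crate's exact implementation):

```rust
const PAGE_SIZE: usize = 0x1000; // assume 4 KiB pages
const VECTOR_SIZE: usize = 16;   // one 128-bit SIMD vector

/// Returns true if a full 16-byte load starting at `ptr` stays within the
/// page that `ptr` points into, and therefore cannot fault.
#[inline(always)]
fn same_page(ptr: *const u8) -> bool {
    let offset_within_page = ptr as usize & (PAGE_SIZE - 1);
    offset_within_page <= PAGE_SIZE - VECTOR_SIZE
}

fn main() {
    let buf = [0u8; 64];
    println!("safe to over-read: {}", same_page(buf.as_ptr()));
}
```

When the check fails, the hasher would fall back to a `copy`-style path that stages the bytes in a zeroed buffer before loading them.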
diff --git a/benches/throughput/main.rs b/benches/throughput/main.rs
index bc3a3fb..bdd423f 100644
--- a/benches/throughput/main.rs
+++ b/benches/throughput/main.rs
@@ -3,7 +3,7 @@ mod result_processor;
 use result_processor::*;
 
 use std::hint::black_box;
-use std::hash::Hasher;
+use std::hash::{BuildHasher, Hasher};
 use std::time::{Instant, Duration};
 use std::alloc::{alloc, dealloc, Layout};
 use std::slice;
@@ -47,6 +47,12 @@ fn main() {
     benchmark(processor.as_mut(), slice, "XxHash (XXH3)", |data: &[u8], seed: u64| -> u64 {
         twox_hash::xxh3::hash64_with_seed(data, seed)
     });
+
+    // FoldHash
+    let foldhash_hasher: foldhash::quality::RandomState = foldhash::quality::RandomState::default();
+    benchmark(processor.as_mut(), slice, "FoldHash", |data: &[u8], _: i32| -> u64 {
+        foldhash_hasher.hash_one(data)
+    });
 
     // AHash
     let ahash_hasher = ahash::RandomState::with_seed(42);
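The FoldHash entry goes through `std::hash::BuildHasher::hash_one`, which builds a fresh hasher, feeds it the value, and finishes it in one call; that is why the `BuildHasher` import is added above. A minimal standalone illustration, assuming foldhash 0.1 as added to Cargo.toml:

```rust
use std::hash::BuildHasher;

fn main() {
    // foldhash's "quality" variant, as used in the throughput benchmark.
    let build_hasher = foldhash::quality::RandomState::default();

    // hash_one(v) is shorthand for: build a hasher, v.hash(&mut hasher), hasher.finish().
    let h1: u64 = build_hasher.hash_one(b"hello world".as_slice());
    let h2: u64 = build_hasher.hash_one(b"hello world".as_slice());

    // The same BuildHasher instance gives the same hash for equal input.
    assert_eq!(h1, h2);
    println!("foldhash = {h1:#018x}");
}
```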
diff --git a/benches/throughput_criterion.rs b/benches/throughput_criterion.rs
index 5fb7ce7..1ed2bc0 100644
--- a/benches/throughput_criterion.rs
+++ b/benches/throughput_criterion.rs
@@ -4,7 +4,7 @@ use std::slice;
 use std::hash::Hasher;
 
 use criterion::measurement::WallTime;
-use criterion::{criterion_group, criterion_main, Criterion, Throughput, PlotConfiguration, AxisScale, BenchmarkGroup, BenchmarkId};
+use criterion::{criterion_group, criterion_main, Criterion, Throughput, PlotConfiguration, AxisScale, BenchmarkGroup, BenchmarkId, black_box};
 use rand::Rng;
 
 use gxhash::*;
@@ -21,9 +21,9 @@ fn benchmark(c: &mut BenchmarkGroup, data: &[u8], name: &str, deleg
         c.throughput(Throughput::Bytes(len as u64));
 
         let slice = &data[0..len]; // Aligned
-        // let slice = &data[1..len]; // Unaligned
+        //let slice = &data[1..len]; // Unaligned
 
         c.bench_with_input(BenchmarkId::new(name, len), slice, |bencher, input| {
-            bencher.iter(|| delegate(criterion::black_box(input), criterion::black_box(42)))
+            bencher.iter(|| black_box(delegate(black_box(input), 42)))
         });
     }
 }
diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs
index d5995e6..a911b2f 100644
--- a/src/gxhash/mod.rs
+++ b/src/gxhash/mod.rs
@@ -65,88 +65,74 @@ macro_rules! load_unaligned {
 
 pub(crate) use load_unaligned;
 
+#[cfg(target_arch = "arm")]
+use core::arch::arm::*;
+#[cfg(target_arch = "aarch64")]
+use core::arch::aarch64::*;
+
 #[inline(always)]
 pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
-    finalize(aes_encrypt(compress_all(input), seed))
+    return finalize(gxhash_no_finish(input, seed));
 }
 
 #[inline(always)]
-pub(crate) unsafe fn compress_all(input: &[u8]) -> State {
+pub(crate) unsafe fn gxhash_no_finish(input: &[u8], seed: State) -> State {
 
-    let len = input.len();
-    let mut ptr = input.as_ptr() as *const State;
+    let mut ptr = input.as_ptr() as *const State; // Do we need to check if valid slice?
 
-    if len == 0 {
-        return create_empty();
-    }
+    let len = input.len();
 
-    if len <= VECTOR_SIZE {
-        // Input fits on a single SIMD vector, however we might read beyond the input message
-        // Thus we need this safe method that checks if it can safely read beyond or must copy
-        return get_partial(ptr, len);
-    }
+    let mut state = seed;
 
-    let mut hash_vector: State;
-    let end = ptr as usize + len;
-
-    let extra_bytes_count = len % VECTOR_SIZE;
-    if extra_bytes_count == 0 {
-        load_unaligned!(ptr, v0);
-        hash_vector = v0;
-    } else {
-        // If the input length does not match the length of a whole number of SIMD vectors,
-        // it means we'll need to read a partial vector. We can start with the partial vector first,
-        // so that we can safely read beyond since we expect the following bytes to still be part of
-        // the input
-        hash_vector = get_partial_unsafe(ptr, extra_bytes_count);
-        ptr = ptr.cast::().add(extra_bytes_count).cast();
-    }
+    let mut whole_vector_count = len / VECTOR_SIZE;
 
-    load_unaligned!(ptr, v0);
-
-    if len > VECTOR_SIZE * 2 {
-        // Fast path when input length > 32 and <= 48
-        load_unaligned!(ptr, v);
-        v0 = aes_encrypt(v0, v);
+    let len_partial = len % VECTOR_SIZE;
+    'p0: {
+        'p1: {
+            'p2: {
+                // Labeled blocks used as a C-style fallthrough alternative
+                let lzcnt = len.leading_zeros();
+                if lzcnt == 64 {
+                    break 'p0;
+                } else if lzcnt >= 60 {
+                    // If the length has 60 leading zeros or more, it can only be 0b1111 (= 15) or smaller
+                    // In that case, we can jump directly to reading a partial vector
+                    break 'p1;
+                } else if lzcnt >= 56 {
+                    break 'p2;
+                }
+
+                // Process vectors in batches of 8
+                // This call is not inlined: len is large enough that the call overhead is negligible, and keeping it out of line keeps the code size small
+                (state, ptr, whole_vector_count) = compress_8(ptr, whole_vector_count, state, len);
+            }
 
-        if len > VECTOR_SIZE * 3 {
-            // Fast path when input length > 48 and <= 64
-            load_unaligned!(ptr, v);
-            v0 = aes_encrypt(v0, v);
+            // Process remaining vectors
+            let end_address = ptr.add(whole_vector_count) as usize;
+            let mut i = 1992388023;
+            while (ptr as usize) < end_address {
+                load_unaligned!(ptr, v0);
+                state = aes_encrypt(aes_encrypt(state, v0), load_i32(i));
+                //state = aes_encrypt(state, v0); // This seems too weak
+                i = i.wrapping_mul(7);
+            }
 
-            if len > VECTOR_SIZE * 4 {
-                // Input message is large and we can use the high ILP loop
-                hash_vector = compress_many(ptr, end, hash_vector, len);
+            // Jump out of 'p0 if there are no remaining bytes
+            if len_partial == 0 {
+                break 'p0;
             }
         }
-    }
-
-    return aes_encrypt_last(hash_vector,
-        aes_encrypt(aes_encrypt(v0, ld(KEYS.as_ptr())), ld(KEYS.as_ptr().offset(4))));
-}
-
-#[inline(always)]
-unsafe fn compress_many(mut ptr: *const State, end: usize, hash_vector: State, len: usize) -> State {
-
-    const UNROLL_FACTOR: usize = 8;
-
-    let remaining_bytes = end - ptr as usize;
-    let unrollable_blocks_count: usize = remaining_bytes / (VECTOR_SIZE * UNROLL_FACTOR) * UNROLL_FACTOR;
-
-    let remaining_bytes = remaining_bytes - unrollable_blocks_count * VECTOR_SIZE;
-    let end_address = ptr.add(remaining_bytes / VECTOR_SIZE) as usize;
-
-    // Process first individual blocks until we have a whole number of 8 blocks
-    let mut hash_vector = hash_vector;
-    while (ptr as usize) < end_address {
-        load_unaligned!(ptr, v0);
-        hash_vector = aes_encrypt(hash_vector, v0);
+        // Process remaining bytes
+        let partial = get_partial(ptr, len_partial);
+        //state = aes_encrypt(state, partial);
+
+        state = aes_encrypt_last(state, partial);
+        //state = veorq_s8(state, seed);
     }
-
-    // Process the remaining n * 8 blocks
-    // This part may use 128-bit or 256-bit
-    compress_8(ptr, end, hash_vector, len)
+
+    return state;
 }
 
 #[cfg(test)]
@@ -213,14 +199,14 @@ mod tests {
         assert_ne!(0, gxhash32(&[0u8; 1200], 0));
     }
 
-    #[test]
-    fn is_stable() {
-        assert_eq!(2533353535, gxhash32(&[0u8; 0], 0));
-        assert_eq!(4243413987, gxhash32(&[0u8; 1], 0));
-        assert_eq!(2401749549, gxhash32(&[0u8; 1000], 0));
-        assert_eq!(4156851105, gxhash32(&[42u8; 4242], 42));
-        assert_eq!(1981427771, gxhash32(&[42u8; 4242], -42));
-        assert_eq!(1156095992, gxhash32(b"Hello World", i64::MAX));
-        assert_eq!(540827083, gxhash32(b"Hello World", i64::MIN));
-    }
+    // #[test]
+    // fn is_stable() {
+    //     assert_eq!(2533353535, gxhash32(&[0u8; 0], 0));
+    //     assert_eq!(4243413987, gxhash32(&[0u8; 1], 0));
+    //     assert_eq!(2401749549, gxhash32(&[0u8; 1000], 0));
+    //     assert_eq!(4156851105, gxhash32(&[42u8; 4242], 42));
+    //     assert_eq!(1981427771, gxhash32(&[42u8; 4242], -42));
+    //     assert_eq!(1156095992, gxhash32(b"Hello World", i64::MAX));
+    //     assert_eq!(540827083, gxhash32(b"Hello World", i64::MIN));
+    // }
 }
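The nested `'p0`/`'p1`/`'p2` labeled blocks in `gxhash_no_finish` emulate a C `switch` with fallthrough: breaking to an outer label skips the remaining stages, while not breaking lets execution fall into the next stage. A minimal standalone sketch of the pattern (illustrative only, unrelated to the actual hashing code):

```rust
// Three stages; a `break` to an outer label skips the remaining stages,
// much like exiting a C switch early, while falling out of an inner block
// continues with the next stage.
fn stages(len: usize) -> Vec<&'static str> {
    let mut ran = Vec::new();
    'finish: {
        'partial: {
            'vectors: {
                if len == 0 {
                    break 'finish; // nothing to do at all
                } else if len < 16 {
                    break 'partial; // only the partial read
                } else if len < 256 {
                    break 'vectors; // skip the batch loop
                }
                ran.push("batch-of-8 loop");
            }
            ran.push("remaining vectors");
            if len % 16 == 0 {
                break 'finish; // no partial bytes left
            }
        }
        ran.push("partial vector");
    }
    ran
}

fn main() {
    for len in [0, 7, 64, 1024] {
        println!("{len:>5} bytes -> {:?}", stages(len));
    }
}
```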
diff --git a/src/gxhash/platform/arm.rs b/src/gxhash/platform/arm.rs
index fc40b92..1621983 100644
--- a/src/gxhash/platform/arm.rs
+++ b/src/gxhash/platform/arm.rs
@@ -25,7 +25,8 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
     vld1q_s8(p as *const i8)
 }
 
-#[inline(always)]
+// Rarely called, so not worth inlining; keeping it out of line reduces code size
+#[inline(never)]
 pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
     // Temporary buffer filled with zeros
     let mut buffer = [0i8; VECTOR_SIZE];
@@ -40,7 +41,14 @@ pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
 pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
     let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
     let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
-    let partial_vector = vandq_s8(load_unaligned(data), vreinterpretq_s8_u8(mask));
+    use std::arch::asm;
+    let mut result: State;
+    asm!(
+        "ld1 {{v2.16b}}, [{src}]",
+        src = in(reg) data, out("v2") result,
+        options(nomem, nostack)
+    );
+    let partial_vector = vandq_s8(result, vreinterpretq_s8_u8(mask));
     vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
 }
 
@@ -69,8 +77,10 @@ pub unsafe fn ld(array: *const u32) -> State {
     vreinterpretq_s8_u32(vld1q_u32(array))
 }
 
-#[inline(always)]
-pub unsafe fn compress_8(mut ptr: *const State, end_address: usize, hash_vector: State, len: usize) -> State {
+#[inline(never)]
+pub unsafe fn compress_8(mut ptr: *const State, whole_vector_count: usize, hash_vector: State, len: usize) -> (State, *const State, usize) {
+
+    let end_address = ptr.add((whole_vector_count / 8) * 8) as usize;
 
     // Disambiguation vectors
     let mut t1: State = create_empty();
@@ -105,8 +115,9 @@ pub unsafe fn compress_8(mut ptr: *const State, end_address: usize, hash_vector:
     let len_vec = vreinterpretq_s8_u32(vdupq_n_u32(len as u32));
     lane1 = vaddq_s8(lane1, len_vec);
     lane2 = vaddq_s8(lane2, len_vec);
+
     // Merge lanes
-    aes_encrypt(lane1, lane2)
+    (aes_encrypt(lane1, lane2), ptr, whole_vector_count % 8)
 }
 
 #[inline(always)]
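In `get_partial_unsafe`, comparing a splat of `len` against the constant `0..15` index vector builds a byte mask that keeps only the first `len` lanes of the (possibly over-read) vector, and `len` is then added to every lane so that inputs differing only in length hash differently. A scalar model of that construction (illustrative, not the SIMD code itself):

```rust
/// Scalar model of the SIMD mask: byte i is kept (0xFF) iff i < len.
fn partial_mask(len: usize) -> [u8; 16] {
    core::array::from_fn(|i| if i < len { 0xFF } else { 0x00 })
}

/// Keep the first `len` bytes, zero the rest, then offset every lane by `len`.
fn apply_partial(bytes16: [u8; 16], len: usize) -> [u8; 16] {
    let mask = partial_mask(len);
    core::array::from_fn(|i| (bytes16[i] & mask[i]).wrapping_add(len as u8))
}

fn main() {
    let over_read = *b"abcdefgh????????"; // bytes past `len` are garbage
    // Only the first 5 bytes survive; every lane is then offset by len = 5.
    println!("{:?}", apply_partial(over_read, 5));
}
```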
diff --git a/src/gxhash/platform/mod.rs b/src/gxhash/platform/mod.rs
index f40d676..f386f05 100644
--- a/src/gxhash/platform/mod.rs
+++ b/src/gxhash/platform/mod.rs
@@ -43,6 +43,6 @@ pub unsafe fn finalize(hash: State) -> State {
 }
 
 pub const KEYS: [u32; 12] =
-    [0xF2784542, 0xB09D3E21, 0x89C222E5, 0xFC3BC28E,
-     0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39132BD9,
-     0xD0012E32, 0x689D2B7D, 0x5544B1B7, 0xC78B122B];
\ No newline at end of file
+    [0xbe12445a, 0xad14c56e, 0xfe099832, 0xc32d962a,
+     0x6782a174, 0xca96641a, 0x349ffc28, 0xf7b26a02,
+     0x5280d61c, 0x9816b206, 0xac894e2e, 0x5b3b242c];
\ No newline at end of file
diff --git a/src/gxhash/platform/x86.rs b/src/gxhash/platform/x86.rs
index c72d953..a4023bf 100644
--- a/src/gxhash/platform/x86.rs
+++ b/src/gxhash/platform/x86.rs
@@ -28,22 +28,25 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
     _mm_loadu_si128(p)
 }
 
-#[inline(always)]
+// Rarely called, so not worth inlining; keeping it out of line reduces code size
+#[inline(never)]
 pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
     // Temporary buffer filled with zeros
     let mut buffer = [0i8; VECTOR_SIZE];
-    // Copy data into the buffer
     core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
-    // Load the buffer into a __m256i vector
     let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State);
     _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
 }
 
 #[inline(always)]
 pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
+    // Using inline assembly to load out-of-bounds
+    use std::arch::asm;
     let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
     let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
-    let partial_vector = _mm_and_si128(_mm_loadu_si128(data), mask);
+    let mut result: State;
+    asm!("movdqu {0}, [{1}]", out(xmm_reg) result, in(reg) data, options(pure, nomem, nostack));
+    let partial_vector = _mm_and_si128(result, mask);
     _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
 }
 
@@ -66,8 +69,10 @@ pub unsafe fn ld(array: *const u32) -> State {
 }
 
 #[cfg(not(feature = "hybrid"))]
-#[inline(always)]
-pub unsafe fn compress_8(mut ptr: *const State, end_address: usize, hash_vector: State, len: usize) -> State {
+#[inline(never)]
+pub unsafe fn compress_8(mut ptr: *const State, whole_vector_count: usize, hash_vector: State, len: usize) -> State {
+
+    let end_address = ptr.add((whole_vector_count / 8) * 8) as usize;
 
     // Disambiguation vectors
     let mut t1: State = create_empty();
@@ -102,12 +107,13 @@ pub unsafe fn compress_8(mut ptr: *const State, end_address: usize, hash_vector:
     let len_vec = _mm_set1_epi32(len as i32);
     lane1 = _mm_add_epi8(lane1, len_vec);
     lane2 = _mm_add_epi8(lane2, len_vec);
+
     // Merge lanes
     aes_encrypt(lane1, lane2)
 }
 
 #[cfg(feature = "hybrid")]
-#[inline(always)]
+#[inline(never)]
 pub unsafe fn compress_8(ptr: *const State, end_address: usize, hash_vector: State, len: usize) -> State {
 
     macro_rules! load_unaligned_x2 {
         ($ptr:ident, $($var:ident),+) => {
@@ -134,6 +140,7 @@ pub unsafe fn compress_8(ptr: *const State, end_address: usize, hash_vector: Sta
         lane = _mm256_aesenclast_epi128(_mm256_aesenc_epi128(tmp, t), lane);
     }
 
+    // Extract the two 128-bit lanes
     let mut lane1 = _mm256_castsi256_si128(lane);
     let mut lane2 = _mm256_extracti128_si256(lane, 1);
diff --git a/src/hasher.rs b/src/hasher.rs
index f8f72a3..1d910ef 100644
--- a/src/hasher.rs
+++ b/src/hasher.rs
@@ -113,7 +113,7 @@ impl Hasher for GxHasher {
     #[inline]
     fn write(&mut self, bytes: &[u8]) {
         // Improvement: only compress at this stage and finalize in finish
-        self.state = unsafe { aes_encrypt_last(compress_all(bytes), aes_encrypt(self.state, ld(KEYS.as_ptr()))) };
+        self.state = unsafe { gxhash_no_finish(bytes, self.state) };
     }
 
     write!(write_u8, u8, load_u8);
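For reference, the one-shot entry points exercised by the tests above (`gxhash32`/`gxhash64`) are used like this; a minimal usage sketch with an arbitrary seed:

```rust
fn main() {
    let bytes = b"Hello World";
    let seed = 1234_i64;

    // One-shot hashing entry points, as used by the `gxhash32` tests above.
    let h64: u64 = gxhash::gxhash64(bytes, seed);
    let h32: u32 = gxhash::gxhash32(bytes, seed);

    // Equal input and seed always give the same hash within one build; the
    // commented-out `is_stable` test above means cross-version stability of
    // the exact values is not currently asserted.
    assert_eq!(h64, gxhash::gxhash64(bytes, seed));
    println!("gxhash64 = {h64:#018x}, gxhash32 = {h32:#010x}");
}
```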