From 340f9627109601d81bdd6d2ef0469ed72b7b6f76 Mon Sep 17 00:00:00 2001
From: aviadingo <aviad@ingonyama.com>
Date: Sun, 11 Aug 2024 13:58:49 +0300
Subject: [PATCH] ran clang-format

---
 icicle/include/hash/blake2s/blake2s.cuh       |  80 +++---
 icicle/src/hash/blake2s/Makefile              |   8 +-
 icicle/src/hash/blake2s/blake2s.cu            | 240 +++++++++--------
 icicle/src/hash/blake2s/extern.cu             |   9 +-
 icicle/src/hash/blake2s/test_blake2s.cu       | 168 ++++++------
 .../src/hash/blake2s/test_blake2s_batched.cu  | 243 +++++++++---------
 .../src/hash/blake2s/test_blake2s_hasher.cu   | 110 --------
 icicle/src/hash/blake2s/test_blake2s_integ.cu | 111 ++++++++
 icicle/src/hash/blake2s/test_blake2s_seq.cu   | 165 ++++++------
 .../src/hash/blake2s/test_blake2s_seq_sa.cu   | 172 ++++++-------
 icicle/src/hash/blake2s/test_tree.cu          |  11 +-
 11 files changed, 645 insertions(+), 672 deletions(-)
 delete mode 100644 icicle/src/hash/blake2s/test_blake2s_hasher.cu
 create mode 100644 icicle/src/hash/blake2s/test_blake2s_integ.cu

diff --git a/icicle/include/hash/blake2s/blake2s.cuh b/icicle/include/hash/blake2s/blake2s.cuh
index a4c675ce4..07059f0cd 100644
--- a/icicle/include/hash/blake2s/blake2s.cuh
+++ b/icicle/include/hash/blake2s/blake2s.cuh
@@ -7,49 +7,45 @@
  * This file is released into the Public Domain.
  */
 
+#pragma once
+typedef unsigned char BYTE;
+typedef unsigned int WORD;
+typedef unsigned long long LONG;
 
- #pragma once
- typedef unsigned char BYTE;
- typedef unsigned int  WORD;
- typedef unsigned long long LONG;
- 
- #include <stdlib.h>
- #include <string.h>
- #include <stdio.h>
- #include <stdint.h>
- #include "gpu-utils/device_context.cuh"
- #include "gpu-utils/error_handler.cuh"
- 
- #include "hash/hash.cuh"
- using namespace hash;
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
 
-namespace blake2s{
-    #define BLAKE2S_ROUNDS 10
-    #define BLAKE2S_BLOCK_LENGTH 64
-    #define BLAKE2S_CHAIN_SIZE 8
-    #define BLAKE2S_CHAIN_LENGTH (BLAKE2S_CHAIN_SIZE * sizeof(uint32_t))
-    #define BLAKE2S_STATE_SIZE 16
-    #define BLAKE2S_STATE_LENGTH (BLAKE2S_STATE_SIZE * sizeof(uint32_t))
+#include "hash/hash.cuh"
+using namespace hash;
 
-    class Blake2s : public Hasher<BYTE, BYTE>
-    {
-    public:
-      cudaError_t run_hash_many_kernel(
-        const BYTE* input,
-        BYTE*       output,
-        WORD        number_of_states,
-        WORD        input_len,
-        WORD        output_len,
-        const device_context::DeviceContext& ctx) const override;
-        
-  
-      Blake2s()
-          : Hasher<BYTE, BYTE>(BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, 0)
-      {
-      }
-    };
+namespace blake2s {
+#define BLAKE2S_ROUNDS       10
+#define BLAKE2S_BLOCK_LENGTH 64
+#define BLAKE2S_CHAIN_SIZE   8
+#define BLAKE2S_CHAIN_LENGTH (BLAKE2S_CHAIN_SIZE * sizeof(uint32_t))
+#define BLAKE2S_STATE_SIZE   16
+#define BLAKE2S_STATE_LENGTH (BLAKE2S_STATE_SIZE * sizeof(uint32_t))
 
- extern "C" {
- void mcm_cuda_blake2s_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, BYTE *out, WORD output_len, WORD n_batch);
- }
-}
\ No newline at end of file
+  class Blake2s : public Hasher<BYTE, BYTE>
+  {
+  public:
+    cudaError_t run_hash_many_kernel(
+      const BYTE* input,
+      BYTE* output,
+      WORD number_of_states,
+      WORD input_len,
+      WORD output_len,
+      const device_context::DeviceContext& ctx) const override;
+
+    Blake2s() : Hasher<BYTE, BYTE>(BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, 0) {}
+  };
+
+  extern "C" {
+  void
+  mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD output_len, WORD n_batch);
+  }
+} // namespace blake2s
\ No newline at end of file
diff --git a/icicle/src/hash/blake2s/Makefile b/icicle/src/hash/blake2s/Makefile
index 32d7c0fce..4d13a3733 100644
--- a/icicle/src/hash/blake2s/Makefile
+++ b/icicle/src/hash/blake2s/Makefile
@@ -6,9 +6,9 @@ test_blake2s_batched: test_blake2s_batched.cu blake2s.cu
 	nvcc -o test_blake2s_batched -I. -I../../../include test_blake2s_batched.cu -g
 	./test_blake2s_batched ./batched_test_vectors.csv
 	
-test_blake2s_hasher: test_blake2s_hasher.cu blake2s.cu 
-	nvcc -o test_blake2s_hasher -I. -I../../../include test_blake2s_hasher.cu -g
-	./test_blake2s_hasher
+test_blake2s_integ: test_blake2s_integ.cu blake2s.cu 
+	nvcc -o test_blake2s_integ -I. -I../../../include test_blake2s_integ.cu -g
+	./test_blake2s_integ
 
 test_blake2s_seq: test_blake2s_seq.cu blake2s.cu 
 	nvcc -o test_blake2s_seq -I. -I../../../include test_blake2s_seq.cu -g
@@ -23,4 +23,4 @@ test_blake2s_tree: test_tree.cu blake2s.cu ../../merkle-tree/merkle.cu
 	./test_blake2s_tree
 
 clear:
-	rm test_blake2s test_blake2s_tree test_blake2s_hasher test_blake2s_seq test_blake2s_seq_sa test_blake2s_batched
\ No newline at end of file
+	rm test_blake2s test_blake2s_tree test_blake2s_integ test_blake2s_seq test_blake2s_seq_sa test_blake2s_batched
\ No newline at end of file
diff --git a/icicle/src/hash/blake2s/blake2s.cu b/icicle/src/hash/blake2s/blake2s.cu
index e53ad03d0..9d1b1a44b 100644
--- a/icicle/src/hash/blake2s/blake2s.cu
+++ b/icicle/src/hash/blake2s/blake2s.cu
@@ -10,8 +10,7 @@ using namespace hash;
 
 namespace blake2s {
 
-
-typedef struct {
+  typedef struct {
     WORD digestlen;
     BYTE key[32];
     WORD keylen;
@@ -22,27 +21,24 @@ typedef struct {
     uint32_t t0;
     uint32_t t1;
     uint32_t f0;
-} cuda_blake2s_ctx_t;
+  } cuda_blake2s_ctx_t;
 
-typedef cuda_blake2s_ctx_t CUDA_BLAKE2S_CTX;
+  typedef cuda_blake2s_ctx_t CUDA_BLAKE2S_CTX;
 
-__constant__ CUDA_BLAKE2S_CTX c_CTX;
+  __constant__ CUDA_BLAKE2S_CTX c_CTX;
 
-__constant__ uint32_t BLAKE2S_IVS[8] = {
-    0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
-    0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
-};
+  __constant__ uint32_t BLAKE2S_IVS[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+                                          0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};
 
-const uint32_t CPU_BLAKE2S_IVS[8] = {
-    0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
-    0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
-};
+  const uint32_t CPU_BLAKE2S_IVS[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+                                       0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};
 
-void cpu_blake2s_init(cuda_blake2s_ctx_t *ctx, BYTE *key, WORD keylen, WORD digestbitlen) {
+  void cpu_blake2s_init(cuda_blake2s_ctx_t* ctx, BYTE* key, WORD keylen, WORD digestbitlen)
+  {
     memset(ctx, 0, sizeof(cuda_blake2s_ctx_t));
     if (keylen > 0) {
-        memcpy(ctx->buff, key, keylen);
-        memcpy(ctx->key, key, keylen);
+      memcpy(ctx->buff, key, keylen);
+      memcpy(ctx->key, key, keylen);
     }
     ctx->keylen = keylen;
     ctx->digestlen = digestbitlen >> 3;
@@ -60,32 +56,27 @@ void cpu_blake2s_init(cuda_blake2s_ctx_t *ctx, BYTE *key, WORD keylen, WORD dige
     ctx->chain[7] = CPU_BLAKE2S_IVS[7];
 
     ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0;
-}
-
-__constant__ uint8_t BLAKE2S_SIGMA[10][16] = {
-    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-    { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-    { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-    { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-    { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-    { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-    { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-    { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-    { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-    { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }
-};
-
-__device__ uint32_t cuda_blake2s_leuint32(const BYTE *in) {
+  }
+
+  __constant__ uint8_t BLAKE2S_SIGMA[10][16] = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+    {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+    {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+    {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+    {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}};
+
+  __device__ uint32_t cuda_blake2s_leuint32(const BYTE* in)
+  {
     uint32_t a;
     memcpy(&a, in, 4);
     return a;
-}
+  }
 
-__device__ uint32_t cuda_blake2s_ROTR32(uint32_t a, uint8_t b) {
-    return (a >> b) | (a << (32 - b));
-}
+  __device__ uint32_t cuda_blake2s_ROTR32(uint32_t a, uint8_t b) { return (a >> b) | (a << (32 - b)); }
 
-__device__ void cuda_blake2s_G(cuda_blake2s_ctx_t *ctx, uint32_t m1, uint32_t m2, int32_t a, int32_t b, int32_t c, int32_t d) {
+  __device__ void
+  cuda_blake2s_G(cuda_blake2s_ctx_t* ctx, uint32_t m1, uint32_t m2, int32_t a, int32_t b, int32_t c, int32_t d)
+  {
     ctx->state[a] = ctx->state[a] + ctx->state[b] + m1;
     ctx->state[d] = cuda_blake2s_ROTR32(ctx->state[d] ^ ctx->state[a], 16);
     ctx->state[c] = ctx->state[c] + ctx->state[d];
@@ -94,9 +85,10 @@ __device__ void cuda_blake2s_G(cuda_blake2s_ctx_t *ctx, uint32_t m1, uint32_t m2
     ctx->state[d] = cuda_blake2s_ROTR32(ctx->state[d] ^ ctx->state[a], 8);
     ctx->state[c] = ctx->state[c] + ctx->state[d];
     ctx->state[b] = cuda_blake2s_ROTR32(ctx->state[b] ^ ctx->state[c], 7);
-}
+  }
 
-__device__ __forceinline__ void cuda_blake2s_init_state(cuda_blake2s_ctx_t *ctx) {
+  __device__ __forceinline__ void cuda_blake2s_init_state(cuda_blake2s_ctx_t* ctx)
+  {
     memcpy(ctx->state, ctx->chain, BLAKE2S_CHAIN_LENGTH);
     // ctx->state[8] = ctx->t0;
     // ctx->state[9] = ctx->t1;
@@ -113,30 +105,32 @@ __device__ __forceinline__ void cuda_blake2s_init_state(cuda_blake2s_ctx_t *ctx)
     // ctx->state[12] = BLAKE2S_IVS[5];
     // ctx->state[13] = BLAKE2S_IVS[6];
     // ctx->state[14] = BLAKE2S_IVS[7];
-}
+  }
 
-__device__ __forceinline__ void cuda_blake2s_compress(cuda_blake2s_ctx_t *ctx, const BYTE *in, WORD inoffset) {
+  __device__ __forceinline__ void cuda_blake2s_compress(cuda_blake2s_ctx_t* ctx, const BYTE* in, WORD inoffset)
+  {
     cuda_blake2s_init_state(ctx);
-    uint32_t m[16] = { 0 };
+    uint32_t m[16] = {0};
     for (int j = 0; j < 16; j++)
-        m[j] = cuda_blake2s_leuint32(in + inoffset + (j << 2));
+      m[j] = cuda_blake2s_leuint32(in + inoffset + (j << 2));
 
     for (int round = 0; round < BLAKE2S_ROUNDS; round++) {
-        cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][0]], m[BLAKE2S_SIGMA[round][1]], 0, 4, 8, 12);
-        cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][2]], m[BLAKE2S_SIGMA[round][3]], 1, 5, 9, 13);
-        cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][4]], m[BLAKE2S_SIGMA[round][5]], 2, 6, 10, 14);
-        cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][6]], m[BLAKE2S_SIGMA[round][7]], 3, 7, 11, 15);
-        cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][8]], m[BLAKE2S_SIGMA[round][9]], 0, 5, 10, 15);
-        cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][10]], m[BLAKE2S_SIGMA[round][11]], 1, 6, 11, 12);
-        cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][12]], m[BLAKE2S_SIGMA[round][13]], 2, 7, 8, 13);
-        cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][14]], m[BLAKE2S_SIGMA[round][15]], 3, 4, 9, 14);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][0]], m[BLAKE2S_SIGMA[round][1]], 0, 4, 8, 12);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][2]], m[BLAKE2S_SIGMA[round][3]], 1, 5, 9, 13);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][4]], m[BLAKE2S_SIGMA[round][5]], 2, 6, 10, 14);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][6]], m[BLAKE2S_SIGMA[round][7]], 3, 7, 11, 15);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][8]], m[BLAKE2S_SIGMA[round][9]], 0, 5, 10, 15);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][10]], m[BLAKE2S_SIGMA[round][11]], 1, 6, 11, 12);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][12]], m[BLAKE2S_SIGMA[round][13]], 2, 7, 8, 13);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][14]], m[BLAKE2S_SIGMA[round][15]], 3, 4, 9, 14);
     }
 
     for (int offset = 0; offset < BLAKE2S_CHAIN_SIZE; offset++)
-        ctx->chain[offset] = ctx->chain[offset] ^ ctx->state[offset] ^ ctx->state[offset + 8];
-}
+      ctx->chain[offset] = ctx->chain[offset] ^ ctx->state[offset] ^ ctx->state[offset + 8];
+  }
 
-__device__ void cuda_blake2s_init(cuda_blake2s_ctx_t *ctx, BYTE *key, WORD keylen, WORD digestbitlen) {
+  __device__ void cuda_blake2s_init(cuda_blake2s_ctx_t* ctx, BYTE* key, WORD keylen, WORD digestbitlen)
+  {
     memset(ctx, 0, sizeof(cuda_blake2s_ctx_t));
     ctx->keylen = keylen;
     ctx->digestlen = digestbitlen >> 3;
@@ -152,57 +146,56 @@ __device__ void cuda_blake2s_init(cuda_blake2s_ctx_t *ctx, BYTE *key, WORD keyle
     ctx->chain[5] = BLAKE2S_IVS[5];
     ctx->chain[6] = BLAKE2S_IVS[6];
     ctx->chain[7] = BLAKE2S_IVS[7];
-    
+
     if (keylen > 0) {
-        memcpy(ctx->buff, key, keylen);
-        memcpy(ctx->key, key, keylen);
+      memcpy(ctx->buff, key, keylen);
+      memcpy(ctx->key, key, keylen);
     }
     ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0;
-}
+  }
 
-__device__ void cuda_blake2s_update(cuda_blake2s_ctx_t *ctx, const BYTE *in, LONG inlen) {
-    if (inlen == 0)
-        return;
+  __device__ void cuda_blake2s_update(cuda_blake2s_ctx_t* ctx, const BYTE* in, LONG inlen)
+  {
+    if (inlen == 0) return;
 
     WORD start = 0;
     int64_t in_index = 0, block_index = 0;
 
     if (ctx->pos) {
-        start = BLAKE2S_BLOCK_LENGTH - ctx->pos;
-        if (start < inlen) {
-            memcpy(ctx->buff + ctx->pos, in, start);
-            ctx->t0 += BLAKE2S_BLOCK_LENGTH;
-
-            if (ctx->t0 == 0) ctx->t1++;
-
-            cuda_blake2s_compress(ctx, ctx->buff, 0);
-            ctx->pos = 0;
-            memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH);
-        } else {
-            memcpy(ctx->buff + ctx->pos, in, inlen);
-            ctx->pos += inlen;
-            return;
-        }
+      start = BLAKE2S_BLOCK_LENGTH - ctx->pos;
+      if (start < inlen) {
+        memcpy(ctx->buff + ctx->pos, in, start);
+        ctx->t0 += BLAKE2S_BLOCK_LENGTH;
+
+        if (ctx->t0 == 0) ctx->t1++;
+
+        cuda_blake2s_compress(ctx, ctx->buff, 0);
+        ctx->pos = 0;
+        memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH);
+      } else {
+        memcpy(ctx->buff + ctx->pos, in, inlen);
+        ctx->pos += inlen;
+        return;
+      }
     }
 
     block_index = inlen - BLAKE2S_BLOCK_LENGTH;
     for (in_index = start; in_index < block_index; in_index += BLAKE2S_BLOCK_LENGTH) {
-        ctx->t0 += BLAKE2S_BLOCK_LENGTH;
-        if (ctx->t0 == 0)
-            ctx->t1++;
+      ctx->t0 += BLAKE2S_BLOCK_LENGTH;
+      if (ctx->t0 == 0) ctx->t1++;
 
-        cuda_blake2s_compress(ctx, in, in_index);
+      cuda_blake2s_compress(ctx, in, in_index);
     }
 
     memcpy(ctx->buff, in + in_index, inlen - in_index);
     ctx->pos += inlen - in_index;
-}
+  }
 
-__device__ void cuda_blake2s_final(cuda_blake2s_ctx_t *ctx, BYTE *out) {
+  __device__ void cuda_blake2s_final(cuda_blake2s_ctx_t* ctx, BYTE* out)
+  {
     ctx->f0 = 0xFFFFFFFFUL;
     ctx->t0 += ctx->pos;
-    if (ctx->pos > 0 && ctx->t0 == 0)
-        ctx->t1++;
+    if (ctx->pos > 0 && ctx->t0 == 0) ctx->t1++;
 
     cuda_blake2s_compress(ctx, ctx->buff, 0);
     memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH);
@@ -210,34 +203,36 @@ __device__ void cuda_blake2s_final(cuda_blake2s_ctx_t *ctx, BYTE *out) {
 
     int i4 = 0;
     for (int i = 0; i < BLAKE2S_CHAIN_SIZE && ((i4 = i * 4) < ctx->digestlen); i++) {
-        BYTE *BYTEs = (BYTE*)(&ctx->chain[i]);
-        if (i4 < ctx->digestlen - 4)
-            memcpy(out + i4, BYTEs, 4);
-        else
-            memcpy(out + i4, BYTEs, ctx->digestlen - i4);
+      BYTE* BYTEs = (BYTE*)(&ctx->chain[i]);
+      if (i4 < ctx->digestlen - 4)
+        memcpy(out + i4, BYTEs, 4);
+      else
+        memcpy(out + i4, BYTEs, ctx->digestlen - i4);
     }
-}
+  }
 
-__global__ void kernel_blake2s_hash(const BYTE *indata, WORD inlen, BYTE *outdata, WORD n_batch, WORD BLAKE2S_BLOCK_SIZE) {
+  __global__ void
+  kernel_blake2s_hash(const BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch, WORD BLAKE2S_BLOCK_SIZE)
+  {
     WORD thread = blockIdx.x * blockDim.x + threadIdx.x;
-    if (thread >= n_batch) {
-        return;
-    }
-    BYTE key[32] = "";  // Null key
+    if (thread >= n_batch) { return; }
+    BYTE key[32] = ""; // Null key
     WORD keylen = 0;
     CUDA_BLAKE2S_CTX blake_ctx;
-    const BYTE *in = indata + thread * inlen;
-    BYTE *out = outdata + thread * BLAKE2S_BLOCK_SIZE;
-    
+    const BYTE* in = indata + thread * inlen;
+    BYTE* out = outdata + thread * BLAKE2S_BLOCK_SIZE;
+
     cuda_blake2s_init(&blake_ctx, key, keylen, (BLAKE2S_BLOCK_SIZE << 3));
     cuda_blake2s_update(&blake_ctx, in, inlen);
     cuda_blake2s_final(&blake_ctx, out);
-}
-
-extern "C" {
-void mcm_cuda_blake2s_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, BYTE *out, WORD output_len, WORD n_batch) {
-    BYTE *cuda_indata;
-    BYTE *cuda_outdata;
+  }
+
+  extern "C" {
+  void
+  mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD output_len, WORD n_batch)
+  {
+    BYTE* cuda_indata;
+    BYTE* cuda_outdata;
     const WORD BLAKE2S_BLOCK_SIZE = output_len;
     cudaMalloc(&cuda_indata, inlen * n_batch);
     cudaMalloc(&cuda_outdata, BLAKE2S_BLOCK_SIZE * n_batch);
@@ -255,32 +250,29 @@ void mcm_cuda_blake2s_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, B
     cudaMemcpy(out, cuda_outdata, BLAKE2S_BLOCK_SIZE * n_batch, cudaMemcpyDeviceToHost);
     cudaDeviceSynchronize();
     cudaError_t error = cudaGetLastError();
-    if (error != cudaSuccess) {
-        printf("Error cuda blake2s hash: %s \n", cudaGetErrorString(error));
-    }
+    if (error != cudaSuccess) { printf("Error cuda blake2s hash: %s \n", cudaGetErrorString(error)); }
     cudaFree(cuda_indata);
     cudaFree(cuda_outdata);
-}
-}
+  }
+  }
 
-cudaError_t Blake2s::run_hash_many_kernel(
+  cudaError_t Blake2s::run_hash_many_kernel(
     const BYTE* input,
-    BYTE*       output,
-    WORD        number_of_states,
-    WORD        input_len,
-    WORD        output_len,
+    BYTE* output,
+    WORD number_of_states,
+    WORD input_len,
+    WORD output_len,
     const device_context::DeviceContext& ctx) const
-    {
-        const WORD BLAKE2S_BLOCK_SIZE = output_len;
-        WORD thread = 256;
-        WORD block = (number_of_states + thread - 1) / thread;
-
-        kernel_blake2s_hash<<<block, thread, 0, ctx.stream>>>(input, input_len, output, number_of_states, BLAKE2S_BLOCK_SIZE);
-
-        CHK_IF_RETURN(cudaPeekAtLastError());
-        return CHK_LAST();
+  {
+    const WORD BLAKE2S_BLOCK_SIZE = output_len;
+    WORD thread = 256;
+    WORD block = (number_of_states + thread - 1) / thread;
 
-    }
+    kernel_blake2s_hash<<<block, thread, 0, ctx.stream>>>(
+      input, input_len, output, number_of_states, BLAKE2S_BLOCK_SIZE);
 
+    CHK_IF_RETURN(cudaPeekAtLastError());
+    return CHK_LAST();
+  }
 
 } // namespace blake2s
\ No newline at end of file
diff --git a/icicle/src/hash/blake2s/extern.cu b/icicle/src/hash/blake2s/extern.cu
index 1e5abe1cd..eb9120f78 100644
--- a/icicle/src/hash/blake2s/extern.cu
+++ b/icicle/src/hash/blake2s/extern.cu
@@ -9,11 +9,10 @@
 #include "merkle-tree/merkle.cuh"
 
 namespace blake2s {
-  extern "C" cudaError_t
-  blake2s_cuda(BYTE * input, BYTE * output,  WORD number_of_blocks, WORD input_block_size, WORD output_block_size, HashConfig& config)
+  extern "C" cudaError_t blake2s_cuda(
+    BYTE* input, BYTE* output, WORD number_of_blocks, WORD input_block_size, WORD output_block_size, HashConfig& config)
   {
-    return Blake2s().hash_many(
-      input, output, number_of_blocks, input_block_size, output_block_size, config);
+    return Blake2s().hash_many(input, output, number_of_blocks, input_block_size, output_block_size, config);
   }
 
   extern "C" cudaError_t build_blake2s_merkle_tree_cuda(
@@ -28,6 +27,4 @@ namespace blake2s {
       leaves, digests, height, input_block_len, blake2s, blake2s, tree_config);
   }
 
-
-
 } // namespace blake2s
\ No newline at end of file
diff --git a/icicle/src/hash/blake2s/test_blake2s.cu b/icicle/src/hash/blake2s/test_blake2s.cu
index 2dc67b0e8..2c68e6d3f 100644
--- a/icicle/src/hash/blake2s/test_blake2s.cu
+++ b/icicle/src/hash/blake2s/test_blake2s.cu
@@ -9,101 +9,103 @@
 
 #include "hash/blake2s/blake2s.cuh"
 
-
 using namespace blake2s;
 
 #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) \
+#define END_TIMER(timer, msg)                                                                                          \
   printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
 
 extern "C" {
-void mcm_cuda_blake2s_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, BYTE *out, WORD outlen, WORD n_batch);
+void mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD outlen, WORD n_batch);
 }
 
-void print_hash(BYTE *hash, WORD len) {
-    printf("%d \n", len);
-    for (WORD i = 0; i < len; i++) {
-        printf("%02x", hash[i]);
-    }
-    printf("\n");
+void print_hash(BYTE* hash, WORD len)
+{
+  printf("%d \n", len);
+  for (WORD i = 0; i < len; i++) {
+    printf("%02x", hash[i]);
+  }
+  printf("\n");
 }
 
-BYTE *read_file(const char *filename, size_t *filesize) {
-    FILE *file = fopen(filename, "rb");
-    if (!file) {
-        perror("Failed to open file");
-        exit(EXIT_FAILURE);
-    }
-
-    fseek(file, 0, SEEK_END);
-    *filesize = ftell(file);
-    fseek(file, 0, SEEK_SET);
-
-    BYTE *buffer = (BYTE *)malloc(*filesize);
-    if (!buffer) {
-        perror("Failed to allocate memory");
-        fclose(file);
-        exit(EXIT_FAILURE);
-    }
-
-    size_t bytesRead = fread(buffer, 1, *filesize, file);
-    if (bytesRead != *filesize) {
-        perror("Failed to read file");
-        free(buffer);
-        fclose(file);
-        exit(EXIT_FAILURE);
-    }
+BYTE* read_file(const char* filename, size_t* filesize)
+{
+  FILE* file = fopen(filename, "rb");
+  if (!file) {
+    perror("Failed to open file");
+    exit(EXIT_FAILURE);
+  }
+
+  fseek(file, 0, SEEK_END);
+  *filesize = ftell(file);
+  fseek(file, 0, SEEK_SET);
+
+  BYTE* buffer = (BYTE*)malloc(*filesize);
+  if (!buffer) {
+    perror("Failed to allocate memory");
+    fclose(file);
+    exit(EXIT_FAILURE);
+  }
 
+  size_t bytesRead = fread(buffer, 1, *filesize, file);
+  if (bytesRead != *filesize) {
+    perror("Failed to read file");
+    free(buffer);
     fclose(file);
-    return buffer;
+    exit(EXIT_FAILURE);
+  }
+
+  fclose(file);
+  return buffer;
 }
 
-int main(int argc, char **argv) {
-    using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
-    using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
-
-    BYTE *input;
-    size_t inlen;
-    const char *input_filename;
-    const char *default_input = "aaaaaaaaaaa";
-
-    if (argc < 2) {
-        // Use default input if no file is provided
-        input = (BYTE *)default_input;
-        inlen = strlen(default_input);
-    } else {
-        input_filename = argv[1];
-        input = read_file(input_filename, &inlen);
-    }
-
-    // Test parameters
-    BYTE key[32] = "";  // Example key
-    WORD keylen = strlen((char *)key);
-    WORD n_outbit = 256;  // Output length in bits
-    WORD n_batch = 1;  // Number of hashes to compute in parallel
-
-    // Allocate memory for the output
-    WORD outlen = n_outbit / 8;
-    BYTE *output = (BYTE *)malloc(outlen * n_batch);
-    if (!output) {
-        perror("Failed to allocate memory for output");
-        if (argc >= 2) free(input);  // Free file buffer if it was allocated
-        return EXIT_FAILURE;
-    }
-
-    printf("Key len: %d \n", keylen);
-    
-    // Perform the hashing
-    START_TIMER(blake_timer)
-    mcm_cuda_blake2s_hash_batch(key, keylen, input, inlen, output, outlen, n_batch);
-    END_TIMER(blake_timer, "Blake Timer")
-    
-    // Print the result
-    printf("BLAKE2S hash:\n");
-    print_hash(output, outlen);
-
-    // Clean up
-    free(output);
-    if (argc >= 2) free(input);  // Free file buffer if it was allocated
-    return 0;
+int main(int argc, char** argv)
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  BYTE* input;
+  size_t inlen;
+  const char* input_filename;
+  const char* default_input = "aaaaaaaaaaa";
+
+  if (argc < 2) {
+    // Use default input if no file is provided
+    input = (BYTE*)default_input;
+    inlen = strlen(default_input);
+  } else {
+    input_filename = argv[1];
+    input = read_file(input_filename, &inlen);
+  }
+
+  // Test parameters
+  BYTE key[32] = ""; // Example key
+  WORD keylen = strlen((char*)key);
+  WORD n_outbit = 256; // Output length in bits
+  WORD n_batch = 1;    // Number of hashes to compute in parallel
+
+  // Allocate memory for the output
+  WORD outlen = n_outbit / 8;
+  BYTE* output = (BYTE*)malloc(outlen * n_batch);
+  if (!output) {
+    perror("Failed to allocate memory for output");
+    if (argc >= 2) free(input); // Free file buffer if it was allocated
+    return EXIT_FAILURE;
+  }
+
+  printf("Key len: %d \n", keylen);
+
+  // Perform the hashing
+  START_TIMER(blake_timer)
+  mcm_cuda_blake2s_hash_batch(key, keylen, input, inlen, output, outlen, n_batch);
+  END_TIMER(blake_timer, "Blake Timer")
+
+  // Print the result
+  printf("BLAKE2S hash:\n");
+  print_hash(output, outlen);
+
+  // Clean up
+  free(output);
+  if (argc >= 2) free(input); // Free file buffer if it was allocated
+  return 0;
 }
diff --git a/icicle/src/hash/blake2s/test_blake2s_batched.cu b/icicle/src/hash/blake2s/test_blake2s_batched.cu
index 8bef55049..405cd924c 100644
--- a/icicle/src/hash/blake2s/test_blake2s_batched.cu
+++ b/icicle/src/hash/blake2s/test_blake2s_batched.cu
@@ -14,139 +14,142 @@
 using namespace blake2s;
 
 #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) \
+#define END_TIMER(timer, msg)                                                                                          \
   printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
 
-
-void print_hash(BYTE *hash, WORD len) {
-    for (WORD i = 0; i < len; i++) {
-        printf("%02x", hash[i]);
-    }
-    printf("\n");
+void print_hash(BYTE* hash, WORD len)
+{
+  for (WORD i = 0; i < len; i++) {
+    printf("%02x", hash[i]);
+  }
+  printf("\n");
 }
 
 // Function to trim leading and trailing whitespace from a string
-std::string trim(const std::string& str) {
-    size_t first = str.find_first_not_of(' ');
-    size_t last = str.find_last_not_of(' ');
-    return str.substr(first, (last - first + 1));
+std::string trim(const std::string& str)
+{
+  size_t first = str.find_first_not_of(' ');
+  size_t last = str.find_last_not_of(' ');
+  return str.substr(first, (last - first + 1));
 }
 
-std::unordered_map<std::string, std::string> load_strings_and_hashes_from_csv(const char *filename) {
-    std::unordered_map<std::string, std::string> string_hash_map;
-    std::ifstream file(filename);
-    if (!file.is_open()) {
-        perror("Failed to open CSV file");
-        exit(EXIT_FAILURE);
+std::unordered_map<std::string, std::string> load_strings_and_hashes_from_csv(const char* filename)
+{
+  std::unordered_map<std::string, std::string> string_hash_map;
+  std::ifstream file(filename);
+  if (!file.is_open()) {
+    perror("Failed to open CSV file");
+    exit(EXIT_FAILURE);
+  }
+
+  std::string line;
+  while (std::getline(file, line)) {
+    std::stringstream ss(line);
+    std::string input_string, hash_value;
+    if (std::getline(ss, input_string, ',') && std::getline(ss, hash_value, ',')) {
+      // Trim any whitespace around the strings
+      input_string = trim(input_string);
+      hash_value = trim(hash_value);
+      string_hash_map[input_string] = hash_value;
     }
+  }
 
-    std::string line;
-    while (std::getline(file, line)) {
-        std::stringstream ss(line);
-        std::string input_string, hash_value;
-        if (std::getline(ss, input_string, ',') && std::getline(ss, hash_value, ',')) {
-            // Trim any whitespace around the strings
-            input_string = trim(input_string);
-            hash_value = trim(hash_value);
-            string_hash_map[input_string] = hash_value;
-        }
-    }
-
-    file.close();
-    return string_hash_map;
+  file.close();
+  return string_hash_map;
 }
 
-int main(int argc, char **argv) {
-    using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
-    using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
-
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <strings and hashes CSV file>\n", argv[0]);
-        return EXIT_FAILURE;
-    }
-
-    const char *csv_filename = argv[1];
-    auto string_hash_map = load_strings_and_hashes_from_csv(csv_filename);
-
-    if (string_hash_map.size() != 10) {
-        fprintf(stderr, "CSV file must contain exactly 10 strings and hashes.\n");
-        return EXIT_FAILURE;
-    }
-
-    // Prepare the test strings and expected hashes from the map
-    std::vector<std::string> test_strings;
-    std::vector<std::string> expected_hashes;
-    for (const auto& pair : string_hash_map) {
-        test_strings.push_back(pair.first);
-        expected_hashes.push_back(pair.second);
-    }
-
-    // Test parameters
-    BYTE key[32] = "";  // Example key
-    WORD keylen = strlen((char *)key);
-    WORD n_outbit = 256;  // Output length in bits
-    WORD n_batch = 10;  // Number of different inputs to hash in parallel
-    size_t max_len = 10;  // Max length of the test strings
-
-    // Calculate total input length and allocate memory for the batched input
-    size_t total_len = 0;
-    for (const auto& str : test_strings) {
-        total_len += str.size();
-    }
-    BYTE *batched_input = (BYTE *)malloc(total_len);
-    WORD *in_lengths = (WORD *)malloc(n_batch * sizeof(WORD));
-
-    // Copy test strings to batched input and store their lengths
-    BYTE *current_position = batched_input;
-    for (int i = 0; i < n_batch; ++i) {
-        memcpy(current_position, test_strings[i].c_str(), test_strings[i].size());
-        current_position += test_strings[i].size();
-    }
-
-    // Allocate memory for the output
-    WORD outlen = n_outbit / 8;
-    BYTE *output = (BYTE *)malloc(outlen * n_batch);
-    if (!output) {
-        perror("Failed to allocate memory for output");
-        free(batched_input);
-        free(in_lengths);
-        return EXIT_FAILURE;
+int main(int argc, char** argv)
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  if (argc < 2) {
+    fprintf(stderr, "Usage: %s <strings and hashes CSV file>\n", argv[0]);
+    return EXIT_FAILURE;
+  }
+
+  const char* csv_filename = argv[1];
+  auto string_hash_map = load_strings_and_hashes_from_csv(csv_filename);
+
+  if (string_hash_map.size() != 10) {
+    fprintf(stderr, "CSV file must contain exactly 10 strings and hashes.\n");
+    return EXIT_FAILURE;
+  }
+
+  // Prepare the test strings and expected hashes from the map
+  std::vector<std::string> test_strings;
+  std::vector<std::string> expected_hashes;
+  for (const auto& pair : string_hash_map) {
+    test_strings.push_back(pair.first);
+    expected_hashes.push_back(pair.second);
+  }
+
+  // Test parameters
+  BYTE key[32] = ""; // Example key
+  WORD keylen = strlen((char*)key);
+  WORD n_outbit = 256; // Output length in bits
+  WORD n_batch = 10;   // Number of different inputs to hash in parallel
+  size_t max_len = 10; // Max length of the test strings
+
+  // Calculate total input length and allocate memory for the batched input
+  size_t total_len = 0;
+  for (const auto& str : test_strings) {
+    total_len += str.size();
+  }
+  BYTE* batched_input = (BYTE*)malloc(total_len);
+  WORD* in_lengths = (WORD*)malloc(n_batch * sizeof(WORD));
+
+  // Copy test strings to batched input and store their lengths
+  BYTE* current_position = batched_input;
+  for (int i = 0; i < n_batch; ++i) {
+    memcpy(current_position, test_strings[i].c_str(), test_strings[i].size());
+    current_position += test_strings[i].size();
+  }
+
+  // Allocate memory for the output
+  WORD outlen = n_outbit / 8;
+  BYTE* output = (BYTE*)malloc(outlen * n_batch);
+  if (!output) {
+    perror("Failed to allocate memory for output");
+    free(batched_input);
+    free(in_lengths);
+    return EXIT_FAILURE;
+  }
+
+  printf("Key len: %d \n", keylen);
+  HashConfig config = default_hash_config();
+
+  // Perform the hashing
+  START_TIMER(blake_timer)
+  // mcm_cuda_blake2s_hash_batch(key, keylen, batched_input, max_len, output, outlen, n_batch);
+  blake2s_cuda(batched_input, output, n_batch, max_len, outlen, config);
+  END_TIMER(blake_timer, "Blake Timer")
+
+  // Print and compare the results
+  printf("BLAKE2S hash (batch size = %d):\n", n_batch);
+  for (WORD i = 0; i < n_batch; i++) {
+    printf("String: %s\n", test_strings[i].c_str());
+    printf("Computed Hash %d: ", i + 1);
+    print_hash(output + i * outlen, outlen);
+    std::cout << "Expected Hash " << i + 1 << ": " << expected_hashes[i] << std::endl;
+
+    std::string computed_hash;
+    for (WORD j = 0; j < outlen; ++j) {
+      char buffer[3];
+      snprintf(buffer, sizeof(buffer), "%02x", output[i * outlen + j]);
+      computed_hash += buffer;
     }
 
-    printf("Key len: %d \n", keylen);
-    HashConfig config = default_hash_config();
-
-    // Perform the hashing
-    START_TIMER(blake_timer)
-    // mcm_cuda_blake2s_hash_batch(key, keylen, batched_input, max_len, output, outlen, n_batch);
-    blake2s_cuda(batched_input, output, n_batch, max_len, outlen, config);
-    END_TIMER(blake_timer, "Blake Timer")
-    
-    // Print and compare the results
-    printf("BLAKE2S hash (batch size = %d):\n", n_batch);
-    for (WORD i = 0; i < n_batch; i++) {
-        printf("String: %s\n", test_strings[i].c_str());
-        printf("Computed Hash %d: ", i + 1);
-        print_hash(output + i * outlen, outlen);
-        std::cout << "Expected Hash " << i+1 <<": " <<expected_hashes[i] << std::endl;
-
-        std::string computed_hash;
-        for (WORD j = 0; j < outlen; ++j) {
-            char buffer[3];
-            snprintf(buffer, sizeof(buffer), "%02x", output[i * outlen + j]);
-            computed_hash += buffer;
-        }
-
-        if (computed_hash == expected_hashes[i]) {
-            printf(" (Match)\n");
-        } else {
-            printf(" (Mismatch)\n");
-        }
+    if (computed_hash == expected_hashes[i]) {
+      printf(" (Match)\n");
+    } else {
+      printf(" (Mismatch)\n");
     }
+  }
 
-    // Clean up
-    free(output);
-    free(batched_input);
-    free(in_lengths);
-    return 0;
+  // Clean up
+  free(output);
+  free(batched_input);
+  free(in_lengths);
+  return 0;
 }
diff --git a/icicle/src/hash/blake2s/test_blake2s_hasher.cu b/icicle/src/hash/blake2s/test_blake2s_hasher.cu
deleted file mode 100644
index a1f497b9f..000000000
--- a/icicle/src/hash/blake2s/test_blake2s_hasher.cu
+++ /dev/null
@@ -1,110 +0,0 @@
-#include <chrono>
-#include "gpu-utils/device_context.cuh"
-
-#include <cassert>
-#include <chrono>
-#include <fstream>
-#include <iostream>
-#include <iomanip>
-#include "extern.cu"
-
-
-using namespace blake2s;
-
-#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) \
-  printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
-
-extern "C" {
-void mcm_cuda_blake2s_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, BYTE *out, WORD n_outbit, WORD n_batch);
-}
-
-void print_hash(BYTE *hash, WORD len) {
-    printf("Hash Len: %d \n", len);
-    printf("BLAKE2S hash:\n");
-    for (WORD i = 0; i < len; i++) {
-        printf("%02x", hash[i]);
-    }
-    printf("\n");
-}
-
-BYTE *read_file(const char *filename, size_t *filesize) {
-    FILE *file = fopen(filename, "rb");
-    if (!file) {
-        perror("Failed to open file");
-        exit(EXIT_FAILURE);
-    }
-
-    fseek(file, 0, SEEK_END);
-    *filesize = ftell(file);
-    fseek(file, 0, SEEK_SET);
-
-    BYTE *buffer = (BYTE *)malloc(*filesize);
-    if (!buffer) {
-        perror("Failed to allocate memory");
-        fclose(file);
-        exit(EXIT_FAILURE);
-    }
-
-    size_t bytesRead = fread(buffer, 1, *filesize, file);
-    if (bytesRead != *filesize) {
-        perror("Failed to read file");
-        free(buffer);
-        fclose(file);
-        exit(EXIT_FAILURE);
-    }
-
-    fclose(file);
-    return buffer;
-}
-
-int main(int argc, char **argv) {
-    using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
-    using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
-
-    BYTE *input;
-    size_t inlen;
-    const char *input_filename;
-    const char *default_input = "aaaaaaaaaaa";
-
-    if (argc < 2) {
-        // Use default input if no file is provided
-        input = (BYTE *)default_input;
-        inlen = strlen(default_input);
-    } else {
-        input_filename = argv[1];
-        input = read_file(input_filename, &inlen);
-    }
-
-    // Test parameters
-    BYTE key[32] = "";  // Example key
-    WORD keylen = strlen((char *)key);
-    WORD n_outbit = 256;  // Output length in bits
-    WORD n_batch = 1;  // Number of hashes to compute in parallel
-    // Allocate memory for the output
-    WORD outlen = n_outbit / 8;
-    BYTE *output = (BYTE *)malloc(outlen * n_batch);
-    if (!output) {
-        perror("Failed to allocate memory for output");
-        if (argc >= 2) free(input);  // Free file buffer if it was allocated
-        return EXIT_FAILURE;
-    }
-
-    printf("Key len: %d \n", keylen);
-    
-    // Perform the hashing
-    START_TIMER(blake_timer)
-    HashConfig config = default_hash_config();
-
-    blake2s_cuda(input, output, n_batch, inlen, outlen, config);
-    END_TIMER(blake_timer, "Blake Timer")
-    
-    // Print the result
-    print_hash(output, outlen);
-
-    // Clean up
-    free(output);
-    if (argc >= 2) free(input);  // Free file buffer if it was allocated
-    return 0;
-}
-
diff --git a/icicle/src/hash/blake2s/test_blake2s_integ.cu b/icicle/src/hash/blake2s/test_blake2s_integ.cu
new file mode 100644
index 000000000..2991333fa
--- /dev/null
+++ b/icicle/src/hash/blake2s/test_blake2s_integ.cu
@@ -0,0 +1,111 @@
+#include <chrono>
+#include "gpu-utils/device_context.cuh"
+
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+#include "extern.cu"
+
+using namespace blake2s;
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+extern "C" {
+void mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD n_outbit, WORD n_batch);
+}
+
+void print_hash(BYTE* hash, WORD len)
+{
+  printf("Hash Len: %d \n", len);
+  printf("BLAKE2S hash:\n");
+  for (WORD i = 0; i < len; i++) {
+    printf("%02x", hash[i]);
+  }
+  printf("\n");
+}
+
+BYTE* read_file(const char* filename, size_t* filesize)
+{
+  FILE* file = fopen(filename, "rb");
+  if (!file) {
+    perror("Failed to open file");
+    exit(EXIT_FAILURE);
+  }
+
+  fseek(file, 0, SEEK_END);
+  *filesize = ftell(file);
+  fseek(file, 0, SEEK_SET);
+
+  BYTE* buffer = (BYTE*)malloc(*filesize);
+  if (!buffer) {
+    perror("Failed to allocate memory");
+    fclose(file);
+    exit(EXIT_FAILURE);
+  }
+
+  size_t bytesRead = fread(buffer, 1, *filesize, file);
+  if (bytesRead != *filesize) {
+    perror("Failed to read file");
+    free(buffer);
+    fclose(file);
+    exit(EXIT_FAILURE);
+  }
+
+  fclose(file);
+  return buffer;
+}
+
+int main(int argc, char** argv)
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  BYTE* input;
+  size_t inlen;
+  const char* input_filename;
+  const char* default_input = "aaaaaaaaaaa";
+
+  if (argc < 2) {
+    // Use default input if no file is provided
+    input = (BYTE*)default_input;
+    inlen = strlen(default_input);
+  } else {
+    input_filename = argv[1];
+    input = read_file(input_filename, &inlen);
+  }
+
+  // Test parameters
+  BYTE key[32] = ""; // Example key
+  WORD keylen = strlen((char*)key);
+  WORD n_outbit = 256; // Output length in bits
+  WORD n_batch = 1;    // Number of hashes to compute in parallel
+  // Allocate memory for the output
+  WORD outlen = n_outbit / 8;
+  BYTE* output = (BYTE*)malloc(outlen * n_batch);
+  if (!output) {
+    perror("Failed to allocate memory for output");
+    if (argc >= 2) free(input); // Free file buffer if it was allocated
+    return EXIT_FAILURE;
+  }
+
+  printf("Key len: %d \n", keylen);
+
+  // Perform the hashing
+  START_TIMER(blake_timer)
+  HashConfig config = default_hash_config();
+
+  blake2s_cuda(input, output, n_batch, inlen, outlen, config);
+  END_TIMER(blake_timer, "Blake Timer")
+
+  // Print the result
+  print_hash(output, outlen);
+
+  // Clean up
+  free(output);
+  if (argc >= 2) free(input); // Free file buffer if it was allocated
+  return 0;
+}
diff --git a/icicle/src/hash/blake2s/test_blake2s_seq.cu b/icicle/src/hash/blake2s/test_blake2s_seq.cu
index 8e4af10f6..1578d8d59 100644
--- a/icicle/src/hash/blake2s/test_blake2s_seq.cu
+++ b/icicle/src/hash/blake2s/test_blake2s_seq.cu
@@ -8,106 +8,97 @@
 #include <iomanip>
 #include "extern.cu"
 
-
 using namespace blake2s;
 
 #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) \
+#define END_TIMER(timer, msg)                                                                                          \
   printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
 
-
-void print_hash(BYTE *hash, WORD len) {
-    printf("Hash Len: %d \n", len);
-    printf("BLAKE2S hash:\n");
-    for (WORD i = 0; i < len; i++) {
-        printf("%02x", hash[i]);
-    }
-    printf("\n");
+void print_hash(BYTE* hash, WORD len)
+{
+  printf("Hash Len: %d \n", len);
+  printf("BLAKE2S hash:\n");
+  for (WORD i = 0; i < len; i++) {
+    printf("%02x", hash[i]);
+  }
+  printf("\n");
 }
 
-std::string byte_to_hex(BYTE *data, WORD len) {
-    std::stringstream ss;
-    for (WORD i = 0; i < len; i++) {
-        ss << std::hex << std::setw(2) << std::setfill('0') << (int)data[i];
-    }
-    return ss.str();
+std::string byte_to_hex(BYTE* data, WORD len)
+{
+  std::stringstream ss;
+  for (WORD i = 0; i < len; i++) {
+    ss << std::hex << std::setw(2) << std::setfill('0') << (int)data[i];
+  }
+  return ss.str();
 }
 
-std::vector<std::string> load_csv(const char *filename) {
-    std::vector<std::string> hashes;
-    std::ifstream file(filename);
-    std::string line;
-    while (std::getline(file, line)) {
-        // Directly add the line as a hash, assuming one hash per line
-        hashes.push_back(line);
-    }
-    return hashes;
+std::vector<std::string> load_csv(const char* filename)
+{
+  std::vector<std::string> hashes;
+  std::ifstream file(filename);
+  std::string line;
+  while (std::getline(file, line)) {
+    // Directly add the line as a hash, assuming one hash per line
+    hashes.push_back(line);
+  }
+  return hashes;
 }
 
+int main(int argc, char** argv)
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  std::vector<std::string> test_strings = {"0",      "01",      "012",      "0123",      "01234",
+                                           "012345", "0123456", "01234567", "012345678", "0123456789"};
+
+  const char* csv_filename = "expected_hashes.csv"; // Replace with your actual CSV file name
+  std::vector<std::string> expected_hashes = load_csv(csv_filename);
+  assert(expected_hashes.size() == test_strings.size() && "Number of hashes in CSV must match number of test strings.");
+  std::cout << "Loaded hashes from CSV:" << std::endl;
+  // for (size_t i = 0; i < expected_hashes.size(); ++i) {
+  //     std::cout << "Expected hash " << i  << ": " << expected_hashes[i] << std::endl;
+  // }
+
+  // Test parameters
+  WORD n_outbit = 256; // Output length in bits
+  WORD n_batch = 1;    // Number of hashes to compute in parallel
+
+  // Allocate memory for the output
+  WORD outlen = n_outbit / 8;
+
+  // Perform the hashing
+  HashConfig config = default_hash_config();
+
+  for (size_t i = 0; i < test_strings.size(); i++) {
+    BYTE* output = (BYTE*)malloc(outlen * n_batch);
+    if (!output) {
+      perror("Failed to allocate memory for output");
+      return EXIT_FAILURE;
+    }
 
-
-
-int main(int argc, char **argv) {
-    using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
-    using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
-
-    std::vector<std::string> test_strings = {
-        "0", "01", "012", "0123", "01234", "012345", "0123456", "01234567", "012345678", "0123456789"
-    };
-
-    const char *csv_filename = "expected_hashes.csv";  // Replace with your actual CSV file name
-    std::vector<std::string> expected_hashes = load_csv(csv_filename);
-    assert(expected_hashes.size() == test_strings.size() && "Number of hashes in CSV must match number of test strings.");
-    std::cout << "Loaded hashes from CSV:" << std::endl;
-    // for (size_t i = 0; i < expected_hashes.size(); ++i) {
-    //     std::cout << "Expected hash " << i  << ": " << expected_hashes[i] << std::endl;
-    // }
-
-    // Test parameters
-    WORD n_outbit = 256;  // Output length in bits
-    WORD n_batch = 1;  // Number of hashes to compute in parallel
-
-
-
-    // Allocate memory for the output
-    WORD outlen = n_outbit / 8;
+    const std::string& input_str = test_strings[i];
+    BYTE* input = (BYTE*)input_str.c_str();
+    size_t inlen = input_str.size();
 
     // Perform the hashing
-    HashConfig config = default_hash_config();
-
-    for (size_t i = 0; i < test_strings.size(); i++) {
-
-        BYTE *output = (BYTE *)malloc(outlen * n_batch);
-        if (!output) {
-            perror("Failed to allocate memory for output");
-            return EXIT_FAILURE;
-        }
-
-        const std::string &input_str = test_strings[i];
-        BYTE *input = (BYTE *)input_str.c_str();
-        size_t inlen = input_str.size();
-
-
-        // Perform the hashing
-        START_TIMER(blake_timer)
-        blake2s_cuda(input, output, n_batch, inlen, outlen, config);
-        END_TIMER(blake_timer, "Blake Timer")
-        // Convert the output to hex string
-        std::string computed_hash = byte_to_hex(output, outlen);
-        // Compare with the expected hash
-
-        
-        if (computed_hash == expected_hashes[i]) {
-            std::cout << "Test " << i  << " passed." << std::endl;
-        } else {
-            std::cout << "Test " << i << " failed." << std::endl;    
-            std::cout << "Expected: " << expected_hashes[i] << std::endl;
-            std::cout << "Got:      " << computed_hash << std::endl;
-        }
-    free(output);
+    START_TIMER(blake_timer)
+    blake2s_cuda(input, output, n_batch, inlen, outlen, config);
+    END_TIMER(blake_timer, "Blake Timer")
+    // Convert the output to hex string
+    std::string computed_hash = byte_to_hex(output, outlen);
+    // Compare with the expected hash
+
+    if (computed_hash == expected_hashes[i]) {
+      std::cout << "Test " << i << " passed." << std::endl;
+    } else {
+      std::cout << "Test " << i << " failed." << std::endl;
+      std::cout << "Expected: " << expected_hashes[i] << std::endl;
+      std::cout << "Got:      " << computed_hash << std::endl;
     }
-    
-    return 0;
-    
-}
+    free(output);
+  }
 
+  return 0;
+}
diff --git a/icicle/src/hash/blake2s/test_blake2s_seq_sa.cu b/icicle/src/hash/blake2s/test_blake2s_seq_sa.cu
index 38dff5565..cf8177a30 100644
--- a/icicle/src/hash/blake2s/test_blake2s_seq_sa.cu
+++ b/icicle/src/hash/blake2s/test_blake2s_seq_sa.cu
@@ -12,108 +12,102 @@
 using namespace blake2s;
 
 #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) \
+#define END_TIMER(timer, msg)                                                                                          \
   printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
 
 extern "C" {
-void mcm_cuda_blake2s_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, BYTE *out, WORD n_outbit, WORD n_batch);
+void mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD n_outbit, WORD n_batch);
 }
 
-void print_hash(BYTE *hash, WORD len) {
-    printf("Hash Len: %d \n", len);
-    printf("BLAKE2S hash:\n");
-    for (WORD i = 0; i < len; i++) {
-        printf("%02x", hash[i]);
-    }
-    printf("\n");
+void print_hash(BYTE* hash, WORD len)
+{
+  printf("Hash Len: %d \n", len);
+  printf("BLAKE2S hash:\n");
+  for (WORD i = 0; i < len; i++) {
+    printf("%02x", hash[i]);
+  }
+  printf("\n");
 }
 
-std::string byte_to_hex(BYTE *data, WORD len) {
-    std::stringstream ss;
-    for (WORD i = 0; i < len; i++) {
-        ss << std::hex << std::setw(2) << std::setfill('0') << (int)data[i];
-    }
-    return ss.str();
+std::string byte_to_hex(BYTE* data, WORD len)
+{
+  std::stringstream ss;
+  for (WORD i = 0; i < len; i++) {
+    ss << std::hex << std::setw(2) << std::setfill('0') << (int)data[i];
+  }
+  return ss.str();
 }
 
-std::vector<std::string> load_csv(const char *filename) {
-    std::vector<std::string> hashes;
-    std::ifstream file(filename);
-    std::string line;
-    while (std::getline(file, line)) {
-        // Directly add the line as a hash, assuming one hash per line
-        hashes.push_back(line);
-    }
-    return hashes;
+std::vector<std::string> load_csv(const char* filename)
+{
+  std::vector<std::string> hashes;
+  std::ifstream file(filename);
+  std::string line;
+  while (std::getline(file, line)) {
+    // Directly add the line as a hash, assuming one hash per line
+    hashes.push_back(line);
+  }
+  return hashes;
 }
 
+int main(int argc, char** argv)
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  std::vector<std::string> test_strings = {"0",      "01",      "012",      "0123",      "01234",
+                                           "012345", "0123456", "01234567", "012345678", "0123456789"};
+
+  const char* csv_filename = "expected_hashes.csv"; // Replace with your actual CSV file name
+  std::vector<std::string> expected_hashes = load_csv(csv_filename);
+  assert(expected_hashes.size() == test_strings.size() && "Number of hashes in CSV must match number of test strings.");
+  std::cout << "Loaded hashes from CSV:" << std::endl;
+  // for (size_t i = 0; i < expected_hashes.size(); ++i) {
+  //     std::cout << "Expected hash " << i  << ": " << expected_hashes[i] << std::endl;
+  // }
+
+  // Test parameters
+  WORD n_outbit = 256; // Output length in bits
+  WORD n_batch = 1;    // Number of hashes to compute in parallel
+
+  // Test parameters
+  BYTE key[32] = ""; // Example key
+  WORD keylen = strlen((char*)key);
+
+  // Allocate memory for the output
+  WORD outlen = n_outbit / 8;
+
+  // Perform the hashing
+  HashConfig config = default_hash_config();
+
+  for (size_t i = 0; i < test_strings.size(); i++) {
+    BYTE* output = (BYTE*)malloc(outlen * n_batch);
+    if (!output) {
+      perror("Failed to allocate memory for output");
+      return EXIT_FAILURE;
+    }
 
-
-
-int main(int argc, char **argv) {
-    using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
-    using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
-
-    std::vector<std::string> test_strings = {
-        "0", "01", "012", "0123", "01234", "012345", "0123456", "01234567", "012345678", "0123456789"
-    };
-
-    const char *csv_filename = "expected_hashes.csv";  // Replace with your actual CSV file name
-    std::vector<std::string> expected_hashes = load_csv(csv_filename);
-    assert(expected_hashes.size() == test_strings.size() && "Number of hashes in CSV must match number of test strings.");
-    std::cout << "Loaded hashes from CSV:" << std::endl;
-    // for (size_t i = 0; i < expected_hashes.size(); ++i) {
-    //     std::cout << "Expected hash " << i  << ": " << expected_hashes[i] << std::endl;
-    // }
-
-    // Test parameters
-    WORD n_outbit = 256;  // Output length in bits
-    WORD n_batch = 1;  // Number of hashes to compute in parallel
-
-    // Test parameters
-    BYTE key[32] = "";  // Example key
-    WORD keylen = strlen((char *)key);
-
-    // Allocate memory for the output
-    WORD outlen = n_outbit / 8;
+    const std::string& input_str = test_strings[i];
+    BYTE* input = (BYTE*)input_str.c_str();
+    size_t inlen = input_str.size();
 
     // Perform the hashing
-    HashConfig config = default_hash_config();
-
-    for (size_t i = 0; i < test_strings.size(); i++) {
-
-        BYTE *output = (BYTE *)malloc(outlen * n_batch);
-        if (!output) {
-            perror("Failed to allocate memory for output");
-            return EXIT_FAILURE;
-        }
-
-        const std::string &input_str = test_strings[i];
-        BYTE *input = (BYTE *)input_str.c_str();
-        size_t inlen = input_str.size();
-
-
-
-        // Perform the hashing
-        START_TIMER(blake_timer)
-        mcm_cuda_blake2s_hash_batch(key, keylen, input, inlen, output, outlen, n_batch);
-        END_TIMER(blake_timer, "Blake Timer")
-        // Convert the output to hex string
-        std::string computed_hash = byte_to_hex(output, outlen);
-        // Compare with the expected hash
-
-        
-        if (computed_hash == expected_hashes[i]) {
-            std::cout << "Test " << i  << " passed." << std::endl;
-        } else {
-            std::cout << "Test " << i << " failed." << std::endl;    
-            std::cout << "Expected: " << expected_hashes[i] << std::endl;
-            std::cout << "Got:      " << computed_hash << std::endl;
-        }
-    free(output);
+    START_TIMER(blake_timer)
+    mcm_cuda_blake2s_hash_batch(key, keylen, input, inlen, output, outlen, n_batch);
+    END_TIMER(blake_timer, "Blake Timer")
+    // Convert the output to hex string
+    std::string computed_hash = byte_to_hex(output, outlen);
+    // Compare with the expected hash
+
+    if (computed_hash == expected_hashes[i]) {
+      std::cout << "Test " << i << " passed." << std::endl;
+    } else {
+      std::cout << "Test " << i << " failed." << std::endl;
+      std::cout << "Expected: " << expected_hashes[i] << std::endl;
+      std::cout << "Got:      " << computed_hash << std::endl;
     }
-    
-    return 0;
-    
-}
+    free(output);
+  }
 
+  return 0;
+}
diff --git a/icicle/src/hash/blake2s/test_tree.cu b/icicle/src/hash/blake2s/test_tree.cu
index b111ae3b5..85145ccf0 100644
--- a/icicle/src/hash/blake2s/test_tree.cu
+++ b/icicle/src/hash/blake2s/test_tree.cu
@@ -11,7 +11,6 @@
 
 using namespace blake2s;
 
-
 #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
 #define END_TIMER(timer, msg)                                                                                          \
   printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
@@ -56,7 +55,7 @@ int main(int argc, char* argv[])
 
   /// Allocate memory for digests of {keep_rows} rows of a tree
   START_TIMER(timer_digests);
-  size_t digests_mem = digests_len * sizeof(BYTE)*64;
+  size_t digests_mem = digests_len * sizeof(BYTE) * 64;
   BYTE* digests = static_cast<BYTE*>(malloc(digests_mem));
   END_TIMER(timer_digests, "Allocated memory for digests");
 
@@ -81,15 +80,13 @@ int main(int argc, char* argv[])
 
   for (int i = 0; i < digests_len * 32; i++) {
     WORD root = digests[i];
-    
+
     // Print the current element in hexadecimal format
     printf("%02x", root);
 
     // After every 32 elements, print a newline to start a new row
-    if ((i + 1) % 32 == 0) {
-        printf("\n");
-    }
-}
+    if ((i + 1) % 32 == 0) { printf("\n"); }
+  }
 
   free(digests);
   free(leaves);