From ed5ec2813a574224983ec3880af346f9cbbadcbd Mon Sep 17 00:00:00 2001
From: AztecBot
Date: Sun, 1 Dec 2024 08:08:10 +0000
Subject: [PATCH] [1 changes] feat(ssa): Option to set the maximum acceptable
 Brillig bytecode increase in unrolling
 (https://github.com/noir-lang/noir/pull/6641)

feat: Sync from aztec-packages (https://github.com/noir-lang/noir/pull/6656)
chore: refactor poseidon2 (https://github.com/noir-lang/noir/pull/6655)
fix: correct types returned by constant EC operations simplified within SSA (https://github.com/noir-lang/noir/pull/6652)
feat: Sync from aztec-packages (https://github.com/noir-lang/noir/pull/6634)
fix: used signed division for signed modulo (https://github.com/noir-lang/noir/pull/6635)
fix(ssa): don't deduplicate constraints in blocks that are not dominated (https://github.com/noir-lang/noir/pull/6627)
chore: pin foundry version in CI (https://github.com/noir-lang/noir/pull/6642)
feat(ssa): Deduplicate intrinsics with predicates (https://github.com/noir-lang/noir/pull/6615)
chore: improve error message of `&T` (https://github.com/noir-lang/noir/pull/6633)
fix: LSP code action wasn't triggering on beginning or end of identifier (https://github.com/noir-lang/noir/pull/6616)
chore!: remove `ec` module from stdlib (https://github.com/noir-lang/noir/pull/6612)
fix(LSP): use generic self type to narrow down methods to complete (https://github.com/noir-lang/noir/pull/6617)
fix!: Disallow `#[export]` on associated methods (https://github.com/noir-lang/noir/pull/6626)
chore: redo typo PR by donatik27 (https://github.com/noir-lang/noir/pull/6575)
chore: redo typo PR by Dimitrolito (https://github.com/noir-lang/noir/pull/6614)
feat: simplify `jmpif`s by reversing branches if condition is negated (https://github.com/noir-lang/noir/pull/5891)
fix: Do not warn on unused functions marked with #[export] (https://github.com/noir-lang/noir/pull/6625)
chore: Add panic for compiler error described in #6620 (https://github.com/noir-lang/noir/pull/6621)
feat(perf): Track last loads per block in mem2reg and remove them if possible (https://github.com/noir-lang/noir/pull/6088)
fix(ssa): Track all local allocations during flattening (https://github.com/noir-lang/noir/pull/6619)
feat(comptime): Implement blackbox functions in comptime interpreter (https://github.com/noir-lang/noir/pull/6551)
chore: derive PartialEq and Hash for FieldElement (https://github.com/noir-lang/noir/pull/6610)
chore: ignore almost-empty directories in nargo_cli tests (https://github.com/noir-lang/noir/pull/6611)
chore: remove temporary allocations from `num_bits` (https://github.com/noir-lang/noir/pull/6600)
chore: Release Noir(1.0.0-beta.0) (https://github.com/noir-lang/noir/pull/6562)
feat: Add `array_refcount` and `slice_refcount` builtins for debugging (https://github.com/noir-lang/noir/pull/6584)
chore!: Require types of globals to be specified (https://github.com/noir-lang/noir/pull/6592)
fix: don't report visibility errors when elaborating comptime value (https://github.com/noir-lang/noir/pull/6498)
fix: preserve newlines between comments when formatting statements (https://github.com/noir-lang/noir/pull/6601)
fix: parse a bit more SSA stuff (https://github.com/noir-lang/noir/pull/6599)
chore!: remove eddsa from stdlib (https://github.com/noir-lang/noir/pull/6591)
chore: Typo in oracles how to (https://github.com/noir-lang/noir/pull/6598)
feat(ssa): Loop invariant code motion (https://github.com/noir-lang/noir/pull/6563)
fix: remove `compiler_version` from new `Nargo.toml` (https://github.com/noir-lang/noir/pull/6590)
feat: Avoid incrementing reference counts in some cases (https://github.com/noir-lang/noir/pull/6568)
chore: fix typo in test name (https://github.com/noir-lang/noir/pull/6589)
fix: consider prereleases to be compatible with pre-1.0.0 releases (https://github.com/noir-lang/noir/pull/6580)
feat: try to inline brillig calls with all constant arguments (https://github.com/noir-lang/noir/pull/6548)
fix: correct type when simplifying `derive_pedersen_generators` (https://github.com/noir-lang/noir/pull/6579)
feat: Sync from aztec-packages (https://github.com/noir-lang/noir/pull/6576)
---
 .noir-sync-commit                              |   2 +-
 noir/noir-repo/Cargo.lock                      |   1 +
 noir/noir-repo/acvm-repo/acvm_js/build.sh      |   2 +-
 .../compiler/integration-tests/package.json    |   2 +-
 .../compiler/noirc_driver/src/lib.rs           |  11 +-
 .../compiler/noirc_evaluator/Cargo.toml        |   1 +
 .../noirc_evaluator/src/brillig/mod.rs         |   8 +-
 .../compiler/noirc_evaluator/src/ssa.rs        |  19 +-
 .../noirc_evaluator/src/ssa/ir/function.rs     |   6 +
 .../noirc_evaluator/src/ssa/ir/instruction.rs  |  15 +-
 .../src/ssa/opt/constant_folding.rs            | 445 ++++++++----
 .../src/ssa/opt/flatten_cfg.rs                 | 556 ++++++++--------
 .../noirc_evaluator/src/ssa/opt/inlining.rs    |   1 -
 .../noirc_evaluator/src/ssa/opt/mem2reg.rs     | 332 ++++++---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs   | 137 ++++-
 .../src/ssa/ssa_gen/context.rs                 |  14 +-
 .../noirc_evaluator/src/ssa/ssa_gen/mod.rs     |   5 +-
 noir/noir-repo/noir_stdlib/src/hash/poseidon2.nr |  6 +-
 .../Nargo.toml                                 |   7 +
 .../src/main.nr                                |  15 +
 noir/noir-repo/tooling/nargo_cli/build.rs      |   5 +
 noir/noir-repo/tooling/noirc_abi_wasm/build.sh |   2 +-
 noir/noir-repo/yarn.lock                       |  14 +-
 23 files changed, 1041 insertions(+), 565 deletions(-)
 create mode 100644 noir/noir-repo/test_programs/execution_success/inline_decompose_hint_brillig_call/Nargo.toml
 create mode 100644 noir/noir-repo/test_programs/execution_success/inline_decompose_hint_brillig_call/src/main.nr

diff --git a/.noir-sync-commit b/.noir-sync-commit
index 9bbde85e56b..93c17670eeb 100644
--- a/.noir-sync-commit
+++ b/.noir-sync-commit
@@ -1 +1 @@
-68c32b4ffd9b069fe4b119327dbf4018c17ab9d4
+4ff308128755c95b4d461bbcb7e3a49f16145585
diff --git a/noir/noir-repo/Cargo.lock b/noir/noir-repo/Cargo.lock
index 94a84b89d05..af91bafef52 100644
--- a/noir/noir-repo/Cargo.lock
+++ b/noir/noir-repo/Cargo.lock
@@ -3151,6 +3151,7 @@ dependencies = [
  "serde_json",
  "serde_with",
  "similar-asserts",
+ "test-case",
  "thiserror",
  "tracing",
 ]
diff --git a/noir/noir-repo/acvm-repo/acvm_js/build.sh b/noir/noir-repo/acvm-repo/acvm_js/build.sh
index c07d2d8a4c1..16fb26e55db 100755
--- a/noir/noir-repo/acvm-repo/acvm_js/build.sh
+++ b/noir/noir-repo/acvm-repo/acvm_js/build.sh
@@ -25,7 +25,7 @@ function run_if_available {
 require_command jq
 require_command cargo
 require_command wasm-bindgen
-#require_command wasm-opt
+require_command wasm-opt

 self_path=$(dirname "$(readlink -f "$0")")
 pname=$(cargo read-manifest | jq -r '.name')
diff --git a/noir/noir-repo/compiler/integration-tests/package.json b/noir/noir-repo/compiler/integration-tests/package.json
index e33179f31e7..a9d437da792 100644
--- a/noir/noir-repo/compiler/integration-tests/package.json
+++ b/noir/noir-repo/compiler/integration-tests/package.json
@@ -13,7 +13,7 @@
     "lint": "NODE_NO_WARNINGS=1 eslint . --ext .ts --ignore-path ./.eslintignore --max-warnings 0"
   },
   "dependencies": {
-    "@aztec/bb.js": "portal:../../../../barretenberg/ts",
+    "@aztec/bb.js": "0.63.1",
     "@noir-lang/noir_js": "workspace:*",
     "@noir-lang/noir_wasm": "workspace:*",
     "@nomicfoundation/hardhat-chai-matchers": "^2.0.0",
diff --git a/noir/noir-repo/compiler/noirc_driver/src/lib.rs b/noir/noir-repo/compiler/noirc_driver/src/lib.rs
index 72ea464805f..a7cd9ff90ac 100644
--- a/noir/noir-repo/compiler/noirc_driver/src/lib.rs
+++ b/noir/noir-repo/compiler/noirc_driver/src/lib.rs
@@ -126,11 +126,19 @@ pub struct CompileOptions {
     #[arg(long)]
     pub skip_underconstrained_check: bool,

-    /// Setting to decide on an inlining strategy for brillig functions.
+    /// Setting to decide on an inlining strategy for Brillig functions.
     /// A more aggressive inliner should generate larger programs but more optimized
     /// A less aggressive inliner should generate smaller programs
     #[arg(long, hide = true, allow_hyphen_values = true, default_value_t = i64::MAX)]
     pub inliner_aggressiveness: i64,
+
+    /// Setting the maximum acceptable increase in Brillig bytecode size due to
+    /// unrolling small loops. When left empty, any change is accepted as long
+    /// as it requires fewer SSA instructions.
+    /// A higher value results in fewer jumps but a larger program.
+    /// A lower value keeps the original program if it was smaller, even if it has more jumps.
+    #[arg(long, hide = true, allow_hyphen_values = true)]
+    pub max_bytecode_increase_percent: Option<i32>,
 }

 pub fn parse_expression_width(input: &str) -> Result<ExpressionWidth, std::io::Error> {
@@ -589,6 +597,7 @@ pub fn compile_no_check(
         emit_ssa: if options.emit_ssa { Some(context.package_build_path.clone()) } else { None },
         skip_underconstrained_check: options.skip_underconstrained_check,
         inliner_aggressiveness: options.inliner_aggressiveness,
+        max_bytecode_increase_percent: options.max_bytecode_increase_percent,
     };

     let SsaProgramArtifact { program, debug, warnings, names, brillig_names, error_types, .. } =
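The new option above is threaded from the CLI into the SSA pipeline further down in this patch. As a minimal sketch of the size check it implies (the function name and exact formula here are my own illustration, not taken from this patch; a negative percentage, which the flag's `allow_hyphen_values` permits, would demand shrinkage):

```rust
/// Hedged sketch: accept an unrolled function only if its Brillig bytecode
/// did not grow by more than `max_increase_percent` relative to the original.
fn within_bytecode_budget(
    old_size: usize,
    new_size: usize,
    max_increase_percent: Option<i32>,
) -> bool {
    match max_increase_percent {
        // When unset, the bytecode-size check is skipped entirely; the pass
        // then only requires a decrease in SSA instruction count (per the docs).
        None => true,
        Some(max) => {
            let increase = 100.0 * (new_size as f64 - old_size as f64) / old_size as f64;
            increase <= max as f64
        }
    }
}
```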
diff --git a/noir/noir-repo/compiler/noirc_evaluator/Cargo.toml b/noir/noir-repo/compiler/noirc_evaluator/Cargo.toml
index e25b5bf855a..bb8c62cfd95 100644
--- a/noir/noir-repo/compiler/noirc_evaluator/Cargo.toml
+++ b/noir/noir-repo/compiler/noirc_evaluator/Cargo.toml
@@ -33,6 +33,7 @@ cfg-if.workspace = true
 proptest.workspace = true
 similar-asserts.workspace = true
 num-traits.workspace = true
+test-case.workspace = true

 [features]
 bn254 = ["noirc_frontend/bn254"]
diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/brillig/mod.rs b/noir/noir-repo/compiler/noirc_evaluator/src/brillig/mod.rs
index 1b61ae1a864..cb8c35cd8e0 100644
--- a/noir/noir-repo/compiler/noirc_evaluator/src/brillig/mod.rs
+++ b/noir/noir-repo/compiler/noirc_evaluator/src/brillig/mod.rs
@@ -12,7 +12,7 @@ use self::{
     },
 };
 use crate::ssa::{
-    ir::function::{Function, FunctionId, RuntimeType},
+    ir::function::{Function, FunctionId},
     ssa_gen::Ssa,
 };
 use fxhash::FxHashMap as HashMap;
@@ -59,7 +59,7 @@ impl std::ops::Index<FunctionId> for Brillig {
 }

 impl Ssa {
-    /// Compile to brillig brillig functions and ACIR functions reachable from them
+    /// Compile Brillig functions and ACIR functions reachable from them
     #[tracing::instrument(level = "trace", skip_all)]
     pub(crate) fn to_brillig(&self, enable_debug_trace: bool) -> Brillig {
         // Collect all the function ids that are reachable from brillig
@@ -67,9 +67,7 @@ impl Ssa {
         let brillig_reachable_function_ids = self
             .functions
             .iter()
-            .filter_map(|(id, func)| {
-                matches!(func.runtime(), RuntimeType::Brillig(_)).then_some(*id)
-            })
+            .filter_map(|(id, func)| func.runtime().is_brillig().then_some(*id))
             .collect::<BTreeSet<_>>();

         let mut brillig = Brillig::default();
diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/ssa.rs b/noir/noir-repo/compiler/noirc_evaluator/src/ssa.rs
index 97c1760d87c..80514b2f2cf 100644
--- a/noir/noir-repo/compiler/noirc_evaluator/src/ssa.rs
+++ b/noir/noir-repo/compiler/noirc_evaluator/src/ssa.rs
@@ -67,6 +67,11 @@ pub struct SsaEvaluatorOptions {
     /// The higher the value, the more inlined brillig functions will be.
     pub inliner_aggressiveness: i64,
+
+    /// Maximum accepted percentage increase in the Brillig bytecode size after unrolling loops.
+    /// When `None` the size increase check is skipped altogether and any decrease in the SSA
+    /// instruction count is accepted.
+    pub max_bytecode_increase_percent: Option<i32>,
 }

 pub(crate) struct ArtifactsAndWarnings(Artifacts, Vec<SsaReport>);
@@ -104,7 +109,10 @@ pub(crate) fn optimize_into_acir(
             "After `static_assert` and `assert_constant`:",
         )?
         .run_pass(Ssa::loop_invariant_code_motion, "After Loop Invariant Code Motion:")
-        .try_run_pass(Ssa::unroll_loops_iteratively, "After Unrolling:")?
+        .try_run_pass(
+            |ssa| ssa.unroll_loops_iteratively(options.max_bytecode_increase_percent),
+            "After Unrolling:",
+        )?
         .run_pass(Ssa::simplify_cfg, "After Simplifying (2nd):")
         .run_pass(Ssa::flatten_cfg, "After Flattening:")
         .run_pass(Ssa::remove_bit_shifts, "After Removing Bit Shifts:")
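The `try_run_pass` signature in the next hunk changes from a `fn` pointer to a generic closure precisely so the call above can capture `options`. A standalone sketch of why that is needed (the `Ssa` and `RuntimeError` stand-ins here are placeholders, not the compiler's types):

```rust
struct Ssa;
struct RuntimeError;

// A `pass: fn(Ssa) -> Result<Ssa, RuntimeError>` parameter would reject
// capturing closures; the generic `F: FnOnce` bound accepts both.
fn try_run_pass<F>(ssa: Ssa, pass: F, _msg: &str) -> Result<Ssa, RuntimeError>
where
    F: FnOnce(Ssa) -> Result<Ssa, RuntimeError>,
{
    pass(ssa)
}

fn main() {
    let max_bytecode_increase_percent: Option<i32> = Some(25);
    // The closure captures `max_bytecode_increase_percent` from its
    // environment, which a plain function pointer cannot express.
    let _ = try_run_pass(
        Ssa,
        |ssa| {
            let _limit = max_bytecode_increase_percent;
            Ok(ssa)
        },
        "After Unrolling:",
    );
}
```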
@@ -450,11 +458,10 @@ impl SsaBuilder {
     }

     /// The same as `run_pass` but for passes that may fail
-    fn try_run_pass(
-        mut self,
-        pass: fn(Ssa) -> Result<Ssa, RuntimeError>,
-        msg: &str,
-    ) -> Result<Self, RuntimeError> {
+    fn try_run_pass<F>(mut self, pass: F, msg: &str) -> Result<Self, RuntimeError>
+    where
+        F: FnOnce(Ssa) -> Result<Ssa, RuntimeError>,
+    {
         self.ssa = time(msg, self.print_codegen_timings, || pass(self.ssa))?;
         Ok(self.print(msg))
     }
diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ir/function.rs b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ir/function.rs
index b1233e3063e..6413107c04a 100644
--- a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ir/function.rs
+++ b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ir/function.rs
@@ -197,6 +197,12 @@ impl Function {
     }
 }

+impl Clone for Function {
+    fn clone(&self) -> Self {
+        Function::clone_with_id(self.id(), self)
+    }
+}
+
 impl std::fmt::Display for RuntimeType {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ir/instruction.rs b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ir/instruction.rs
index f606fffbf91..6737b335b7d 100644
--- a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ir/instruction.rs
+++ b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ir/instruction.rs
@@ -11,7 +11,7 @@ use fxhash::FxHasher64;
 use iter_extended::vecmap;
 use noirc_frontend::hir_def::types::Type as HirType;

-use crate::ssa::opt::flatten_cfg::value_merger::ValueMerger;
+use crate::ssa::{ir::function::RuntimeType, opt::flatten_cfg::value_merger::ValueMerger};

 use super::{
     basic_block::BasicBlockId,
@@ -478,8 +478,19 @@ impl Instruction {
             | ArraySet { .. }
             | MakeArray { .. } => true,

+            // Store instructions must be removed by DIE in acir code; any load
+            // instructions should already be unused by that point.
+            //
+            // Note that this check assumes that it is being performed after the flattening
+            // pass and after the last mem2reg pass. This is currently the case for the DIE
+            // pass where this check is done, but does mean that we cannot perform mem2reg
+            // after the DIE pass.
+            Store { .. } => {
+                matches!(function.runtime(), RuntimeType::Acir(_))
+                    && function.reachable_blocks().len() == 1
+            }
+
             Constrain(..)
-            | Store { .. }
             | EnableSideEffectsIf { .. }
             | IncrementRc { .. }
             | DecrementRc { .. }
diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/constant_folding.rs b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/constant_folding.rs
index ceda0c6272f..41c84c935b1 100644
--- a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/constant_folding.rs
+++ b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/constant_folding.rs
@@ -149,7 +149,8 @@ impl Function {
         use_constraint_info: bool,
         brillig_info: Option<BrilligInfo>,
     ) {
-        let mut context = Context::new(self, use_constraint_info, brillig_info);
+        let mut context = Context::new(use_constraint_info, brillig_info);
+        let mut dom = DominatorTree::with_function(self);
         context.block_queue.push_back(self.entry_block());

         while let Some(block) = context.block_queue.pop_front() {
@@ -158,7 +159,7 @@ impl Function {
             }

             context.visited_blocks.insert(block);
-            context.fold_constants_in_block(self, block);
+            context.fold_constants_in_block(&mut self.dfg, &mut dom, block);
         }
     }
 }
@@ -172,22 +173,15 @@ struct Context<'a> {

     /// Contains sets of values which are constrained to be equivalent to each other.
     ///
-    /// The mapping's structure is `side_effects_enabled_var => (constrained_value => [(block, simplified_value)])`.
+    /// The mapping's structure is `side_effects_enabled_var => (constrained_value => simplified_value)`.
     ///
     /// We partition the maps of constrained values according to the side-effects flag at the point
     /// at which the values are constrained. This prevents constraints which are only sometimes enforced
     /// being used to modify the rest of the program.
-    ///
-    /// We also keep track of how a value was simplified to other values per block. That is,
-    /// a same ValueId could have been simplified to one value in one block and to another value
-    /// in another block.
-    constraint_simplification_mappings:
-        HashMap<ValueId, HashMap<ValueId, Vec<(BasicBlockId, ValueId)>>>,
+    constraint_simplification_mappings: ConstraintSimplificationCache,

     // Cache of instructions without any side-effects along with their outputs.
     cached_instruction_results: InstructionResultCache,
-
-    dom: DominatorTree,
 }

 #[derive(Copy, Clone)]
@@ -196,9 +190,56 @@ pub(crate) struct BrilligInfo<'a> {
     brillig_functions: &'a BTreeMap<FunctionId, Function>,
 }

+/// Records the simplified equivalents of an [`Instruction`] in the blocks
+/// where the constraint that advised the simplification has been encountered.
+///
+/// For more information see [`ConstraintSimplificationCache`].
+#[derive(Default)]
+struct SimplificationCache {
+    /// Simplified expressions where we found them.
+    ///
+    /// It will always have at least one value because `add` is called
+    /// after the default is constructed.
+    simplifications: HashMap<BasicBlockId, ValueId>,
+}
+
+impl SimplificationCache {
+    /// Called with a newly encountered simplification.
+    fn add(&mut self, dfg: &DataFlowGraph, simple: ValueId, block: BasicBlockId) {
+        self.simplifications
+            .entry(block)
+            .and_modify(|existing| {
+                // `SimplificationCache` may already hold a simplification in this block
+                // so we check whether `simple` is a better simplification than the current one.
+                if let Some((_, simpler)) = simplify(dfg, *existing, simple) {
+                    *existing = simpler;
+                };
+            })
+            .or_insert(simple);
+    }
+
+    /// Try to find a simplification in a visible block.
+    fn get(&self, block: BasicBlockId, dom: &DominatorTree) -> Option<ValueId> {
+        // Deterministically walk up the dominator chain until we encounter a block that contains a simplification.
+        dom.find_map_dominator(block, |b| self.simplifications.get(&b).cloned())
+    }
+}
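`find_map_dominator` walks up the dominator tree from `block` and returns the first hit. A self-contained sketch of the idea (the `idom` map of immediate-dominator links is an assumption for illustration; this is not the compiler's `DominatorTree` API):

```rust
use std::collections::HashMap;

type BlockId = u32;

// Visit `block`, idom(block), idom(idom(block)), ... applying `f` until it
// returns Some; deterministic because each block's dominator chain is unique.
fn find_map_dominator<T>(
    idom: &HashMap<BlockId, BlockId>, // immediate dominator of each block
    mut block: BlockId,
    f: impl Fn(BlockId) -> Option<T>,
) -> Option<T> {
    loop {
        if let Some(found) = f(block) {
            return Some(found);
        }
        match idom.get(&block) {
            Some(&parent) if parent != block => block = parent,
            _ => return None, // reached the entry block without a hit
        }
    }
}
```

`SimplificationCache::get` uses this shape of lookup so it only ever returns a simplification recorded in a block that dominates, and is therefore always executed before, the block being folded.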
+
+/// HashMap from `(side_effects_enabled_var, Instruction)` to a simplified expression that it can
+/// be replaced with based on constraints that testify to their equivalence, stored together
+/// with the set of blocks at which this constraint has been observed.
+///
+/// Only blocks dominated by one in the cache should have access to this information, otherwise
+/// we create a sort of time paradox where we replace an instruction with a constant we believe
+/// it _should_ equal, without ever actually producing and asserting the value.
+type ConstraintSimplificationCache = HashMap<ValueId, HashMap<ValueId, SimplificationCache>>;
+
 /// HashMap from `(Instruction, side_effects_enabled_var)` to the results of the instruction.
 /// Stored as a two-level map to avoid cloning Instructions during the `.get` call.
 ///
+/// The `side_effects_enabled_var` is optional because we only use it when `Instruction::requires_acir_gen_predicate`
+/// is true _and_ the constraint information is also taken into account.
+///
 /// In addition to each result, the original BasicBlockId is stored as well. This allows us
 /// to deduplicate instructions across blocks as long as the new block dominates the original.
 type InstructionResultCache = HashMap<Instruction, HashMap<Option<ValueId>, ResultCache>>;
@@ -208,15 +249,11 @@ type InstructionResultCache = HashMap<Instruction, HashMap<Option<ValueId>, Resu
 /// For more information see [`InstructionResultCache`].
 #[derive(Default)]
 struct ResultCache {
-    results: Vec<(BasicBlockId, Vec<ValueId>)>,
+    result: Option<(BasicBlockId, Vec<ValueId>)>,
 }

 impl<'brillig> Context<'brillig> {
-    fn new(
-        function: &Function,
-        use_constraint_info: bool,
-        brillig_info: Option<BrilligInfo<'brillig>>,
-    ) -> Self {
+    fn new(use_constraint_info: bool, brillig_info: Option<BrilligInfo<'brillig>>) -> Self {
         Self {
             use_constraint_info,
             brillig_info,
@@ -224,52 +261,65 @@ impl<'brillig> Context<'brillig> {
             block_queue: Default::default(),
             constraint_simplification_mappings: Default::default(),
             cached_instruction_results: Default::default(),
-            dom: DominatorTree::with_function(function),
         }
     }

-    fn fold_constants_in_block(&mut self, function: &mut Function, block: BasicBlockId) {
-        let instructions = function.dfg[block].take_instructions();
+    fn fold_constants_in_block(
+        &mut self,
+        dfg: &mut DataFlowGraph,
+        dom: &mut DominatorTree,
+        block: BasicBlockId,
+    ) {
+        let instructions = dfg[block].take_instructions();

-        let mut side_effects_enabled_var =
-            function.dfg.make_constant(FieldElement::one(), Type::bool());
+        // Default side effect condition variable with an enabled state.
+        let mut side_effects_enabled_var = dfg.make_constant(FieldElement::one(), Type::bool());

         for instruction_id in instructions {
             self.fold_constants_into_instruction(
-                &mut function.dfg,
+                dfg,
+                dom,
                 block,
                 instruction_id,
                 &mut side_effects_enabled_var,
             );
         }
-        self.block_queue.extend(function.dfg[block].successors());
+        self.block_queue.extend(dfg[block].successors());
     }

     fn fold_constants_into_instruction(
         &mut self,
         dfg: &mut DataFlowGraph,
-        block: BasicBlockId,
+        dom: &mut DominatorTree,
+        mut block: BasicBlockId,
         id: InstructionId,
         side_effects_enabled_var: &mut ValueId,
     ) {
-        let constraint_simplification_mapping =
-            self.constraint_simplification_mappings.get(side_effects_enabled_var);
-        let instruction = Self::resolve_instruction(
-            id,
-            block,
-            dfg,
-            &mut self.dom,
-            constraint_simplification_mapping,
-        );
+        let constraint_simplification_mapping = self.get_constraint_map(*side_effects_enabled_var);
+
+        let instruction =
+            Self::resolve_instruction(id, block, dfg, dom, constraint_simplification_mapping);
+
         let old_results = dfg.instruction_results(id).to_vec();

         // If a copy of this instruction exists earlier in the block, then reuse the previous results.
-        if let Some(cached_results) =
-            self.get_cached(dfg, &instruction, *side_effects_enabled_var, block)
+        if let Some(cache_result) =
+            self.get_cached(dfg, dom, &instruction, *side_effects_enabled_var, block)
         {
-            Self::replace_result_ids(dfg, &old_results, cached_results);
-            return;
-        }
+            match cache_result {
+                CacheResult::Cached(cached) => {
+                    Self::replace_result_ids(dfg, &old_results, cached);
+                    return;
+                }
+                CacheResult::NeedToHoistToCommonBlock(dominator) => {
+                    // Just change the block to insert in the common dominator instead.
+                    // This will only move the current instance of the instruction right now.
+                    // When constant folding is run a second time later on, it'll catch
+                    // that the previous instance can be deduplicated to this instance.
+                    block = dominator;
+                }
+            }
+        };

         let new_results =
             // First try to inline a call to a brillig function with all constant arguments.
@@ -314,7 +364,7 @@ impl<'brillig> Context<'brillig> {
         block: BasicBlockId,
         dfg: &DataFlowGraph,
         dom: &mut DominatorTree,
-        constraint_simplification_mapping: Option<&HashMap<ValueId, Vec<(BasicBlockId, ValueId)>>>,
+        constraint_simplification_mapping: &HashMap<ValueId, SimplificationCache>,
     ) -> Instruction {
         let instruction = dfg[instruction_id].clone();

         // This allows us to reach a stable final `ValueId` for each instruction input as we add more
         // constraints to the cache.
         fn resolve_cache(
+            block: BasicBlockId,
             dfg: &DataFlowGraph,
             dom: &mut DominatorTree,
-            cache: Option<&HashMap<ValueId, Vec<(BasicBlockId, ValueId)>>>,
+            cache: &HashMap<ValueId, SimplificationCache>,
             value_id: ValueId,
-            block: BasicBlockId,
         ) -> ValueId {
             let resolved_id = dfg.resolve(value_id);
-            let Some(cached_values) = cache.and_then(|cache| cache.get(&resolved_id)) else {
-                return resolved_id;
-            };
-
-            for (cached_block, cached_value) in cached_values {
-                // We can only use the simplified value if it was simplified in a block that dominates the current one
-                if dom.dominates(*cached_block, block) {
-                    return resolve_cache(dfg, dom, cache, *cached_value, block);
+            match cache.get(&resolved_id) {
+                Some(simplification_cache) => {
+                    if let Some(simplified) = simplification_cache.get(block, dom) {
+                        resolve_cache(block, dfg, dom, cache, simplified)
+                    } else {
+                        resolved_id
+                    }
                 }
+                None => resolved_id,
             }
-
-            resolved_id
         }

         // Resolve any inputs to ensure that we're comparing like-for-like instructions.
         instruction.map_values(|value_id| {
-            resolve_cache(dfg, dom, constraint_simplification_mapping, value_id, block)
+            resolve_cache(block, dfg, dom, constraint_simplification_mapping, value_id)
         })
     }

@@ -398,7 +446,7 @@
                     self.get_constraint_map(side_effects_enabled_var)
                         .entry(complex)
                         .or_default()
-                        .push((block, simple));
+                        .add(dfg, simple, block);
                 }
             }
         }
@@ -420,10 +468,12 @@
         }
     }

+    /// Get the simplification mapping from complex to simpler instructions,
+    /// which all depend on the same side effect condition variable.
     fn get_constraint_map(
         &mut self,
         side_effects_enabled_var: ValueId,
-    ) -> &mut HashMap<ValueId, Vec<(BasicBlockId, ValueId)>> {
+    ) -> &mut HashMap<ValueId, SimplificationCache> {
         self.constraint_simplification_mappings.entry(side_effects_enabled_var).or_default()
     }

@@ -438,19 +488,20 @@
         }
     }

-    fn get_cached<'a>(
-        &'a mut self,
+    /// Get a cached result if it can be used in this context.
+    fn get_cached(
+        &self,
         dfg: &DataFlowGraph,
+        dom: &mut DominatorTree,
         instruction: &Instruction,
         side_effects_enabled_var: ValueId,
         block: BasicBlockId,
-    ) -> Option<&'a [ValueId]> {
+    ) -> Option<CacheResult> {
         let results_for_instruction = self.cached_instruction_results.get(instruction)?;
+
         let predicate = self.use_constraint_info && instruction.requires_acir_gen_predicate(dfg);
         let predicate = predicate.then_some(side_effects_enabled_var);

-        results_for_instruction.get(&predicate)?.get(block, &mut self.dom)
+        results_for_instruction.get(&predicate)?.get(block, dom, instruction.has_side_effects(dfg))
     }

     /// Checks if the given instruction is a call to a brillig function with all constant arguments.
@@ -617,7 +668,9 @@ impl<'brillig> Context<'brillig> {
 impl ResultCache {
     /// Records that an `Instruction` in block `block` produced the result values `results`.
     fn cache(&mut self, block: BasicBlockId, results: Vec<ValueId>) {
-        self.results.push((block, results));
+        if self.result.is_none() {
+            self.result = Some((block, results));
+        }
     }

     /// Returns a set of [`ValueId`]s produced from a copy of this [`Instruction`] which sits
@@ -626,13 +679,23 @@ impl ResultCache {
     /// We require that the cached instruction's block dominates `block` in order to avoid
     /// cycles causing issues (e.g. two instructions being replaced with the results of each other
     /// such that neither instruction exists anymore.)
-    fn get(&self, block: BasicBlockId, dom: &mut DominatorTree) -> Option<&[ValueId]> {
-        for (origin_block, results) in &self.results {
+    fn get(
+        &self,
+        block: BasicBlockId,
+        dom: &mut DominatorTree,
+        has_side_effects: bool,
+    ) -> Option<CacheResult> {
+        self.result.as_ref().and_then(|(origin_block, results)| {
             if dom.dominates(*origin_block, block) {
-                return Some(results);
+                Some(CacheResult::Cached(results))
+            } else if !has_side_effects {
+                // Insert a copy of this instruction in the common dominator
+                let dominator = dom.common_dominator(*origin_block, block);
+                Some(CacheResult::NeedToHoistToCommonBlock(dominator))
+            } else {
+                None
             }
-        }
-        None
+        })
     }
 }
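`CacheResult::NeedToHoistToCommonBlock` names the lowest block that dominates both occurrences of the instruction. A standalone sketch of one way to compute such a block from immediate-dominator links (an illustration under assumed `idom` data, not the compiler's actual `DominatorTree::common_dominator`):

```rust
use std::collections::{HashMap, HashSet};

type BlockId = u32;

// Collect all dominators of `a`, then walk up from `b` until we meet one.
fn common_dominator(idom: &HashMap<BlockId, BlockId>, a: BlockId, b: BlockId) -> BlockId {
    let mut ancestors = HashSet::new();
    let mut block = a;
    loop {
        ancestors.insert(block);
        match idom.get(&block) {
            Some(&parent) if parent != block => block = parent,
            _ => break, // the entry block dominates everything
        }
    }
    let mut block = b;
    while !ancestors.contains(&block) {
        block = idom[&block]; // assumes a common dominator (the entry) exists
    }
    block
}
```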
@@ -940,32 +1003,22 @@ mod test {
     // Regression for #4600
     #[test]
     fn array_get_regression() {
-        // fn main f0 {
-        //   b0(v0: u1, v1: u64):
-        //     enable_side_effects_if v0
-        //     v2 = make_array [Field 0, Field 1]
-        //     v3 = array_get v2, index v1
-        //     v4 = not v0
-        //     enable_side_effects_if v4
-        //     v5 = array_get v2, index v1
-        // }
-        //
         // We want to make sure after constant folding both array_gets remain since they are
         // under different enable_side_effects_if contexts and thus one may be disabled while
         // the other is not. If one is removed, it is possible e.g. v4 is replaced with v2 which
         // is disabled (only gets from index 0) and thus returns the wrong result.
         let src = "
-            acir(inline) fn main f0 {
-                b0(v0: u1, v1: u64):
-                    enable_side_effects v0
-                    v4 = make_array [Field 0, Field 1] : [Field; 2]
-                    v5 = array_get v4, index v1 -> Field
-                    v6 = not v0
-                    enable_side_effects v6
-                    v7 = array_get v4, index v1 -> Field
-                    return
-            }
-            ";
+        acir(inline) fn main f0 {
+            b0(v0: u1, v1: u64):
+                enable_side_effects v0
+                v4 = make_array [Field 0, Field 1] : [Field; 2]
+                v5 = array_get v4, index v1 -> Field
+                v6 = not v0
+                enable_side_effects v6
+                v7 = array_get v4, index v1 -> Field
+                return
+        }
+        ";
         let ssa = Ssa::from_str(src).unwrap();

         // Expected output is unchanged
@@ -1032,7 +1085,6 @@ mod test {
         //     v5 = call keccakf1600(v1)
         //     v6 = call keccakf1600(v2)
         // }
-        //
         // Here we're checking a situation where two identical arrays are being initialized twice and being assigned separate `ValueId`s.
         // This would result in otherwise identical instructions not being deduplicated.
         let main_id = Id::test_new(0);
@@ -1083,6 +1135,106 @@ mod test {
         assert_eq!(ending_instruction_count, 2);
     }

+    #[test]
+    fn deduplicate_across_blocks() {
+        // fn main f0 {
+        //   b0(v0: u1):
+        //     v1 = not v0
+        //     jmp b1()
+        //   b1():
+        //     v2 = not v0
+        //     return v2
+        // }
+        let main_id = Id::test_new(0);
+
+        // Compiling main
+        let mut builder = FunctionBuilder::new("main".into(), main_id);
+        let b1 = builder.insert_block();
+
+        let v0 = builder.add_parameter(Type::bool());
+        let _v1 = builder.insert_not(v0);
+        builder.terminate_with_jmp(b1, Vec::new());
+
+        builder.switch_to_block(b1);
+        let v2 = builder.insert_not(v0);
+        builder.terminate_with_return(vec![v2]);
+
+        let ssa = builder.finish();
+        let main = ssa.main();
+        assert_eq!(main.dfg[main.entry_block()].instructions().len(), 1);
+        assert_eq!(main.dfg[b1].instructions().len(), 1);
+
+        // Expected output:
+        //
+        // fn main f0 {
+        //   b0(v0: u1):
+        //     v1 = not v0
+        //     jmp b1()
+        //   b1():
+        //     return v1
+        // }
+        let ssa = ssa.fold_constants_using_constraints();
+        let main = ssa.main();
+        assert_eq!(main.dfg[main.entry_block()].instructions().len(), 1);
+        assert_eq!(main.dfg[b1].instructions().len(), 0);
+    }
+
+    #[test]
+    fn deduplicate_across_non_dominated_blocks() {
+        let src = "
+        brillig(inline) fn main f0 {
+            b0(v0: u32):
+                v2 = lt u32 1000, v0
+                jmpif v2 then: b1, else: b2
+            b1():
+                v4 = add v0, u32 1
+                v5 = lt v0, v4
+                constrain v5 == u1 1
+                jmp b2()
+            b2():
+                v7 = lt u32 1000, v0
+                jmpif v7 then: b3, else: b4
+            b3():
+                v8 = add v0, u32 1
+                v9 = lt v0, v8
+                constrain v9 == u1 1
+                jmp b4()
+            b4():
+                return
+        }
+        ";
+        let ssa = Ssa::from_str(src).unwrap();
+
+        // v4 has been hoisted, although:
+        // - v5 has not yet been removed since it was encountered earlier in the program
+        // - v8 hasn't been recognized as a duplicate of v6 yet since they still reference v4 and
+        //   v5 respectively
+        let expected = "
+        brillig(inline) fn main f0 {
+            b0(v0: u32):
+                v2 = lt u32 1000, v0
+                v4 = add v0, u32 1
+                jmpif v2 then: b1, else: b2
+            b1():
+                v5 = add v0, u32 1
+                v6 = lt v0, v5
+                constrain v6 == u1 1
+                jmp b2()
+            b2():
+                jmpif v2 then: b3, else: b4
+            b3():
+                v8 = lt v0, v4
+                constrain v8 == u1 1
+                jmp b4()
+            b4():
+                return
+        }
+        ";
+
+        let ssa = ssa.fold_constants_using_constraints();
+        assert_normalized_ssa_equals(ssa, expected);
+    }
+
     #[test]
     fn inlines_brillig_call_without_arguments() {
         let src = "
@@ -1260,46 +1412,87 @@ mod test {
     }

     #[test]
-    fn deduplicate_across_blocks() {
-        // fn main f0 {
-        //   b0(v0: u1):
-        //     v1 = not v0
-        //     jmp b1()
-        //   b1():
-        //     v2 = not v0
-        //     return v2
-        // }
-        let main_id = Id::test_new(0);
-
-        // Compiling main
-        let mut builder = FunctionBuilder::new("main".into(), main_id);
-        let b1 = builder.insert_block();
-
-        let v0 = builder.add_parameter(Type::bool());
-        let _v1 = builder.insert_not(v0);
-        builder.terminate_with_jmp(b1, Vec::new());
-
-        builder.switch_to_block(b1);
-        let v2 = builder.insert_not(v0);
-        builder.terminate_with_return(vec![v2]);
+    fn does_not_use_cached_constrain_in_block_that_is_not_dominated() {
+        let src = "
+        brillig(inline) fn main f0 {
+            b0(v0: Field, v1: Field):
+                v3 = eq v0, Field 0
+                jmpif v3 then: b1, else: b2
+            b1():
+                v5 = eq v1, Field 1
+                constrain v1 == Field 1
+                jmp b2()
+            b2():
+                v6 = eq v1, Field 0
+                constrain v1 == Field 0
+                return
+        }
+        ";
+        let ssa = Ssa::from_str(src).unwrap();
+        let ssa = ssa.fold_constants_using_constraints();
+        assert_normalized_ssa_equals(ssa, src);
+    }

-        let ssa = builder.finish();
-        let main = ssa.main();
-        assert_eq!(main.dfg[main.entry_block()].instructions().len(), 1);
-        assert_eq!(main.dfg[b1].instructions().len(), 1);
+    #[test]
+    fn does_not_hoist_constrain_to_common_ancestor() {
+        let src = "
+        brillig(inline) fn main f0 {
+            b0(v0: Field, v1: Field):
+                v3 = eq v0, Field 0
+                jmpif v3 then: b1, else: b2
+            b1():
+                constrain v1 == Field 1
+                jmp b2()
+            b2():
+                jmpif v0 then: b3, else: b4
+            b3():
+                constrain v1 == Field 1 // This was incorrectly hoisted to b0 but this condition is not valid when going b0 -> b2 -> b4
+                jmp b4()
+            b4():
+                return
+        }
+        ";
+        let ssa = Ssa::from_str(src).unwrap();
+        let ssa = ssa.fold_constants_using_constraints();
+        assert_normalized_ssa_equals(ssa, src);
+    }

-        // Expected output:
-        //
-        // fn main f0 {
-        //   b0(v0: u1):
-        //     v1 = not v0
-        //     jmp b1()
-        //   b1():
-        //     return v1
-        // }
+    #[test]
+    fn deduplicates_side_effecting_intrinsics() {
+        let src = "
+        // After EnableSideEffectsIf removal:
+        acir(inline) fn main f0 {
+            b0(v0: Field, v1: Field, v2: u1):
+                v4 = call is_unconstrained() -> u1
+                v7 = call to_be_radix(v0, u32 256) -> [u8; 1]    // `a.to_be_radix(256)`;
+                inc_rc v7
+                v8 = call to_be_radix(v0, u32 256) -> [u8; 1]    // duplicate load of `a`
+                inc_rc v8
+                v9 = cast v2 as Field                            // `if c { a.to_be_radix(256) }`
+                v10 = mul v0, v9                                 // attaching `c` to `a`
+                v11 = call to_be_radix(v10, u32 256) -> [u8; 1]  // calling `to_radix(c * a)`
+                inc_rc v11
+                enable_side_effects v2                           // side effect var for `c` shifted down by removal
+                return
+        }
+        ";
+        let ssa = Ssa::from_str(src).unwrap();
+        let expected = "
+        acir(inline) fn main f0 {
+            b0(v0: Field, v1: Field, v2: u1):
+                v4 = call is_unconstrained() -> u1
+                v7 = call to_be_radix(v0, u32 256) -> [u8; 1]
+                inc_rc v7
+                inc_rc v7
+                v8 = cast v2 as Field
+                v9 = mul v0, v8
+                v10 = call to_be_radix(v9, u32 256) -> [u8; 1]
+                inc_rc v10
+                enable_side_effects v2
+                return
+        }
+        ";
         let ssa = ssa.fold_constants_using_constraints();
-        let main = ssa.main();
-        assert_eq!(main.dfg[main.entry_block()].instructions().len(), 1);
-        assert_eq!(main.dfg[b1].instructions().len(), 0);
+        assert_normalized_ssa_equals(ssa, expected);
     }
 }
diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/flatten_cfg.rs b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/flatten_cfg.rs
index 5d114672a55..c8dd0e3c5a3 100644
--- a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/flatten_cfg.rs
+++ b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/flatten_cfg.rs
@@ -131,8 +131,7 @@
 //!   v11 = mul v4, Field 12
 //!   v12 = add v10, v11
 //!   store v12 at v5         (new store)
-use fxhash::FxHashMap as HashMap;
-use std::collections::{BTreeMap, HashSet};
+use fxhash::{FxHashMap as HashMap, FxHashSet as HashSet};

 use acvm::{acir::AcirField, acir::BlackBoxFunc, FieldElement};
 use iter_extended::vecmap;
@@ -186,18 +185,6 @@ struct Context<'f> {
     /// Maps start of branch -> end of branch
     branch_ends: HashMap<BasicBlockId, BasicBlockId>,

-    /// Maps an address to the old and new value of the element at that address
-    /// These only hold stores for one block at a time and is cleared
-    /// between inlining of branches.
-    store_values: HashMap<ValueId, Store>,
-
-    /// Stores all allocations local to the current branch.
-    /// Since these branches are local to the current branch (ie. only defined within one branch of
-    /// an if expression), they should not be merged with their previous value or stored value in
-    /// the other branch since there is no such value. The ValueId here is that which is returned
-    /// by the allocate instruction.
-    local_allocations: HashSet<ValueId>,
-
     /// A stack of each jmpif condition that was taken to reach a particular point in the program.
     /// When two branches are merged back into one, this constitutes a join point, and is analogous
     /// to the rest of the program after an if statement. When such a join point / end block is
@@ -214,13 +201,15 @@ struct Context<'f> {
     /// When processing a block, we pop this stack to get its arguments
     /// and at the end we push the arguments for its successor
     arguments_stack: Vec<Vec<ValueId>>,
-}

-#[derive(Clone)]
-pub(crate) struct Store {
-    old_value: ValueId,
-    new_value: ValueId,
-    call_stack: CallStack,
+    /// Stores all allocations local to the current branch.
+    ///
+    /// Since these allocations are local to the current branch (i.e. only defined within one branch of
+    /// an if expression), they should not be merged with their previous value or stored value in
+    /// the other branch since there is no such value.
+    ///
+    /// The `ValueId` here is that which is returned by the allocate instruction.
+    local_allocations: HashSet<ValueId>,
 }

 #[derive(Clone)]
@@ -231,8 +220,6 @@ struct ConditionalBranch {
     old_condition: ValueId,
     // The condition of the branch
     condition: ValueId,
-    // The store values accumulated when processing the branch
-    store_values: HashMap<ValueId, Store>,
     // The allocations accumulated when processing the branch
     local_allocations: HashSet<ValueId>,
 }
@@ -263,12 +250,11 @@ fn flatten_function_cfg(function: &mut Function, no_predicates: &HashMap<Functio
 impl<'f> Context<'f> {
         let old_condition = *condition;
         let then_condition = self.inserter.resolve(old_condition);

-        let old_stores = std::mem::take(&mut self.store_values);
         let old_allocations = std::mem::take(&mut self.local_allocations);
         let branch = ConditionalBranch {
             old_condition,
             condition: self.link_condition(then_condition),
-            store_values: old_stores,
-            local_allocations: old_allocations,
             last_block: *then_destination,
+            local_allocations: old_allocations,
         };
         let cond_context = ConditionalContext {
             condition: then_condition,
@@ -473,19 +459,12 @@ impl<'f> Context<'f> {
         );
         let else_condition = self.link_condition(else_condition);

-        // Make sure the else branch sees the previous values of each store
-        // rather than any values created in the 'then' branch.
-        let old_stores = std::mem::take(&mut cond_context.then_branch.store_values);
-        cond_context.then_branch.store_values = std::mem::take(&mut self.store_values);
-        self.undo_stores_in_then_branch(&cond_context.then_branch.store_values);
-
         let old_allocations = std::mem::take(&mut self.local_allocations);
         let else_branch = ConditionalBranch {
             old_condition: cond_context.then_branch.old_condition,
             condition: else_condition,
-            store_values: old_stores,
-            local_allocations: old_allocations,
             last_block: *block,
+            local_allocations: old_allocations,
         };
         cond_context.then_branch.local_allocations.clear();
         cond_context.else_branch = Some(else_branch);
@@ -509,10 +486,8 @@ impl<'f> Context<'f> {
         }

         let mut else_branch = cond_context.else_branch.unwrap();
-        let stores_in_branch = std::mem::replace(&mut self.store_values, else_branch.store_values);
         self.local_allocations = std::mem::take(&mut else_branch.local_allocations);
         else_branch.last_block = *block;
-        else_branch.store_values = stores_in_branch;
         cond_context.else_branch = Some(else_branch);

         // We must remember to reset whether side effects are enabled when both branches
@@ -580,8 +555,6 @@ impl<'f> Context<'f> {
                 .first()
         });

-        let call_stack = cond_context.call_stack;
-        self.merge_stores(cond_context.then_branch, cond_context.else_branch, call_stack);
         self.arguments_stack.pop();
         self.arguments_stack.pop();
         self.arguments_stack.push(args);
@@ -636,116 +609,29 @@ impl<'f> Context<'f> {
         self.insert_instruction_with_typevars(enable_side_effects, None, call_stack);
     }

-    /// Merge any store instructions found in each branch.
-    ///
-    /// This function relies on the 'then' branch being merged before the 'else' branch of a jmpif
-    /// instruction. If this ordering is changed, the ordering that store values are merged within
-    /// this function also needs to be changed to reflect that.
-    fn merge_stores(
-        &mut self,
-        then_branch: ConditionalBranch,
-        else_branch: Option<ConditionalBranch>,
-        call_stack: CallStack,
-    ) {
-        // Address -> (then_value, else_value, value_before_the_if)
-        let mut new_map = BTreeMap::new();
-
-        for (address, store) in then_branch.store_values {
-            new_map.insert(address, (store.new_value, store.old_value, store.old_value));
-        }
-
-        if else_branch.is_some() {
-            for (address, store) in else_branch.clone().unwrap().store_values {
-                if let Some(entry) = new_map.get_mut(&address) {
-                    entry.1 = store.new_value;
-                } else {
-                    new_map.insert(address, (store.old_value, store.new_value, store.old_value));
-                }
-            }
-        }
-
-        let then_condition = then_branch.condition;
-        let block = self.inserter.function.entry_block();
-
-        // Merging must occur in a separate loop as we cannot borrow `self` as mutable while `value_merger` does
-        let mut new_values = HashMap::default();
-        for (address, (then_case, else_case, _)) in &new_map {
-            let instruction = Instruction::IfElse {
-                then_condition,
-                then_value: *then_case,
-                else_value: *else_case,
-            };
-            let dfg = &mut self.inserter.function.dfg;
-            let value = dfg
-                .insert_instruction_and_results(instruction, block, None, call_stack.clone())
-                .first();
-
-            new_values.insert(address, value);
-        }
-
-        // Replace stores with new merged values
-        for (address, (_, _, old_value)) in &new_map {
-            let value = new_values[address];
-            let address = *address;
-            self.insert_instruction_with_typevars(
-                Instruction::Store { address, value },
-                None,
-                call_stack.clone(),
-            );
-
-            if let Some(store) = self.store_values.get_mut(&address) {
-                store.new_value = value;
-            } else {
-                self.store_values.insert(
-                    address,
-                    Store {
-                        old_value: *old_value,
-                        new_value: value,
-                        call_stack: call_stack.clone(),
-                    },
-                );
-            }
-        }
-    }
-
-    fn remember_store(&mut self, address: ValueId, new_value: ValueId, call_stack: CallStack) {
-        if !self.local_allocations.contains(&address) {
-            if let Some(store_value) = self.store_values.get_mut(&address) {
-                store_value.new_value = new_value;
-            } else {
-                let load = Instruction::Load { address };
-
-                let load_type = Some(vec![self.inserter.function.dfg.type_of_value(new_value)]);
-                let old_value = self
-                    .insert_instruction_with_typevars(load.clone(), load_type, call_stack.clone())
-                    .first();
-
-                self.store_values.insert(address, Store { old_value, new_value, call_stack });
-            }
-        }
-    }
-
     /// Push the given instruction to the end of the entry block of the current function.
     ///
     /// Note that each ValueId of the instruction will be mapped via self.inserter.resolve.
     /// As a result, the instruction that will be pushed will actually be a new instruction
     /// with a different InstructionId from the original. The results of the given instruction
     /// will also be mapped to the results of the new instruction.
-    fn push_instruction(&mut self, id: InstructionId) -> Vec<ValueId> {
+    ///
+    /// `previous_allocate_result` should only be set to the result of an allocate instruction
+    /// if that instruction was the instruction immediately previous to this one - if there are
+    /// any instructions in between it should be None.
+    fn push_instruction(&mut self, id: InstructionId) {
         let (instruction, call_stack) = self.inserter.map_instruction(id);
         let instruction = self.handle_instruction_side_effects(instruction, call_stack.clone());
-        let is_allocate = matches!(instruction, Instruction::Allocate);

+        let instruction_is_allocate = matches!(&instruction, Instruction::Allocate);
         let entry = self.inserter.function.entry_block();
         let results = self.inserter.push_instruction_value(instruction, id, entry, call_stack);

         // Remember an allocate was created local to this branch so that we do not try to merge store
         // values across branches for it later.
-        if is_allocate {
+        if instruction_is_allocate {
             self.local_allocations.insert(results.first());
         }
-
-        results.results().into_owned()
     }

     /// If we are currently in a branch, we need to modify constrain instructions
@@ -782,8 +668,32 @@ impl<'f> Context<'f> {
                 Instruction::Constrain(lhs, rhs, message)
             }
             Instruction::Store { address, value } => {
-                self.remember_store(address, value, call_stack);
-                Instruction::Store { address, value }
+                // If this instruction immediately follows an allocate and stores to that
+                // address, there is no previous value to load and we don't need a merge anyway.
+                if self.local_allocations.contains(&address) {
+                    Instruction::Store { address, value }
+                } else {
+                    // Instead of storing `value`, store `if condition { value } else { previous_value }`
+                    let typ = self.inserter.function.dfg.type_of_value(value);
+                    let load = Instruction::Load { address };
+                    let previous_value = self
+                        .insert_instruction_with_typevars(
+                            load,
+                            Some(vec![typ]),
+                            call_stack.clone(),
+                        )
+                        .first();
+
+                    let instruction = Instruction::IfElse {
+                        then_condition: condition,
+                        then_value: value,
+                        else_value: previous_value,
+                    };
+
+                    let updated_value = self.insert_instruction(instruction, call_stack);
+                    Instruction::Store { address, value: updated_value }
+                }
             }
             Instruction::RangeCheck { value, max_bit_size, assert_message } => {
                 // Replace value with `value * predicate` to zero out value when predicate is inactive.
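The rewritten `Store` arm turns a conditional store into a read-modify-write merge. A small runnable model of the merged value (my own illustration; the pass itself emits the `cast`/`sub`/`mul`/`add` SSA sequence visible in the updated test expectations below):

```rust
/// With a branch condition `c` lowered to 0 or 1, `if c { v } else { prev }`
/// can be computed arithmetically as `prev + c * (v - prev)`.
fn merge_store_value(c: u8, v: i128, prev: i128) -> i128 {
    debug_assert!(c <= 1); // c is a boolean flag encoded as 0 or 1
    prev + (c as i128) * (v - prev)
}

fn main() {
    assert_eq!(merge_store_value(1, 5, 2), 5); // condition true: new value stored
    assert_eq!(merge_store_value(0, 5, 2), 2); // condition false: previous value kept
}
```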
@@ -905,23 +815,11 @@ impl<'f> Context<'f> {
             call_stack,
         )
     }
-
-    fn undo_stores_in_then_branch(&mut self, store_values: &HashMap<ValueId, Store>) {
-        for (address, store) in store_values {
-            let address = *address;
-            let value = store.old_value;
-            let instruction = Instruction::Store { address, value };
-            // Considering the location of undoing a store to be the same as the original store.
-            self.insert_instruction_with_typevars(instruction, None, store.call_stack.clone());
-        }
-    }
 }

 #[cfg(test)]
 mod test {
-    use std::sync::Arc;
-
-    use acvm::{acir::AcirField, FieldElement};
+    use acvm::acir::AcirField;

     use crate::ssa::{
         function_builder::FunctionBuilder,
@@ -1023,15 +921,13 @@ mod test {
             b0(v0: u1, v1: &mut Field):
                 enable_side_effects v0
                 v2 = load v1 -> Field
-                store Field 5 at v1
-                v4 = not v0
-                store v2 at v1
+                v3 = cast v0 as Field
+                v5 = sub Field 5, v2
+                v6 = mul v3, v5
+                v7 = add v2, v6
+                store v7 at v1
+                v8 = not v0
                 enable_side_effects u1 1
-                v6 = cast v0 as Field
-                v7 = sub Field 5, v2
-                v8 = mul v6, v7
-                v9 = add v2, v8
-                store v9 at v1
                 return
         }
         ";
@@ -1062,17 +958,20 @@ mod test {
             b0(v0: u1, v1: &mut Field):
                 enable_side_effects v0
                 v2 = load v1 -> Field
-                store Field 5 at v1
-                v4 = not v0
-                store v2 at v1
-                enable_side_effects v4
-                v5 = load v1 -> Field
-                store Field 6 at v1
+                v3 = cast v0 as Field
+                v5 = sub Field 5, v2
+                v6 = mul v3, v5
+                v7 = add v2, v6
+                store v7 at v1
+                v8 = not v0
+                enable_side_effects v8
+                v9 = load v1 -> Field
+                v10 = cast v8 as Field
+                v12 = sub Field 6, v9
+                v13 = mul v10, v12
+                v14 = add v9, v13
+                store v14 at v1
                 enable_side_effects u1 1
-                v8 = cast v0 as Field
-                v10 = mul v8, Field -1
-                v11 = add Field 6, v10
-                store v11 at v1
                 return
         }
         ";
@@ -1115,84 +1014,123 @@ mod test {
         //      b7   b8
         //        ↘ ↙
         //         b9
-        let src = "
-        acir(inline) fn main f0 {
-            b0(v0: u1, v1: u1):
-                v2 = allocate -> &mut Field
-                store Field 0 at v2
-                v4 = load v2 -> Field
-                // call v1(Field 0, v4)
-                jmp b1()
-            b1():
-                store Field 1 at v2
-                v6 = load v2 -> Field
-                // call v1(Field 1, v6)
-                jmpif v0 then: b2, else: b3
-            b2():
-                store Field 2 at v2
-                v8 = load v2 -> Field
-                // call v1(Field 2, v8)
-                jmp b4()
-            b4():
-                v12 = load v2 -> Field
-                // call v1(Field 4, v12)
-                jmpif v1 then: b5, else: b6
-            b5():
-                store Field 5 at v2
-                v14 = load v2 -> Field
-                // call v1(Field 5, v14)
-                jmp b7()
-            b7():
-                v18 = load v2 -> Field
-                // call v1(Field 7, v18)
-                jmp b9()
-            b9():
-                v22 = load v2 -> Field
-                // call v1(Field 9, v22)
-                v23 = load v2 -> Field
-                return v23
-            b6():
-                store Field 6 at v2
-                v16 = load v2 -> Field
-                // call v1(Field 6, v16)
-                jmp b7()
-            b3():
-                store Field 3 at v2
-                v10 = load v2 -> Field
-                // call v1(Field 3, v10)
-                jmp b8()
-            b8():
-                v20 = load v2 -> Field
-                // call v1(Field 8, v20)
-                jmp b9()
-        }
-        ";
-
-        let ssa = Ssa::from_str(src).unwrap();
-        let ssa = ssa.flatten_cfg().mem2reg();
+        let main_id = Id::test_new(0);
+        let mut builder = FunctionBuilder::new("main".into(), main_id);
+
+        let b1 = builder.insert_block();
+        let b2 = builder.insert_block();
+        let b3 = builder.insert_block();
+        let b4 = builder.insert_block();
+        let b5 = builder.insert_block();
+        let b6 = builder.insert_block();
+        let b7 = builder.insert_block();
+        let b8 = builder.insert_block();
+        let b9 = builder.insert_block();
+
+        let c1 = builder.add_parameter(Type::bool());
+        let c4 = builder.add_parameter(Type::bool());
+
+        let r1 = builder.insert_allocate(Type::field());
+
+        let store_value = |builder: &mut FunctionBuilder, value: u128| {
+            let value = builder.field_constant(value);
+            builder.insert_store(r1, value);
+        };
+
+        let test_function = Id::test_new(1);
+
+        let call_test_function = |builder: &mut FunctionBuilder, block: u128| {
+            let block = builder.field_constant(block);
+            let load = builder.insert_load(r1, Type::field());
+            builder.insert_call(test_function, vec![block, load], Vec::new());
+        };
+
+        let switch_store_and_test_function =
+            |builder: &mut FunctionBuilder, block, block_number: u128| {
+                builder.switch_to_block(block);
+                store_value(builder, block_number);
+                call_test_function(builder, block_number);
+            };
+
+        let switch_and_test_function =
+            |builder: &mut FunctionBuilder, block, block_number: u128| {
+                builder.switch_to_block(block);
+                call_test_function(builder, block_number);
+            };
+
+        store_value(&mut builder, 0);
+        call_test_function(&mut builder, 0);
+        builder.terminate_with_jmp(b1, vec![]);
+
+        switch_store_and_test_function(&mut builder, b1, 1);
+        builder.terminate_with_jmpif(c1, b2, b3);
+
+        switch_store_and_test_function(&mut builder, b2, 2);
+        builder.terminate_with_jmp(b4, vec![]);
+
+        switch_store_and_test_function(&mut builder, b3, 3);
+        builder.terminate_with_jmp(b8, vec![]);
+
+        switch_and_test_function(&mut builder, b4, 4);
+        builder.terminate_with_jmpif(c4, b5, b6);
+
+        switch_store_and_test_function(&mut builder, b5, 5);
+        builder.terminate_with_jmp(b7, vec![]);
+
+        switch_store_and_test_function(&mut builder, b6, 6);
+        builder.terminate_with_jmp(b7, vec![]);
+
+        switch_and_test_function(&mut builder, b7, 7);
+        builder.terminate_with_jmp(b9, vec![]);
+
+        switch_and_test_function(&mut builder, b8, 8);
+        builder.terminate_with_jmp(b9, vec![]);
+
+        switch_and_test_function(&mut builder, b9, 9);
+        let load = builder.insert_load(r1, Type::field());
+        builder.terminate_with_return(vec![load]);
+
+        let ssa = builder.finish().flatten_cfg().mem2reg();

         // Expected results after mem2reg removes the allocation and each load and store:
-        let expected = "
-        acir(inline) fn main f0 {
-            b0(v0: u1, v1: u1):
-                v2 = allocate -> &mut Field
-                enable_side_effects v0
-                v3 = mul v0, v1
-                enable_side_effects v3
-                v4 = not v1
-                v5 = mul v0, v4
-                enable_side_effects v0
-                v6 = cast v3 as Field
-                v8 = mul v6, Field -1
-                v10 = add Field 6, v8
-                v11 = not v0
-                enable_side_effects u1 1
-                v13 = cast v0 as Field
-                v15 = sub v10, Field 3
-                v16 = mul v13, v15
-                v17 = add Field 3, v16
-                return v17
-        }";
+        //
+        // fn main f0 {
+        //   b0(v0: u1, v1: u1):
+        //     call test_function(Field 0, Field 0)
+        //     call test_function(Field 1, Field 1)
+        //     enable_side_effects v0
+        //     call test_function(Field 2, Field 2)
+        //     call test_function(Field 4, Field 2)
+        //     v29 = and v0, v1
+        //     enable_side_effects v29
+        //     call test_function(Field 5, Field 5)
+        //     v32 = not v1
+        //     v33 = and v0, v32
+        //     enable_side_effects v33
+        //     call test_function(Field 6, Field 6)
+        //     enable_side_effects v0
+        //     v36 = mul v1, Field 5
+        //     v37 = mul v32, Field 2
+        //     v38 = add v36, v37
+        //     v39 = mul v1, Field 5
+        //     v40 = mul v32, Field 6
+        //     v41 = add v39, v40
+        //     call test_function(Field 7, v42)
+        //     v43 = not v0
+        //     enable_side_effects v43
+        //     store Field 3 at v2
+        //     call test_function(Field 3, Field 3)
+        //     call test_function(Field 8, Field 3)
+        //     enable_side_effects Field 1
+        //     v47 = mul v0, v41
+        //     v48 = mul v43, Field 1
+        //     v49 = add v47, v48
+        //     v50 = mul v0, v44
+        //     v51 = mul v43, Field 3
+        //     v52 = add v50, v51
+        //     call test_function(Field 9, v53)
+        //     return v54
+        // }

         let main = ssa.main();
         let ret = match main.dfg[main.entry_block()].terminator() {
@@ -1201,12 +1139,7 @@ mod test {
         };

         let merged_values = get_all_constants_reachable_from_instruction(&main.dfg, ret);
-        assert_eq!(
-            merged_values,
-            vec![FieldElement::from(3u128), FieldElement::from(6u128), -FieldElement::from(1u128)]
-        );
-
-        assert_normalized_ssa_equals(ssa, expected);
+        assert_eq!(merged_values, vec![1, 3, 5, 6]);
     }

     #[test]
@@ -1287,7 +1220,7 @@ mod test {
     fn get_all_constants_reachable_from_instruction(
         dfg: &DataFlowGraph,
         value: ValueId,
-    ) -> Vec<FieldElement> {
+    ) -> Vec<u128> {
         match dfg[value] {
             Value::Instruction { instruction, .. } => {
                 let mut values = vec![];
@@ -1305,7 +1238,7 @@ mod test {
                 values.dedup();
                 values
             }
-            Value::NumericConstant { constant, .. } => vec![constant],
+            Value::NumericConstant { constant, .. } => vec![constant.to_u128()],
             _ => Vec::new(),
         }
     }
@@ -1344,63 +1277,74 @@ mod test {
     fn should_not_merge_incorrectly_to_false() {
         // Regression test for #1792
        // Tests that it does not simplify a true constraint into an always-false constraint
-        // acir(inline) fn main f1 {
-        //   b0(v0: [u8; 2]):
-        //     v5 = array_get v0, index u8 0
-        //     v6 = cast v5 as u32
-        //     v8 = truncate v6 to 1 bits, max_bit_size: 32
-        //     v9 = cast v8 as u1
-        //     v10 = allocate
-        //     store u8 0 at v10
-        //     jmpif v9 then: b2, else: b3
-        //   b2():
-        //     v12 = cast v5 as Field
-        //     v13 = add v12, Field 1
-        //     store v13 at v10
-        //     jmp b4()
-        //   b4():
-        //     constrain v9 == u1 1
-        //     return
-        //   b3():
-        //     store u8 0 at v10
-        //     jmp b4()
-        // }
-        let main_id = Id::test_new(1);
-        let mut builder = FunctionBuilder::new("main".into(), main_id);
-        builder.insert_block(); // b0
-        let b1 = builder.insert_block();
-        let b2 = builder.insert_block();
-        let b3 = builder.insert_block();
-        let element_type = Arc::new(vec![Type::unsigned(8)]);
-        let array_type = Type::Array(element_type.clone(), 2);
-        let array = builder.add_parameter(array_type);
-        let zero = builder.numeric_constant(0_u128, Type::unsigned(8));
-        let v5 = builder.insert_array_get(array, zero, Type::unsigned(8));
-        let v6 = builder.insert_cast(v5, Type::unsigned(32));
-        let i_two = builder.numeric_constant(2_u128, Type::unsigned(32));
-        let v8 = builder.insert_binary(v6, BinaryOp::Mod, i_two);
-        let v9 = builder.insert_cast(v8, Type::bool());
-        let v10 = builder.insert_allocate(Type::field());
-        builder.insert_store(v10, zero);
-        builder.terminate_with_jmpif(v9, b1, b2);
-        builder.switch_to_block(b1);
-        let one = builder.field_constant(1_u128);
-        let v5b = builder.insert_cast(v5, Type::field());
-        let v13: Id<Value> = builder.insert_binary(v5b, BinaryOp::Add, one);
-        let v14 = builder.insert_cast(v13, Type::unsigned(8));
-        builder.insert_store(v10, v14);
-        builder.terminate_with_jmp(b3, vec![]);
-        builder.switch_to_block(b2);
-        builder.insert_store(v10, zero);
-        builder.terminate_with_jmp(b3, vec![]);
-        builder.switch_to_block(b3);
-        let v_true = builder.numeric_constant(true, Type::bool());
-        let v12 = builder.insert_binary(v9, BinaryOp::Eq, v_true);
-        builder.insert_constrain(v12, v_true, None);
-        builder.terminate_with_return(vec![]);
-        let ssa = builder.finish();
+
+        let src = "
+        acir(inline) fn main f0 {
+            b0(v0: [u8; 2]):
+                v2 = array_get v0, index u8 0 -> u8
+                v3 = cast v2 as u32
+                v4 = truncate v3 to 1 bits, max_bit_size: 32
+                v5 = cast v4 as u1
+                v6 = allocate -> &mut Field
+                store u8 0 at v6
+                jmpif v5 then: b2, else: b1
+            b2():
+                v7 = cast v2 as Field
+                v9 = add v7, Field 1
+                v10 = cast v9 as u8
+                store v10 at v6
+                jmp b3()
+            b3():
+                constrain v5 == u1 1
+                return
+            b1():
+                store u8 0 at v6
+                jmp b3()
+        }
+        ";
+
+        let ssa = Ssa::from_str(src).unwrap();
+
+        let expected = "
+        acir(inline) fn main f0 {
+            b0(v0: [u8; 2]):
+                v2 = array_get v0, index u8 0 -> u8
+                v3 = cast v2 as u32
+                v4 = truncate v3 to 1 bits, max_bit_size: 32
+                v5 = cast v4 as u1
+                v6 = allocate -> &mut Field
+                store u8 0 at v6
+                enable_side_effects v5
+                v7 = cast v2 as Field
+                v9 = add v7, Field 1
+                v10 = cast v9 as u8
+                v11 = load v6 -> u8
+                v12 = cast v4 as Field
+                v13 = cast v11 as Field
+                v14 = sub v9, v13
+                v15 = mul v12, v14
+                v16 = add v13, v15
+                v17 = cast v16 as u8
+                store v17 at v6
+                v18 = not v5
+                enable_side_effects v18
+                v19 = load v6 -> u8
+                v20 = cast v18 as Field
+                v21 = cast v19 as Field
+                v23 = sub Field 0, v21
+                v24 = mul v20, v23
+                v25 = add v21, v24
+                v26 = cast v25 as u8
+                store v26 at v6
+                enable_side_effects u1 1
+                constrain v5 == u1 1
+                return
+        }
+        ";
+
         let flattened_ssa = ssa.flatten_cfg();
         let main = flattened_ssa.main();
+
         // Now assert that there is not an always-false constraint after flattening:
         let mut constrain_count = 0;
         for instruction in main.dfg[main.entry_block()].instructions() {
@@ -1414,6 +1358,8 @@ mod test {
             }
         }
         assert_eq!(constrain_count, 1);
+
+        assert_normalized_ssa_equals(flattened_ssa, expected);
     }

     #[test]
@@ -1549,7 +1495,7 @@ mod test {
             b2():
                 return
             b1():
-                jmp b2()
+            jmp b2()
         }
         ";
         let merged_ssa = Ssa::from_str(src).unwrap();
diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/inlining.rs b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/inlining.rs
index 6cf7070e65e..f91487fd73e 100644
--- a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/inlining.rs
+++ b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/inlining.rs
@@ -1089,7 +1089,6 @@ mod test {
     }

     #[test]
-    #[ignore]
     #[should_panic(
         expected = "Attempted to recur more than 1000 times during inlining function 'main': acir(inline) fn main f0 {"
     )]
diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/mem2reg.rs b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/mem2reg.rs
index 0690dbbf204..53a31ae57c1 100644
--- a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/mem2reg.rs
+++ b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/mem2reg.rs
@@ -18,6 +18,7 @@
 //! - A reference with 0 aliases means we were unable to find which reference this reference
 //!   refers to. If such a reference is stored to, we must conservatively invalidate every
 //!   reference in the current block.
+//! - We also track the last load instruction to each address per block.
 //!
 //! From there, to figure out the value of each reference at the end of block, iterate each instruction:
 //! - On `Instruction::Allocate`:
@@ -28,6 +29,13 @@
 //!   - Furthermore, if the result of the load is a reference, mark the result as an alias
 //!     of the reference it dereferences to (if known).
 //!     - If which reference it dereferences to is not known, this load result has no aliases.
+//!   - We also track the last instance of a load instruction to each address in a block.
+//!     If we see that the last load instruction was from the same address as the current load instruction,
+//!     we move to replace the result of the current load with the result of the previous load.
+//!     This removal requires a couple of conditions:
+//!     - No store occurs to that address before the next load,
+//!     - The address is not used as an argument to a call
+//!     This optimization helps us remove repeated loads for which there are no known values.
 //! - On `Instruction::Store { address, value }`:
 //!   - If the address of the store is known:
 //!     - If the address has exactly 1 alias:
@@ -40,11 +48,13 @@
 //!     - Conservatively mark every alias in the block to `Unknown`.
 //!   - Additionally, if there were no Loads to any alias of the address between this Store and
 //!     the previous Store to the same address, the previous store can be removed.
+//!   - Remove the instance of the last load instruction to the address and its aliases
 //! - On `Instruction::Call { arguments }`:
- If any argument of the call is a reference, set the value of each alias of that //! reference to `Unknown` //! - Any builtin functions that may return aliases if their input also contains a //! reference should be tracked. Examples: `slice_push_back`, `slice_insert`, `slice_remove`, etc. +//! - Remove the instance of the last load instruction for any reference arguments and their aliases //! //! On a terminator instruction: //! - If the terminator is a `Jmp`: @@ -274,6 +284,9 @@ impl<'f> PerFunctionContext<'f> { if let Some(first_predecessor) = predecessors.next() { let mut first = self.blocks.get(&first_predecessor).cloned().unwrap_or_default(); first.last_stores.clear(); + // Last loads are tracked per block. During unification we are creating a new block from the current one, + // so we must clear the last loads of the current block before we return the new block. + first.last_loads.clear(); // Note that we have to start folding with the first block as the accumulator. // If we started with an empty block, an empty block union'd with any other block @@ -410,6 +423,28 @@ impl<'f> PerFunctionContext<'f> { self.last_loads.insert(address, (instruction, block_id)); } + + // Check whether the block has a repeat load from the same address (with no calls or stores in between the loads). + // If we do have a repeat load, we can remove the current load and map its result to the previous load's result. + if let Some(last_load) = references.last_loads.get(&address) { + let Instruction::Load { address: previous_address } = + &self.inserter.function.dfg[*last_load] + else { + panic!("Expected a Load instruction here"); + }; + let result = self.inserter.function.dfg.instruction_results(instruction)[0]; + let previous_result = + self.inserter.function.dfg.instruction_results(*last_load)[0]; + if *previous_address == address { + self.inserter.map_value(result, previous_result); + self.instructions_to_remove.insert(instruction); + } + } + // We want to set the last load for every load, even if the address has a known value + // and the previous load instruction was removed. + // We are safe to still remove a repeat load in this case as we are mapping from the current load's + // result to the previous load, which if it was removed should already have a mapping to the known value. + references.set_last_load(address, instruction); } Instruction::Store { address, value } => { let address = self.inserter.function.dfg.resolve(*address); @@ -435,6 +470,8 @@ impl<'f> PerFunctionContext<'f> { } references.set_known_value(address, value); + // If we see a store to an address, the last load to that address needs to remain. + references.keep_last_load_for(address, self.inserter.function); references.last_stores.insert(address, instruction); } Instruction::Allocate => { @@ -542,6 +579,9 @@ impl<'f> PerFunctionContext<'f> { let value = self.inserter.function.dfg.resolve(*value); references.set_unknown(value); references.mark_value_used(value, self.inserter.function); + + // If a reference is an argument to a call, the last load to that address and its aliases need to remain. + references.keep_last_load_for(value, self.inserter.function); } } } @@ -572,6 +612,12 @@ impl<'f> PerFunctionContext<'f> { let destination_parameters = self.inserter.function.dfg[*destination].parameters(); assert_eq!(destination_parameters.len(), arguments.len()); + // If we have multiple parameters that alias the same argument value, + // then those parameters also alias each other.
+ // We save parameters with repeated arguments to later mark those + // parameters as aliasing one another. + let mut arg_set: HashMap<ValueId, BTreeSet<ValueId>> = HashMap::default(); + // Add an alias for each reference parameter for (parameter, argument) in destination_parameters.iter().zip(arguments) { if self.inserter.function.dfg.value_is_reference(*parameter) { @@ -581,10 +627,27 @@ impl<'f> PerFunctionContext<'f> { if let Some(aliases) = references.aliases.get_mut(expression) { // The argument reference is possibly aliased by this block parameter aliases.insert(*parameter); + + // Check if we have seen the same argument + let seen_parameters = arg_set.entry(argument).or_default(); + // Add the current parameter to the parameters we have seen for this argument. + // The previous parameters and the current one alias one another. + seen_parameters.insert(*parameter); } } } } + + // Set the aliases of the parameters + for (_, aliased_params) in arg_set { + for param in aliased_params.iter() { + self.set_aliases( + references, + *param, + AliasSet::known_multiple(aliased_params.clone()), + ); + } + } } TerminatorInstruction::Return { return_values, .. } => { // Removing all `last_stores` for each returned reference is more important here @@ -612,6 +675,8 @@ mod tests { map::Id, types::Type, }, + opt::assert_normalized_ssa_equals, + Ssa, }; #[test] @@ -822,88 +887,53 @@ mod tests { // is later stored in a successor block #[test] fn load_aliases_in_predecessor_block() { - // fn main { - // b0(): - // v0 = allocate - // store Field 0 at v0 - // v2 = allocate - // store v0 at v2 - // v3 = load v2 - // v4 = load v2 - // jmp b1() - // b1(): - // store Field 1 at v3 - // store Field 2 at v4 - // v7 = load v3 - // v8 = eq v7, Field 2 - // return - // } - let main_id = Id::test_new(0); - let mut builder = FunctionBuilder::new("main".into(), main_id); - - let v0 = builder.insert_allocate(Type::field()); - - let zero = builder.field_constant(0u128); - builder.insert_store(v0, zero); - - let v2 = builder.insert_allocate(Type::Reference(Arc::new(Type::field()))); - builder.insert_store(v2, v0); - - let v3 = builder.insert_load(v2, Type::field()); - let v4 = builder.insert_load(v2, Type::field()); - let b1 = builder.insert_block(); - builder.terminate_with_jmp(b1, vec![]); - - builder.switch_to_block(b1); - - let one = builder.field_constant(1u128); - builder.insert_store(v3, one); - - let two = builder.field_constant(2u128); - builder.insert_store(v4, two); - - let v8 = builder.insert_load(v3, Type::field()); - let _ = builder.insert_binary(v8, BinaryOp::Eq, two); - - builder.terminate_with_return(vec![]); - - let ssa = builder.finish(); - assert_eq!(ssa.main().reachable_blocks().len(), 2); + let src = " + acir(inline) fn main f0 { + b0(): + v0 = allocate -> &mut Field + store Field 0 at v0 + v2 = allocate -> &mut &mut Field + store v0 at v2 + v3 = load v2 -> &mut Field + v4 = load v2 -> &mut Field + jmp b1() + b1(): + store Field 1 at v3 + store Field 2 at v4 + v7 = load v3 -> Field + v8 = eq v7, Field 2 + return + } + "; - // Expected result: - // acir fn main f0 { - // b0(): - // v9 = allocate - // store Field 0 at v9 - // v10 = allocate - // jmp b1() - // b1(): - // return - // } - let ssa = ssa.mem2reg(); - println!("{}", ssa); + let mut ssa = Ssa::from_str(src).unwrap(); + let main = ssa.main_mut(); - let main = ssa.main(); - assert_eq!(main.reachable_blocks().len(), 2); + let instructions = main.dfg[main.entry_block()].instructions(); + assert_eq!(instructions.len(), 6); // The final return is not counted // All loads
should be removed - assert_eq!(count_loads(main.entry_block(), &main.dfg), 0); - assert_eq!(count_loads(b1, &main.dfg), 0); - // The first store is not removed as it is used as a nested reference in another store. - // We would need to track whether the store where `v9` is the store value gets removed to know whether + // We would need to track whether the store where `v0` is the store value gets removed to know whether // to remove it. - assert_eq!(count_stores(main.entry_block(), &main.dfg), 1); // The first store in b1 is removed since there is another store to the same reference // in the same block, and the store is not needed before the later store. // The rest of the stores are also removed as no loads are done within any blocks // to the stored values. - assert_eq!(count_stores(b1, &main.dfg), 0); - - let b1_instructions = main.dfg[b1].instructions(); + let expected = " + acir(inline) fn main f0 { + b0(): + v0 = allocate -> &mut Field + store Field 0 at v0 + v2 = allocate -> &mut &mut Field + jmp b1() + b1(): + return + } + "; - // We expect the last eq to be optimized out - assert_eq!(b1_instructions.len(), 0); + let ssa = ssa.mem2reg(); + assert_normalized_ssa_equals(ssa, expected); } #[test] @@ -933,7 +963,7 @@ mod tests { // v10 = eq v9, Field 2 // constrain v9 == Field 2 // v11 = load v2 - // v12 = load v10 + // v12 = load v11 // v13 = eq v12, Field 2 // constrain v11 == Field 2 // return @@ -992,7 +1022,7 @@ mod tests { let main = ssa.main(); assert_eq!(main.reachable_blocks().len(), 4); - // The store from the original SSA should remain + // The stores from the original SSA should remain assert_eq!(count_stores(main.entry_block(), &main.dfg), 2); assert_eq!(count_stores(b2, &main.dfg), 1); @@ -1039,4 +1069,160 @@ mod tests { let main = ssa.main(); assert_eq!(count_loads(main.entry_block(), &main.dfg), 1); } + + #[test] + fn remove_repeat_loads() { + // This test starts with two loads from the same address whose value is unknown. + // Specifically, you should look for `load v2` in `b3`. + // We should be able to remove the second repeated load. + let src = " + acir(inline) fn main f0 { + b0(): + v0 = allocate -> &mut Field + store Field 0 at v0 + v2 = allocate -> &mut &mut Field + store v0 at v2 + jmp b1(Field 0) + b1(v3: Field): + v4 = eq v3, Field 0 + jmpif v4 then: b2, else: b3 + b2(): + v5 = load v2 -> &mut Field + store Field 2 at v5 + v8 = add v3, Field 1 + jmp b1(v8) + b3(): + v9 = load v0 -> Field + v10 = eq v9, Field 2 + constrain v9 == Field 2 + v11 = load v2 -> &mut Field + v12 = load v2 -> &mut Field + v13 = load v12 -> Field + v14 = eq v13, Field 2 + constrain v13 == Field 2 + return + } + "; + + let ssa = Ssa::from_str(src).unwrap(); + + // The repeated load from v3 should be removed + // b3 should only have three loads now rather than four previously + // + // All stores are expected to remain.
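To make the rule this test exercises concrete, here is a minimal, self-contained sketch of the per-block last-load bookkeeping described in the module docs above; `Address`, `LoadId`, `Instr`, and `Action` are illustrative stand-ins rather than the pass's real types, and alias tracking plus cross-block unification are elided:

    use std::collections::HashMap;

    type Address = u32;
    type LoadId = u32;

    enum Instr {
        Load { id: LoadId, address: Address },
        Store { address: Address },
        Call { ref_args: Vec<Address> },
    }

    enum Action {
        Keep,
        ReplaceWith(LoadId),
    }

    // One step of the per-block scan: record loads, and invalidate the record
    // whenever a store or a call could change the value behind the address.
    fn on_instruction(last_loads: &mut HashMap<Address, LoadId>, instr: &Instr) -> Action {
        match instr {
            // Loading an address that was already loaded, with no intervening
            // store to it and no call taking it as an argument, reuses the
            // earlier result; the map insert tells us whether such a load exists.
            Instr::Load { id, address } => match last_loads.insert(*address, *id) {
                Some(previous) => Action::ReplaceWith(previous),
                None => Action::Keep,
            },
            // A store invalidates the recorded last load for that address
            // (the real pass also invalidates every alias of the address).
            Instr::Store { address } => {
                last_loads.remove(address);
                Action::Keep
            }
            // A callee may store through any reference argument, so the records
            // for those addresses (and their aliases) must be dropped too.
            Instr::Call { ref_args } => {
                for address in ref_args {
                    last_loads.remove(address);
                }
                Action::Keep
            }
        }
    }

Under this model, the second `load v2` in `b3` maps to the first one's result and is removed, which is exactly what the expected SSA below asserts; the two follow-up tests then check that an intervening call or aliased store keeps the loads alive.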
+ let expected = " + acir(inline) fn main f0 { + b0(): + v1 = allocate -> &mut Field + store Field 0 at v1 + v3 = allocate -> &mut &mut Field + store v1 at v3 + jmp b1(Field 0) + b1(v0: Field): + v4 = eq v0, Field 0 + jmpif v4 then: b3, else: b2 + b3(): + v11 = load v3 -> &mut Field + store Field 2 at v11 + v13 = add v0, Field 1 + jmp b1(v13) + b2(): + v5 = load v1 -> Field + v7 = eq v5, Field 2 + constrain v5 == Field 2 + v8 = load v3 -> &mut Field + v9 = load v8 -> Field + v10 = eq v9, Field 2 + constrain v9 == Field 2 + return + } + "; + + let ssa = ssa.mem2reg(); + assert_normalized_ssa_equals(ssa, expected); + } + + #[test] + fn keep_repeat_loads_passed_to_a_call() { + // The test is the exact same as `remove_repeat_loads` above except with the call + // to `f1` between the repeated loads. + let src = " + acir(inline) fn main f0 { + b0(): + v1 = allocate -> &mut Field + store Field 0 at v1 + v3 = allocate -> &mut &mut Field + store v1 at v3 + jmp b1(Field 0) + b1(v0: Field): + v4 = eq v0, Field 0 + jmpif v4 then: b3, else: b2 + b3(): + v13 = load v3 -> &mut Field + store Field 2 at v13 + v15 = add v0, Field 1 + jmp b1(v15) + b2(): + v5 = load v1 -> Field + v7 = eq v5, Field 2 + constrain v5 == Field 2 + v8 = load v3 -> &mut Field + call f1(v3) + v10 = load v3 -> &mut Field + v11 = load v10 -> Field + v12 = eq v11, Field 2 + constrain v11 == Field 2 + return + } + acir(inline) fn foo f1 { + b0(v0: &mut Field): + return + } + "; + + let ssa = Ssa::from_str(src).unwrap(); + + let ssa = ssa.mem2reg(); + // We expect the program to be unchanged + assert_normalized_ssa_equals(ssa, src); + } + + #[test] + fn keep_repeat_loads_with_alias_store() { + // v7, v8, and v9 alias one another. We want to make sure that a repeat load to v7 with a store + // to its aliases in between the repeat loads does not remove those loads. + let src = " + acir(inline) fn main f0 { + b0(v0: u1): + jmpif v0 then: b2, else: b1 + b2(): + v6 = allocate -> &mut Field + store Field 0 at v6 + jmp b3(v6, v6, v6) + b3(v1: &mut Field, v2: &mut Field, v3: &mut Field): + v8 = load v1 -> Field + store Field 2 at v2 + v10 = load v1 -> Field + store Field 1 at v3 + v11 = load v1 -> Field + store Field 3 at v3 + v13 = load v1 -> Field + constrain v8 == Field 0 + constrain v10 == Field 2 + constrain v11 == Field 1 + constrain v13 == Field 3 + return + b1(): + v4 = allocate -> &mut Field + store Field 1 at v4 + jmp b3(v4, v4, v4) + } + "; + + let ssa = Ssa::from_str(src).unwrap(); + + let ssa = ssa.mem2reg(); + // We expect the program to be unchanged + assert_normalized_ssa_equals(ssa, src); + } } diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs index 777c16dacd1..5883ce25936 100644 --- a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs +++ b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs @@ -19,8 +19,10 @@ //! When unrolling ACIR code, we remove reference count instructions because they are //! only used by Brillig bytecode. use acvm::{acir::AcirField, FieldElement}; +use im::HashSet; use crate::{ + brillig::brillig_gen::convert_ssa_function, errors::RuntimeError, ssa::{ ir::{ @@ -37,38 +39,60 @@ use crate::{ ssa_gen::Ssa, }, }; -use fxhash::{FxHashMap as HashMap, FxHashSet as HashSet}; +use fxhash::FxHashMap as HashMap; impl Ssa { /// Loop unrolling can return errors, since ACIR functions need to be fully unrolled. 
/// This meta-pass will keep trying to unroll loops, simplifying the SSA in between attempts, until no more errors are found. - #[tracing::instrument(level = "trace", skip(ssa))] - pub(crate) fn unroll_loops_iteratively(mut ssa: Ssa) -> Result<Ssa, RuntimeError> { - for (_, function) in ssa.functions.iter_mut() { + /// + /// The `max_bytecode_increase_percent`, when given, is used to limit the growth of the Brillig bytecode size + /// after unrolling small loops to some percentage of the original loop. For example, a value of 150 would + /// mean the new loop can be 150% (i.e. 2.5 times) larger than the original loop. The unrolled loop will still contain + /// fewer SSA instructions, but those can still expand into more Brillig opcodes. + #[tracing::instrument(level = "trace", skip(self))] + pub(crate) fn unroll_loops_iteratively( + mut self: Ssa, + max_bytecode_increase_percent: Option<i32>, + ) -> Result<Ssa, RuntimeError> { + for (_, function) in self.functions.iter_mut() { + // Take a snapshot of the function to compare the bytecode size increase, + // but only if the setting indicates we have to, otherwise skip it. + let orig_func_and_max_incr_pct = max_bytecode_increase_percent + .filter(|_| function.runtime().is_brillig()) + .map(|max_incr_pct| (function.clone(), max_incr_pct)); + // Try to unroll loops first: - let mut unroll_errors = function.try_unroll_loops(); + let (mut has_unrolled, mut unroll_errors) = function.try_unroll_loops(); // Keep unrolling until no more errors are found while !unroll_errors.is_empty() { let prev_unroll_err_count = unroll_errors.len(); // Simplify the SSA before retrying - - // Do a mem2reg after the last unroll to aid simplify_cfg - function.mem2reg(); - function.simplify_function(); - // Do another mem2reg after simplify_cfg to aid the next unroll - function.mem2reg(); + simplify_between_unrolls(function); // Unroll again - unroll_errors = function.try_unroll_loops(); + let (new_unrolled, new_errors) = function.try_unroll_loops(); + unroll_errors = new_errors; + has_unrolled |= new_unrolled; + // If we didn't manage to unroll any more loops, exit if unroll_errors.len() >= prev_unroll_err_count { return Err(unroll_errors.swap_remove(0)); } } + + if has_unrolled { + if let Some((orig_function, max_incr_pct)) = orig_func_and_max_incr_pct { + let new_size = brillig_bytecode_size(function); + let orig_size = brillig_bytecode_size(&orig_function); + if !is_new_size_ok(orig_size, new_size, max_incr_pct) { + *function = orig_function; + } + } + } } - Ok(ssa) + Ok(self) } } @@ -77,7 +101,7 @@ impl Function { // This can also be true for ACIR, but we have no alternative to unrolling in ACIR. // Brillig also generally prefers smaller code rather than faster code, // so we only attempt to unroll small loops, which we decide on a case-by-case basis. - fn try_unroll_loops(&mut self) -> Vec<RuntimeError> { + fn try_unroll_loops(&mut self) -> (bool, Vec<RuntimeError>) { Loops::find_all(self).unroll_each(self) } } @@ -170,8 +194,10 @@ impl Loops { /// Unroll all loops within a given function. /// Any loops which fail to be unrolled (due to using non-constant indices) will be unmodified.
- fn unroll_each(mut self, function: &mut Function) -> Vec<RuntimeError> { + /// Returns whether any blocks have been modified. + fn unroll_each(mut self, function: &mut Function) -> (bool, Vec<RuntimeError>) { let mut unroll_errors = vec![]; + let mut has_unrolled = false; while let Some(next_loop) = self.yet_to_unroll.pop() { if function.runtime().is_brillig() && !next_loop.is_small_loop(function, &self.cfg) { continue; } @@ -181,13 +207,17 @@ impl Loops { if next_loop.blocks.iter().any(|block| self.modified_blocks.contains(block)) { let mut new_loops = Self::find_all(function); new_loops.failed_to_unroll = self.failed_to_unroll; - return unroll_errors.into_iter().chain(new_loops.unroll_each(function)).collect(); + let (new_unrolled, new_errors) = new_loops.unroll_each(function); + return (has_unrolled || new_unrolled, [unroll_errors, new_errors].concat()); } // Don't try to unroll the loop again if it is known to fail if !self.failed_to_unroll.contains(&next_loop.header) { match next_loop.unroll(function, &self.cfg) { - Ok(_) => self.modified_blocks.extend(next_loop.blocks), + Ok(_) => { + has_unrolled = true; + self.modified_blocks.extend(next_loop.blocks); + } Err(call_stack) => { self.failed_to_unroll.insert(next_loop.header); unroll_errors.push(RuntimeError::UnknownLoopBound { call_stack }); @@ -195,7 +225,7 @@ } } } - unroll_errors + (has_unrolled, unroll_errors) } } @@ -947,21 +977,59 @@ impl<'f> LoopIteration<'f> { } } +/// Unrolling leaves some duplicate instructions which can potentially be removed. +fn simplify_between_unrolls(function: &mut Function) { + // Do a mem2reg after the last unroll to aid simplify_cfg + function.mem2reg(); + function.simplify_function(); + // Do another mem2reg after simplify_cfg to aid the next unroll + function.mem2reg(); +} + +/// Convert the function to Brillig bytecode and return the resulting size. +fn brillig_bytecode_size(function: &Function) -> usize { + // We need to do some SSA passes in order for the conversion to be able to go ahead, + // otherwise we can hit `unreachable!()` instructions in `convert_ssa_instruction`. + // Creating a clone so as not to modify the original. + let mut temp = function.clone(); + + // Might as well give it the best chance. + simplify_between_unrolls(&mut temp); + + // This is to try to prevent hitting an ICE (internal compiler error). + temp.dead_instruction_elimination(false); + + convert_ssa_function(&temp, false).byte_code.len() +} + +/// Decide if the new bytecode size is acceptable, compared to the original. +/// +/// The maximum increase can be expressed as a negative value if we demand a decrease. +/// (Values of -100 and under mean the new size should be 0.) +fn is_new_size_ok(orig_size: usize, new_size: usize, max_incr_pct: i32) -> bool { + let max_size_pct = 100i32.saturating_add(max_incr_pct).max(0) as usize; + let max_size = orig_size.saturating_mul(max_size_pct); + new_size.saturating_mul(100) <= max_size +} + #[cfg(test)] mod tests { use acvm::FieldElement; + use test_case::test_case; use crate::errors::RuntimeError; use crate::ssa::{ir::value::ValueId, opt::assert_normalized_ssa_equals, Ssa}; - use super::{BoilerplateStats, Loops}; + use super::{is_new_size_ok, BoilerplateStats, Loops}; - /// Tries to unroll all loops in each SSA function. + /// Tries to unroll all loops in each SSA function once, calling the `Function` directly, + /// bypassing the iterative loop done by the SSA, which does further optimizations. + /// /// If any loop cannot be unrolled, it is left as-is or in a partially unrolled state.
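As a worked example of the `is_new_size_ok` arithmetic just defined (a sketch only; the numbers deliberately mirror the `test_case` table at the end of this module):

    // orig_size = 1000, max_incr_pct = 50:
    //   max_size_pct = 100 + 50 = 150, max_size = 1000 * 150 = 150_000,
    //   and we accept while new_size * 100 <= 150_000, i.e. new_size <= 1500.
    assert!(is_new_size_ok(1000, 1500, 50));
    assert!(!is_new_size_ok(1000, 1501, 50));
    // A negative percentage demands a shrink: max_incr_pct = -50 gives
    // max_size_pct = 50, so the new bytecode must be at most half the original.
    assert!(!is_new_size_ok(1000, 700, -50));
    // At -100 and below, max_size_pct saturates to 0, so only new_size = 0 passes.
    assert!(is_new_size_ok(1000, 0, -120));
    assert!(!is_new_size_ok(1000, 1, -120));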
fn try_unroll_loops(mut ssa: Ssa) -> (Ssa, Vec<RuntimeError>) { let mut errors = vec![]; for function in ssa.functions.values_mut() { - errors.extend(function.try_unroll_loops()); + errors.extend(function.try_unroll_loops().1); } (ssa, errors) } @@ -1221,9 +1289,26 @@ mod tests { let (ssa, errors) = try_unroll_loops(ssa); assert_eq!(errors.len(), 0, "Unroll should have no errors"); + // Check that it's still the original assert_normalized_ssa_equals(ssa, parse_ssa().to_string().as_str()); } + #[test] + fn test_brillig_unroll_iteratively_respects_max_increase() { + let ssa = brillig_unroll_test_case(); + let ssa = ssa.unroll_loops_iteratively(Some(-90)).unwrap(); + // Check that it's still the original + assert_normalized_ssa_equals(ssa, brillig_unroll_test_case().to_string().as_str()); + } + + #[test] + fn test_brillig_unroll_iteratively_with_large_max_increase() { + let ssa = brillig_unroll_test_case(); + let ssa = ssa.unroll_loops_iteratively(Some(50)).unwrap(); + // Check that it did the unroll + assert_eq!(ssa.main().reachable_blocks().len(), 2, "The loop should be unrolled"); + } + /// Test that `break` and `continue` stop unrolling without any panic. #[test] fn test_brillig_unroll_break_and_continue() { @@ -1377,4 +1462,14 @@ mod tests { let loop0 = loops.yet_to_unroll.pop().expect("there should be a loop"); loop0.boilerplate_stats(function, &loops.cfg).expect("there should be stats") } + + #[test_case(1000, 700, 50, true; "size decreased")] + #[test_case(1000, 1500, 50, true; "size increased just by the max")] + #[test_case(1000, 1501, 50, false; "size increased over the max")] + #[test_case(1000, 700, -50, false; "size decreased but not enough")] + #[test_case(1000, 250, -50, true; "size decreased over expectations")] + #[test_case(1000, 250, -1250, false; "demanding more than minus 100 is handled")] + fn test_is_new_size_ok(old: usize, new: usize, max: i32, ok: bool) { + assert_eq!(is_new_size_ok(old, new, max), ok); + } } diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ssa_gen/context.rs b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ssa_gen/context.rs index 0c6041029da..e39eed79021 100644 --- a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ssa_gen/context.rs +++ b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ssa_gen/context.rs @@ -172,6 +172,7 @@ impl<'a> FunctionContext<'a> { /// Always returns a Value::Mutable wrapping the allocate instruction. pub(super) fn new_mutable_variable(&mut self, value_to_store: ValueId) -> Value { let element_type = self.builder.current_function.dfg.type_of_value(value_to_store); + self.builder.increment_array_reference_count(value_to_store); let alloc = self.builder.insert_allocate(element_type); self.builder.insert_store(alloc, value_to_store); let typ = self.builder.type_of_value(value_to_store); @@ -732,10 +733,6 @@ impl<'a> FunctionContext<'a> { let element_types = Self::convert_type(element_type); values.map_both(element_types, |value, element_type| { let reference = value.eval_reference(); - // Reference counting in brillig relies on us incrementing reference - // counts when arrays/slices are constructed or indexed. - // Thus, if we dereference an lvalue which happens to be array/slice we should increment its reference counter.
- self.builder.increment_array_reference_count(reference); + self.builder.insert_load(reference, element_type).into() }) } @@ -916,7 +913,10 @@ impl<'a> FunctionContext<'a> { let parameters = self.builder.current_function.dfg.block_parameters(entry).to_vec(); for parameter in parameters { - self.builder.increment_array_reference_count(parameter); + // Avoid reference counts for immutable arrays that aren't behind references. + if self.builder.current_function.dfg.value_is_reference(parameter) { + self.builder.increment_array_reference_count(parameter); + } } entry @@ -933,7 +933,9 @@ impl<'a> FunctionContext<'a> { dropped_parameters.retain(|parameter| !terminator_args.contains(parameter)); for parameter in dropped_parameters { - self.builder.decrement_array_reference_count(parameter); + if self.builder.current_function.dfg.value_is_reference(parameter) { + self.builder.decrement_array_reference_count(parameter); + } } } diff --git a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ssa_gen/mod.rs b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ssa_gen/mod.rs index c50f0a7f45c..d28236bd360 100644 --- a/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ssa_gen/mod.rs +++ b/noir/noir-repo/compiler/noirc_evaluator/src/ssa/ssa_gen/mod.rs @@ -665,12 +665,11 @@ impl<'a> FunctionContext<'a> { values = values.map(|value| { let value = value.eval(self); - // Make sure to increment array reference counts on each let binding - self.builder.increment_array_reference_count(value); - Tree::Leaf(if let_expr.mutable { self.new_mutable_variable(value) } else { + // `new_mutable_variable` already increments reference counts internally + self.builder.increment_array_reference_count(value); value::Value::Normal(value) }) }); diff --git a/noir/noir-repo/noir_stdlib/src/hash/poseidon2.nr b/noir/noir-repo/noir_stdlib/src/hash/poseidon2.nr index f2167c43c2c..419f07a2aca 100644 --- a/noir/noir-repo/noir_stdlib/src/hash/poseidon2.nr +++ b/noir/noir-repo/noir_stdlib/src/hash/poseidon2.nr @@ -13,11 +13,7 @@ pub struct Poseidon2 { impl Poseidon2 { #[no_predicates] pub fn hash<let N: u32>(input: [Field; N], message_size: u32) -> Field { - if message_size == N { - Poseidon2::hash_internal(input, N, false) - } else { - Poseidon2::hash_internal(input, message_size, true) - } + Poseidon2::hash_internal(input, message_size, message_size != N) } pub(crate) fn new(iv: Field) -> Poseidon2 { diff --git a/noir/noir-repo/test_programs/execution_success/inline_decompose_hint_brillig_call/Nargo.toml b/noir/noir-repo/test_programs/execution_success/inline_decompose_hint_brillig_call/Nargo.toml new file mode 100644 index 00000000000..ecac2dfb197 --- /dev/null +++ b/noir/noir-repo/test_programs/execution_success/inline_decompose_hint_brillig_call/Nargo.toml @@ -0,0 +1,7 @@ +[package] +name = "inline_decompose_hint_brillig_call" +version = "0.1.0" +type = "bin" +authors = [""] + +[dependencies] diff --git a/noir/noir-repo/test_programs/execution_success/inline_decompose_hint_brillig_call/src/main.nr b/noir/noir-repo/test_programs/execution_success/inline_decompose_hint_brillig_call/src/main.nr new file mode 100644 index 00000000000..e500f0f976d --- /dev/null +++ b/noir/noir-repo/test_programs/execution_success/inline_decompose_hint_brillig_call/src/main.nr @@ -0,0 +1,15 @@ +use std::embedded_curve_ops::{EmbeddedCurvePoint, EmbeddedCurveScalar, fixed_base_scalar_mul}; + +fn main() -> pub Field { + let pre_address = 0x23d95e303879a5d0bbef78ecbc335e559da37431f6dcd11da54ed375c2846813; + let (a, b) = std::field::bn254::decompose(pre_address); + let curve =
EmbeddedCurveScalar { lo: a, hi: b }; + let key = fixed_base_scalar_mul(curve); + let point = EmbeddedCurvePoint { + x: 0x111223493147f6785514b1c195bb37a2589f22a6596d30bb2bb145fdc9ca8f1e, + y: 0x273bbffd678edce8fe30e0deafc4f66d58357c06fd4a820285294b9746c3be95, + is_infinite: false, + }; + let address_point = key.add(point); + address_point.x +} diff --git a/noir/noir-repo/tooling/nargo_cli/build.rs b/noir/noir-repo/tooling/nargo_cli/build.rs index 740e5ed2052..f0334eaf713 100644 --- a/noir/noir-repo/tooling/nargo_cli/build.rs +++ b/noir/noir-repo/tooling/nargo_cli/build.rs @@ -213,8 +213,13 @@ fn test_{test_name}(force_brillig: ForceBrillig, inliner_aggressiveness: Inliner nargo.arg("--program-dir").arg(test_program_dir); nargo.arg("{test_command}").arg("--force"); nargo.arg("--inliner-aggressiveness").arg(inliner_aggressiveness.0.to_string()); + if force_brillig.0 {{ nargo.arg("--force-brillig"); + + // Set the maximum increase so that part of the optimization is exercised (it might fail). + nargo.arg("--max-bytecode-increase-percent"); + nargo.arg("50"); }} {test_content} diff --git a/noir/noir-repo/tooling/noirc_abi_wasm/build.sh b/noir/noir-repo/tooling/noirc_abi_wasm/build.sh index c07d2d8a4c1..16fb26e55db 100755 --- a/noir/noir-repo/tooling/noirc_abi_wasm/build.sh +++ b/noir/noir-repo/tooling/noirc_abi_wasm/build.sh @@ -25,7 +25,7 @@ function run_if_available { require_command jq require_command cargo require_command wasm-bindgen -#require_command wasm-opt +require_command wasm-opt self_path=$(dirname "$(readlink -f "$0")") pname=$(cargo read-manifest | jq -r '.name') diff --git a/noir/noir-repo/yarn.lock b/noir/noir-repo/yarn.lock index 3c8df2b1772..f7b7b3df372 100644 --- a/noir/noir-repo/yarn.lock +++ b/noir/noir-repo/yarn.lock @@ -221,20 +221,20 @@ __metadata: languageName: node linkType: hard -"@aztec/bb.js@portal:../../../../barretenberg/ts::locator=integration-tests%40workspace%3Acompiler%2Fintegration-tests": - version: 0.0.0-use.local - resolution: "@aztec/bb.js@portal:../../../../barretenberg/ts::locator=integration-tests%40workspace%3Acompiler%2Fintegration-tests" +"@aztec/bb.js@npm:0.63.1": + version: 0.63.1 + resolution: "@aztec/bb.js@npm:0.63.1" dependencies: comlink: ^4.4.1 commander: ^10.0.1 debug: ^4.3.4 fflate: ^0.8.0 - pako: ^2.1.0 tslib: ^2.4.0 bin: - bb.js: ./dest/node/main.js + bb.js: dest/node/main.js + checksum: b80730f1cb87e4d2ca21d991a42950bc069367896db309ab3f909c5f53efa9291538d51e35bc3c6d2eea042ca33c279ae59eb3f5d844a24336c7bb9664c2404b languageName: node - linkType: soft + linkType: hard "@babel/code-frame@npm:^7.0.0, @babel/code-frame@npm:^7.10.4, @babel/code-frame@npm:^7.12.11, @babel/code-frame@npm:^7.16.0, @babel/code-frame@npm:^7.22.13, @babel/code-frame@npm:^7.23.5, @babel/code-frame@npm:^7.8.3": version: 7.23.5 @@ -14123,7 +14123,7 @@ __metadata: version: 0.0.0-use.local resolution: "integration-tests@workspace:compiler/integration-tests" dependencies: - "@aztec/bb.js": "portal:../../../../barretenberg/ts" + "@aztec/bb.js": 0.63.1 "@noir-lang/noir_js": "workspace:*" "@noir-lang/noir_wasm": "workspace:*" "@nomicfoundation/hardhat-chai-matchers": ^2.0.0