From 8ded16eba9c7056b81136a31cd029d6904fd8214 Mon Sep 17 00:00:00 2001 From: oleh Date: Tue, 17 Sep 2024 03:30:15 +0200 Subject: [PATCH 01/10] feat: initial noir support (#1) --- packages/compiler/src/bin/compiler.rs | 12 ++- packages/compiler/src/lib.rs | 11 +++ packages/compiler/src/noir.rs | 115 ++++++++++++++++++++++++++ 3 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 packages/compiler/src/noir.rs diff --git a/packages/compiler/src/bin/compiler.rs b/packages/compiler/src/bin/compiler.rs index ba53749..9591fab 100644 --- a/packages/compiler/src/bin/compiler.rs +++ b/packages/compiler/src/bin/compiler.rs @@ -60,12 +60,14 @@ enum Commands { Decomposed { #[arg(short, long)] decomposed_regex_path: String, - #[arg(short, long)] + #[arg(long)] halo2_dir_path: Option, #[arg(short, long)] circom_file_path: Option, #[arg(short, long)] template_name: Option, + #[arg(long)] + noir_file_path: Option, #[arg(short, long)] gen_substrs: Option, }, @@ -74,12 +76,14 @@ enum Commands { raw_regex: String, #[arg(short, long)] substrs_json_path: Option, - #[arg(short, long)] + #[arg(long)] halo2_dir_path: Option, #[arg(short, long)] circom_file_path: Option, #[arg(short, long)] template_name: Option, + #[arg(long)] + noir_file_path: Option, #[arg(short, long)] gen_substrs: Option, }, @@ -99,6 +103,7 @@ fn process_decomposed(cli: Cli) { halo2_dir_path, circom_file_path, template_name, + noir_file_path, gen_substrs, } = cli.command { @@ -107,6 +112,7 @@ fn process_decomposed(cli: Cli) { halo2_dir_path.as_deref(), circom_file_path.as_deref(), template_name.as_deref(), + noir_file_path.as_deref(), gen_substrs, ) { eprintln!("Error: {}", e); @@ -122,6 +128,7 @@ fn process_raw(cli: Cli) { halo2_dir_path, circom_file_path, template_name, + noir_file_path, gen_substrs, } = cli.command { @@ -131,6 +138,7 @@ fn process_raw(cli: Cli) { halo2_dir_path.as_deref(), circom_file_path.as_deref(), template_name.as_deref(), + noir_file_path.as_deref(), gen_substrs, ) { eprintln!("Error: {}", e); diff --git a/packages/compiler/src/lib.rs b/packages/compiler/src/lib.rs index 3a7fa04..ca1bc04 100644 --- a/packages/compiler/src/lib.rs +++ b/packages/compiler/src/lib.rs @@ -1,6 +1,7 @@ mod circom; mod errors; mod halo2; +mod noir; mod regex; mod structs; mod wasm; @@ -9,6 +10,7 @@ use circom::gen_circom_template; use errors::CompilerError; use halo2::gen_halo2_tables; use itertools::Itertools; +use noir::gen_noir_fn; use regex::{create_regex_and_dfa_from_str_and_defs, get_regex_and_dfa}; use std::{fs::File, path::PathBuf}; use structs::{DecomposedRegexConfig, RegexAndDFA, SubstringDefinitionsJson}; @@ -55,6 +57,7 @@ fn generate_outputs( halo2_dir_path: Option<&str>, circom_file_path: Option<&str>, circom_template_name: Option<&str>, + noir_file_path: Option<&str>, num_public_parts: usize, gen_substrs: bool, ) -> Result<(), CompilerError> { @@ -86,6 +89,10 @@ fn generate_outputs( )?; } + if let Some(noir_file_path) = noir_file_path { + gen_noir_fn(regex_and_dfa, &PathBuf::from(noir_file_path))?; + } + Ok(()) } @@ -107,6 +114,7 @@ pub fn gen_from_decomposed( halo2_dir_path: Option<&str>, circom_file_path: Option<&str>, circom_template_name: Option<&str>, + noir_file_path: Option<&str>, gen_substrs: Option, ) -> Result<(), CompilerError> { let mut decomposed_regex_config: DecomposedRegexConfig = @@ -126,6 +134,7 @@ pub fn gen_from_decomposed( halo2_dir_path, circom_file_path, circom_template_name, + noir_file_path, num_public_parts, gen_substrs, )?; @@ -153,6 +162,7 @@ pub fn gen_from_raw( halo2_dir_path: Option<&str>, circom_file_path: Option<&str>, template_name: Option<&str>, + noir_file_path: Option<&str>, gen_substrs: Option, ) -> Result<(), CompilerError> { let substrs_defs_json = load_substring_definitions_json(substrs_json_path)?; @@ -167,6 +177,7 @@ pub fn gen_from_raw( halo2_dir_path, circom_file_path, template_name, + noir_file_path, num_public_parts, gen_substrs, )?; diff --git a/packages/compiler/src/noir.rs b/packages/compiler/src/noir.rs new file mode 100644 index 0000000..0abc8d1 --- /dev/null +++ b/packages/compiler/src/noir.rs @@ -0,0 +1,115 @@ +use std::{collections::HashSet, fs::File, io::Write, iter::FromIterator, path::Path}; + +use itertools::Itertools; + +use crate::structs::RegexAndDFA; + +const ACCEPT_STATE_ID: &str = "accept"; + +pub fn gen_noir_fn(regex_and_dfa: &RegexAndDFA, path: &Path) -> Result<(), std::io::Error> { + let noir_fn = to_noir_fn(regex_and_dfa); + let mut file = File::create(path)?; + file.write_all(noir_fn.as_bytes())?; + file.flush()?; + Ok(()) +} + +fn to_noir_fn(regex_and_dfa: &RegexAndDFA) -> String { + let accept_state_ids = { + let accept_states = regex_and_dfa + .dfa + .states + .iter() + .filter(|s| s.state_type == ACCEPT_STATE_ID) + .map(|s| s.state_id) + .collect_vec(); + assert!(accept_states.len() > 0, "no accept states"); + accept_states + }; + + const BYTE_SIZE: u32 = 256; // u8 size + let mut lookup_table_body = String::new(); + + // curr_state + char_code -> next_state + let mut rows: Vec<(usize, u8, usize)> = vec![]; + + for state in regex_and_dfa.dfa.states.iter() { + for (&tran_next_state_id, tran) in &state.transitions { + for &char_code in tran { + rows.push((state.state_id, char_code, tran_next_state_id)); + } + } + if state.state_type == ACCEPT_STATE_ID { + let existing_char_codes = &state + .transitions + .iter() + .flat_map(|(_, tran)| tran.iter().copied().collect_vec()) + .collect::>(); + let all_char_codes = HashSet::from_iter(0..=255); + let mut char_codes = all_char_codes.difference(existing_char_codes).collect_vec(); + char_codes.sort(); // to be deterministic + for &char_code in char_codes { + rows.push((state.state_id, char_code, state.state_id)); + } + } + } + + for (curr_state_id, char_code, next_state_id) in rows { + lookup_table_body += + &format!("table[{curr_state_id} * {BYTE_SIZE} + {char_code}] = {next_state_id};\n",); + } + + lookup_table_body = indent(&lookup_table_body); + let table_size = BYTE_SIZE as usize * regex_and_dfa.dfa.states.len(); + let lookup_table = format!( + r#" +comptime fn make_lookup_table() -> [Field; {table_size}] {{ + let mut table = [0; {table_size}]; +{lookup_table_body} + + table +}} + "# + ); + + let final_states_condition_body = accept_state_ids + .iter() + .map(|id| format!("(s == {id})")) + .collect_vec() + .join(" | "); + let fn_body = format!( + r#" +global table = comptime {{ make_lookup_table() }}; +pub fn regex_match(input: [u8; N]) {{ + // regex: {regex_pattern} + let mut s = 0; + for i in 0..input.len() {{ + s = table[s * {BYTE_SIZE} + input[i] as Field]; + }} + assert({final_states_condition_body}, f"no match: {{s}}"); +}} + "#, + regex_pattern = regex_and_dfa.regex_pattern, + ); + format!( + r#" + {fn_body} + {lookup_table} + "# + ) + .trim() + .to_owned() +} + +fn indent(s: &str) -> String { + s.split("\n") + .map(|s| { + if s.trim().is_empty() { + s.to_owned() + } else { + format!("{}{}", " ", s) + } + }) + .collect::>() + .join("\n") +} From db2e12904b73f9d3a3d2846760fb4c2d4e3d6fb7 Mon Sep 17 00:00:00 2001 From: Elena Fuentes Bongenaar Date: Tue, 17 Sep 2024 17:45:21 -0600 Subject: [PATCH 02/10] Added gen_substrs support for Noir. This works both in decomposed & raw setting. The substrings are returned as BoundedVec since we don't know their exact length upfront, but we know they're not longer than N. To support both settings (decomposed and raw) we have to use `substring_ranges` instead of `substring_boundaries`. --- packages/compiler/src/lib.rs | 2 +- packages/compiler/src/noir.rs | 113 +++++++++++++++++++++++++++++----- 2 files changed, 98 insertions(+), 17 deletions(-) diff --git a/packages/compiler/src/lib.rs b/packages/compiler/src/lib.rs index ca1bc04..98aa0d6 100644 --- a/packages/compiler/src/lib.rs +++ b/packages/compiler/src/lib.rs @@ -90,7 +90,7 @@ fn generate_outputs( } if let Some(noir_file_path) = noir_file_path { - gen_noir_fn(regex_and_dfa, &PathBuf::from(noir_file_path))?; + gen_noir_fn(regex_and_dfa, &PathBuf::from(noir_file_path), gen_substrs)?; } Ok(()) diff --git a/packages/compiler/src/noir.rs b/packages/compiler/src/noir.rs index 0abc8d1..339ce41 100644 --- a/packages/compiler/src/noir.rs +++ b/packages/compiler/src/noir.rs @@ -1,20 +1,32 @@ -use std::{collections::HashSet, fs::File, io::Write, iter::FromIterator, path::Path}; - -use itertools::Itertools; - use crate::structs::RegexAndDFA; +use itertools::Itertools; +use std::{collections::HashSet, fs::File, io::Write, iter::FromIterator, path::Path}; const ACCEPT_STATE_ID: &str = "accept"; -pub fn gen_noir_fn(regex_and_dfa: &RegexAndDFA, path: &Path) -> Result<(), std::io::Error> { - let noir_fn = to_noir_fn(regex_and_dfa); +pub fn gen_noir_fn( + regex_and_dfa: &RegexAndDFA, + path: &Path, + gen_substrs: bool, +) -> Result<(), std::io::Error> { + let noir_fn = to_noir_fn(regex_and_dfa, gen_substrs); let mut file = File::create(path)?; file.write_all(noir_fn.as_bytes())?; file.flush()?; Ok(()) } -fn to_noir_fn(regex_and_dfa: &RegexAndDFA) -> String { +/// Generates Noir code based on the DFA and whether a substring should be extracted. +/// +/// # Arguments +/// +/// * `regex_and_dfa` - The `RegexAndDFA` struct containing the regex pattern and DFA. +/// * `gen_substrs` - A boolean indicating whether to generate substrings. +/// +/// # Returns +/// +/// A `String` that contains the Noir code +fn to_noir_fn(regex_and_dfa: &RegexAndDFA, gen_substrs: bool) -> String { let accept_state_ids = { let accept_states = regex_and_dfa .dfa @@ -59,7 +71,7 @@ fn to_noir_fn(regex_and_dfa: &RegexAndDFA) -> String { &format!("table[{curr_state_id} * {BYTE_SIZE} + {char_code}] = {next_state_id};\n",); } - lookup_table_body = indent(&lookup_table_body); + lookup_table_body = indent(&lookup_table_body, 1); let table_size = BYTE_SIZE as usize * regex_and_dfa.dfa.states.len(); let lookup_table = format!( r#" @@ -72,13 +84,78 @@ comptime fn make_lookup_table() -> [Field; {table_size}] {{ "# ); + // substring_ranges contains the transitions that belong to the substring. + // in Noir we only need to know in what state the substring needs to be extracted, the transitions are not needed + // Example: SubstringDefinitions { substring_ranges: [{(2, 3)}, {(6, 7), (7, 7)}, {(8, 9)}], substring_boundaries: None } + // for each substring, get the first transition and get the end state + let substr_states: Vec = regex_and_dfa + .substrings + .substring_ranges + .iter() + .flat_map(|range_set| range_set.iter().next().map(|&(_, end_state)| end_state)) // Extract the second element (end state) of each tuple + .collect(); + // Note: substring_boundaries is only filled if the substring info is coming from decomposed setting + // and will be empty in the raw setting (using json file for substr transitions). This is why substring_ranges is used here + let final_states_condition_body = accept_state_ids .iter() .map(|id| format!("(s == {id})")) .collect_vec() .join(" | "); - let fn_body = format!( - r#" + + // If substrings have to be extracted, the function returns that amount of BoundedVec, + // otherwise there is no return type + let fn_body = if gen_substrs { + let nr_substrs = substr_states.len(); + // Initialize a substring BoundedVec for each substr that has to be extracted + let mut bounded_vecs_initialization = (0..nr_substrs) + .map(|index| format!("let mut substr{} = BoundedVec::new();", index)) + .collect::>() + .join("\n"); + bounded_vecs_initialization = indent(&bounded_vecs_initialization, 1); // Indent once for inside the function + + // Fill each substring when at the corresponding state + let mut conditions = substr_states + .iter() + .enumerate() + .map(|(index, state)| { + format!( + "if (s == {state}) {{ + substr{index_plus_one}.push(temp); +}}", + index_plus_one = index + ) + }) + .collect::>() + .join("\n"); + conditions = indent(&conditions, 2); // Indent twice to align with the for loop's body + + format!( + r#" +global table = comptime {{ make_lookup_table() }}; +pub fn regex_match(input: [u8; N]) -> [BoundedVec; {nr_substrs}] {{ + // regex: {regex_pattern} + let mut s = 0; + +{bounded_vecs_initialization} + + for i in 0..input.len() {{ + let temp = input[i] as Field; + s = table[s * {BYTE_SIZE} + input[i] as Field]; +{conditions} + }} + assert({final_states_condition_body}, f"no match: {{s}}"); + [{bounded_vec_names}] +}}"#, + regex_pattern = regex_and_dfa.regex_pattern, + bounded_vec_names = (0..nr_substrs) + .map(|index| format!("substr{}", index)) + .collect::>() + .join(", "), + ) + } else { + format!( + r#" global table = comptime {{ make_lookup_table() }}; pub fn regex_match(input: [u8; N]) {{ // regex: {regex_pattern} @@ -87,10 +164,11 @@ pub fn regex_match(input: [u8; N]) {{ s = table[s * {BYTE_SIZE} + input[i] as Field]; }} assert({final_states_condition_body}, f"no match: {{s}}"); -}} - "#, - regex_pattern = regex_and_dfa.regex_pattern, - ); +}}"#, + regex_pattern = regex_and_dfa.regex_pattern, + ) + }; + format!( r#" {fn_body} @@ -101,13 +179,16 @@ pub fn regex_match(input: [u8; N]) {{ .to_owned() } -fn indent(s: &str) -> String { +/// Indents each line of the given string by a specified number of levels. +/// Each level adds four spaces to the beginning of non-whitespace lines. +fn indent(s: &str, level: usize) -> String { + let indent_str = " ".repeat(level); s.split("\n") .map(|s| { if s.trim().is_empty() { s.to_owned() } else { - format!("{}{}", " ", s) + format!("{}{}", indent_str, s) } }) .collect::>() From 33736caddf5e99c465c44862822010ed7d9efcb4 Mon Sep 17 00:00:00 2001 From: Elena Fuentes Bongenaar Date: Tue, 17 Sep 2024 17:52:16 -0600 Subject: [PATCH 03/10] Set gen_substrs to false by default for raw. --- packages/compiler/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/compiler/src/lib.rs b/packages/compiler/src/lib.rs index 98aa0d6..afc7195 100644 --- a/packages/compiler/src/lib.rs +++ b/packages/compiler/src/lib.rs @@ -170,7 +170,7 @@ pub fn gen_from_raw( let regex_and_dfa = create_regex_and_dfa_from_str_and_defs(raw_regex, substrs_defs_json)?; - let gen_substrs = gen_substrs.unwrap_or(true); + let gen_substrs = gen_substrs.unwrap_or(false); generate_outputs( ®ex_and_dfa, From f7ae1862443688e559074a50cb018b6f81b303c3 Mon Sep 17 00:00:00 2001 From: Elena Fuentes Bongenaar Date: Thu, 19 Sep 2024 12:22:46 -0600 Subject: [PATCH 04/10] Per state 1 or more substrings will be extracted, depending on the regex and input. This fix makes sure this is supported. Changes: - regex_match returns a Vec of substrings instead of an array with known length - per state where substrings have to be extracted; add the byte either to a new substring or an already started one Note that substr_count is used to extract the correct "current" substring from the Vec. This is a workaround - first implementation was using `pop` but this gave an error. --- packages/compiler/src/noir.rs | 59 ++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/packages/compiler/src/noir.rs b/packages/compiler/src/noir.rs index 339ce41..36440cc 100644 --- a/packages/compiler/src/noir.rs +++ b/packages/compiler/src/noir.rs @@ -103,28 +103,30 @@ comptime fn make_lookup_table() -> [Field; {table_size}] {{ .collect_vec() .join(" | "); - // If substrings have to be extracted, the function returns that amount of BoundedVec, + // If substrings have to be extracted, the function returns a vector of BoundedVec // otherwise there is no return type let fn_body = if gen_substrs { - let nr_substrs = substr_states.len(); - // Initialize a substring BoundedVec for each substr that has to be extracted - let mut bounded_vecs_initialization = (0..nr_substrs) - .map(|index| format!("let mut substr{} = BoundedVec::new();", index)) - .collect::>() - .join("\n"); - bounded_vecs_initialization = indent(&bounded_vecs_initialization, 1); // Indent once for inside the function // Fill each substring when at the corresponding state + // Per state potentially multiple substrings should be extracted + // The code keeps track of whether a substring was already in the making, or a new one is started let mut conditions = substr_states .iter() - .enumerate() - .map(|(index, state)| { + .map(|state| { format!( - "if (s == {state}) {{ - substr{index_plus_one}.push(temp); -}}", - index_plus_one = index - ) + "if ((s_next == {state}) & (consecutive_substr == 0)) {{ + let mut substr0 = BoundedVec::new(); + substr0.push(temp); + substrings.push(substr0); + consecutive_substr = 1; + substr_count += 1; +}} else if ((s_next == {state}) & (s == {state})) {{ + let mut current: BoundedVec = substrings.get(substr_count - 1); + current.push(temp); + substrings.set(substr_count - 1, current); +}} else if (s == {state}) {{ + consecutive_substr = 0; +}}") }) .collect::>() .join("\n"); @@ -133,25 +135,30 @@ comptime fn make_lookup_table() -> [Field; {table_size}] {{ format!( r#" global table = comptime {{ make_lookup_table() }}; -pub fn regex_match(input: [u8; N]) -> [BoundedVec; {nr_substrs}] {{ +pub fn regex_match(input: [u8; N]) -> Vec> {{ // regex: {regex_pattern} - let mut s = 0; - -{bounded_vecs_initialization} + let mut substrings: Vec> = Vec::new(); + // Workaround for pop bug with Vec + let mut substr_count = 0; + + // "Previous" state + let mut s: Field = 0; + // "Next"/upcoming state + let mut s_next: Field = 0; + + let mut consecutive_substr = 0; for i in 0..input.len() {{ let temp = input[i] as Field; - s = table[s * {BYTE_SIZE} + input[i] as Field]; + s_next = table[s * 256 + temp]; + // Fill up substrings {conditions} + s = s_next; }} assert({final_states_condition_body}, f"no match: {{s}}"); - [{bounded_vec_names}] + substrings }}"#, - regex_pattern = regex_and_dfa.regex_pattern, - bounded_vec_names = (0..nr_substrs) - .map(|index| format!("substr{}", index)) - .collect::>() - .join(", "), + regex_pattern = regex_and_dfa.regex_pattern ) } else { format!( From 3a853e70030f52343a9e0a6057344e49da2abe18 Mon Sep 17 00:00:00 2001 From: Elena Fuentes Bongenaar Date: Thu, 26 Sep 2024 13:16:13 -0600 Subject: [PATCH 05/10] Take transitions into account for extracting substrings. --- packages/compiler/src/noir.rs | 90 +++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/packages/compiler/src/noir.rs b/packages/compiler/src/noir.rs index 36440cc..a521334 100644 --- a/packages/compiler/src/noir.rs +++ b/packages/compiler/src/noir.rs @@ -1,3 +1,7 @@ +use std::{collections::BTreeSet, fs::File, io::Write, path::Path}; + +use itertools::Itertools; + use crate::structs::RegexAndDFA; use itertools::Itertools; use std::{collections::HashSet, fs::File, io::Write, iter::FromIterator, path::Path}; @@ -84,16 +88,8 @@ comptime fn make_lookup_table() -> [Field; {table_size}] {{ "# ); - // substring_ranges contains the transitions that belong to the substring. - // in Noir we only need to know in what state the substring needs to be extracted, the transitions are not needed - // Example: SubstringDefinitions { substring_ranges: [{(2, 3)}, {(6, 7), (7, 7)}, {(8, 9)}], substring_boundaries: None } - // for each substring, get the first transition and get the end state - let substr_states: Vec = regex_and_dfa - .substrings - .substring_ranges - .iter() - .flat_map(|range_set| range_set.iter().next().map(|&(_, end_state)| end_state)) // Extract the second element (end state) of each tuple - .collect(); + // substring_ranges contains the transitions that belong to the substring + let substr_ranges: &Vec> = ®ex_and_dfa.substrings.substring_ranges; // Note: substring_boundaries is only filled if the substring info is coming from decomposed setting // and will be empty in the raw setting (using json file for substr transitions). This is why substring_ranges is used here @@ -106,31 +102,55 @@ comptime fn make_lookup_table() -> [Field; {table_size}] {{ // If substrings have to be extracted, the function returns a vector of BoundedVec // otherwise there is no return type let fn_body = if gen_substrs { - - // Fill each substring when at the corresponding state - // Per state potentially multiple substrings should be extracted - // The code keeps track of whether a substring was already in the making, or a new one is started - let mut conditions = substr_states - .iter() - .map(|state| { - format!( - "if ((s_next == {state}) & (consecutive_substr == 0)) {{ - let mut substr0 = BoundedVec::new(); - substr0.push(temp); - substrings.push(substr0); - consecutive_substr = 1; - substr_count += 1; -}} else if ((s_next == {state}) & (s == {state})) {{ - let mut current: BoundedVec = substrings.get(substr_count - 1); - current.push(temp); - substrings.set(substr_count - 1, current); -}} else if (s == {state}) {{ - consecutive_substr = 0; -}}") - }) - .collect::>() - .join("\n"); - conditions = indent(&conditions, 2); // Indent twice to align with the for loop's body + let mut first_condition = true; + + let mut conditions = substr_ranges + .iter() + .enumerate() + .map(|(set_idx, range_set)| { + // Combine the range conditions into a single line using `|` operator + let range_conditions = range_set + .iter() + .map(|(range_start, range_end)| format!("(s == {range_start}) & (s_next == {range_end})")) + .collect::>() + .join(" | "); + + // For the first condition, use `if`, for others, use `else if` + let start_part = if first_condition { + first_condition = false; + "if" + } else { + "else if" + }; + + // The body of the condition handling substring creation/updating + format!( + "{start_part} ({range_conditions}) {{ + if (consecutive_substr == 0) {{ + let mut substr{set_idx} = BoundedVec::new(); + substr{set_idx}.push(temp); + substrings.push(substr{set_idx}); + consecutive_substr = 1; + substr_count += 1; + }} else if (consecutive_substr == 1) {{ + let mut current: BoundedVec = substrings.get(substr_count - 1); + current.push(temp); + substrings.set(substr_count - 1, current); + }} +}}" + ) + }) + .collect::>() + .join("\n"); + + // Add the final else if for resetting the consecutive_substr + let final_conditions = format!( + "{conditions} else if (consecutive_substr == 1) {{ + consecutive_substr = 0; +}}" + ); + + conditions = indent(&final_conditions, 2); // Indent twice to align with the for loop's body format!( r#" From 8e6aa14e1d87d9c3fe9bcafe0065d6bf35faed11 Mon Sep 17 00:00:00 2001 From: Elena Fuentes Bongenaar Date: Mon, 30 Sep 2024 12:03:01 -0600 Subject: [PATCH 06/10] $ and ^ support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For caret anchor: Mark beginning of input byte array with 255, which makes the check for caret anchor (ˆ) works. Note that ^ is only taken into consideration in the decomposed mode. --- packages/compiler/src/noir.rs | 128 ++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 53 deletions(-) diff --git a/packages/compiler/src/noir.rs b/packages/compiler/src/noir.rs index a521334..e974e53 100644 --- a/packages/compiler/src/noir.rs +++ b/packages/compiler/src/noir.rs @@ -1,10 +1,11 @@ -use std::{collections::BTreeSet, fs::File, io::Write, path::Path}; +use std::{ + collections::BTreeSet, collections::HashSet, fs::File, io::Write, iter::FromIterator, + path::Path, +}; use itertools::Itertools; use crate::structs::RegexAndDFA; -use itertools::Itertools; -use std::{collections::HashSet, fs::File, io::Write, iter::FromIterator, path::Path}; const ACCEPT_STATE_ID: &str = "accept"; @@ -25,13 +26,16 @@ pub fn gen_noir_fn( /// # Arguments /// /// * `regex_and_dfa` - The `RegexAndDFA` struct containing the regex pattern and DFA. -/// * `gen_substrs` - A boolean indicating whether to generate substrings. +/// * `gen_substrs` - A boolean indicating whether to generate substrings. /// /// # Returns /// /// A `String` that contains the Noir code fn to_noir_fn(regex_and_dfa: &RegexAndDFA, gen_substrs: bool) -> String { - let accept_state_ids = { + // Multiple accepting states are not supported + // This is a vector nonetheless, to support an extra accepting state we'll use + // to allow any character occurrences after the original accepting state + let mut accept_state_ids: Vec = { let accept_states = regex_and_dfa .dfa .states @@ -39,7 +43,10 @@ fn to_noir_fn(regex_and_dfa: &RegexAndDFA, gen_substrs: bool) -> String { .filter(|s| s.state_type == ACCEPT_STATE_ID) .map(|s| s.state_id) .collect_vec(); - assert!(accept_states.len() > 0, "no accept states"); + assert!( + accept_states.len() == 1, + "there should be exactly 1 accept state" + ); accept_states }; @@ -49,25 +56,33 @@ fn to_noir_fn(regex_and_dfa: &RegexAndDFA, gen_substrs: bool) -> String { // curr_state + char_code -> next_state let mut rows: Vec<(usize, u8, usize)> = vec![]; + // $ support + // In case that there is no end_anchor, we add an additional accepting state to which any + // character occurence after the accepting state will go. + // This needs to be a new state, otherwise substring extraction won't work correctly + if !regex_and_dfa.has_end_anchor { + let original_accept_id = accept_state_ids.get(0).unwrap().clone(); + // Create a new highest state + let extra_accept_id = regex_and_dfa + .dfa + .states + .iter() + .max_by_key(|state| state.state_id) + .map(|state| state.state_id) + .unwrap() + + 1; + accept_state_ids.push(extra_accept_id); + for char_code in 0..=254 { + rows.push((original_accept_id, char_code, extra_accept_id)); + rows.push((extra_accept_id, char_code, extra_accept_id)); + } + } for state in regex_and_dfa.dfa.states.iter() { for (&tran_next_state_id, tran) in &state.transitions { for &char_code in tran { rows.push((state.state_id, char_code, tran_next_state_id)); } } - if state.state_type == ACCEPT_STATE_ID { - let existing_char_codes = &state - .transitions - .iter() - .flat_map(|(_, tran)| tran.iter().copied().collect_vec()) - .collect::>(); - let all_char_codes = HashSet::from_iter(0..=255); - let mut char_codes = all_char_codes.difference(existing_char_codes).collect_vec(); - char_codes.sort(); // to be deterministic - for &char_code in char_codes { - rows.push((state.state_id, char_code, state.state_id)); - } - } } for (curr_state_id, char_code, next_state_id) in rows { @@ -76,7 +91,10 @@ fn to_noir_fn(regex_and_dfa: &RegexAndDFA, gen_substrs: bool) -> String { } lookup_table_body = indent(&lookup_table_body, 1); - let table_size = BYTE_SIZE as usize * regex_and_dfa.dfa.states.len(); + let mut table_size = BYTE_SIZE as usize * regex_and_dfa.dfa.states.len(); + if !regex_and_dfa.has_end_anchor { + table_size += BYTE_SIZE as usize; + } let lookup_table = format!( r#" comptime fn make_lookup_table() -> [Field; {table_size}] {{ @@ -102,30 +120,32 @@ comptime fn make_lookup_table() -> [Field; {table_size}] {{ // If substrings have to be extracted, the function returns a vector of BoundedVec // otherwise there is no return type let fn_body = if gen_substrs { - let mut first_condition = true; - - let mut conditions = substr_ranges - .iter() - .enumerate() - .map(|(set_idx, range_set)| { - // Combine the range conditions into a single line using `|` operator - let range_conditions = range_set - .iter() - .map(|(range_start, range_end)| format!("(s == {range_start}) & (s_next == {range_end})")) - .collect::>() - .join(" | "); - - // For the first condition, use `if`, for others, use `else if` - let start_part = if first_condition { - first_condition = false; - "if" - } else { - "else if" - }; - - // The body of the condition handling substring creation/updating - format!( - "{start_part} ({range_conditions}) {{ + let mut first_condition = true; + + let mut conditions = substr_ranges + .iter() + .enumerate() + .map(|(set_idx, range_set)| { + // Combine the range conditions into a single line using `|` operator + let range_conditions = range_set + .iter() + .map(|(range_start, range_end)| { + format!("(s == {range_start}) & (s_next == {range_end})") + }) + .collect::>() + .join(" | "); + + // For the first condition, use `if`, for others, use `else if` + let start_part = if first_condition { + first_condition = false; + "if" + } else { + "else if" + }; + + // The body of the condition handling substring creation/updating + format!( + "{start_part} ({range_conditions}) {{ if (consecutive_substr == 0) {{ let mut substr{set_idx} = BoundedVec::new(); substr{set_idx}.push(temp); @@ -138,17 +158,17 @@ comptime fn make_lookup_table() -> [Field; {table_size}] {{ substrings.set(substr_count - 1, current); }} }}" - ) - }) - .collect::>() - .join("\n"); - - // Add the final else if for resetting the consecutive_substr - let final_conditions = format!( - "{conditions} else if (consecutive_substr == 1) {{ + ) + }) + .collect::>() + .join("\n"); + + // Add the final else if for resetting the consecutive_substr + let final_conditions = format!( + "{conditions} else if (consecutive_substr == 1) {{ consecutive_substr = 0; }}" - ); + ); conditions = indent(&final_conditions, 2); // Indent twice to align with the for loop's body @@ -163,6 +183,7 @@ pub fn regex_match(input: [u8; N]) -> Vec> {{ // "Previous" state let mut s: Field = 0; + s = table[255]; // "Next"/upcoming state let mut s_next: Field = 0; @@ -187,6 +208,7 @@ global table = comptime {{ make_lookup_table() }}; pub fn regex_match(input: [u8; N]) {{ // regex: {regex_pattern} let mut s = 0; + s = table[255]; for i in 0..input.len() {{ s = table[s * {BYTE_SIZE} + input[i] as Field]; }} From 9f3acd6424dba02e8ec18da5ec7d95173f3cd5bd Mon Sep 17 00:00:00 2001 From: Elena Fuentes Bongenaar Date: Mon, 30 Sep 2024 18:20:07 -0600 Subject: [PATCH 07/10] Added the "reset" flow when a shortcut is made from any state to the states reachable from state 0. Substrings only get saved when they are part of a path that doesn't reset. --- packages/compiler/src/noir.rs | 43 ++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/packages/compiler/src/noir.rs b/packages/compiler/src/noir.rs index e974e53..d6dd8ef 100644 --- a/packages/compiler/src/noir.rs +++ b/packages/compiler/src/noir.rs @@ -124,8 +124,7 @@ comptime fn make_lookup_table() -> [Field; {table_size}] {{ let mut conditions = substr_ranges .iter() - .enumerate() - .map(|(set_idx, range_set)| { + .map(|range_set| { // Combine the range conditions into a single line using `|` operator let range_conditions = range_set .iter() @@ -147,16 +146,11 @@ comptime fn make_lookup_table() -> [Field; {table_size}] {{ format!( "{start_part} ({range_conditions}) {{ if (consecutive_substr == 0) {{ - let mut substr{set_idx} = BoundedVec::new(); - substr{set_idx}.push(temp); - substrings.push(substr{set_idx}); + current_substring.push(temp); consecutive_substr = 1; - substr_count += 1; }} else if (consecutive_substr == 1) {{ - let mut current: BoundedVec = substrings.get(substr_count - 1); - current.push(temp); - substrings.set(substr_count - 1, current); - }} + current_substring.push(temp); + }} }}" ) }) @@ -165,7 +159,14 @@ comptime fn make_lookup_table() -> [Field; {table_size}] {{ // Add the final else if for resetting the consecutive_substr let final_conditions = format!( - "{conditions} else if (consecutive_substr == 1) {{ + "{conditions} else if ((consecutive_substr == 1) & (s_next == 0)) {{ + current_substring = BoundedVec::new(); + consecutive_substr = 0; +}} else if (consecutive_substr == 1) {{ + // The substring is done so \"save\" it + substrings.push(current_substring); + // reset the substring holder for next use + current_substring = BoundedVec::new(); consecutive_substr = 0; }}" ); @@ -188,15 +189,35 @@ pub fn regex_match(input: [u8; N]) -> Vec> {{ let mut s_next: Field = 0; let mut consecutive_substr = 0; + let mut current_substring = BoundedVec::new(); for i in 0..input.len() {{ let temp = input[i] as Field; + let mut reset = false; s_next = table[s * 256 + temp]; + if s_next == 0 {{ + // Check if there is any transition that could be done from a "restart" + s_next = table[temp]; + // whether the next state changes or not, we mark this as a reset. + reset = true; + s = 0; + }} + + // If a substring was in the making, but the state was reset + // we disregard previous progress because apparently it is invalid + if (reset & (consecutive_substr == 1)) {{ + current_substring = BoundedVec::new(); + consecutive_substr = 0; + }} // Fill up substrings {conditions} s = s_next; }} assert({final_states_condition_body}, f"no match: {{s}}"); + // Add pending substring that hasn't been added + if consecutive_substr == 1 {{ + substrings.push(current_substring); + }} substrings }}"#, regex_pattern = regex_and_dfa.regex_pattern From 41378909099cc124d8d3b899cf8309d77a6602aa Mon Sep 17 00:00:00 2001 From: Elena Fuentes Bongenaar Date: Tue, 1 Oct 2024 10:38:21 -0600 Subject: [PATCH 08/10] Removed old code --- packages/compiler/src/noir.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages/compiler/src/noir.rs b/packages/compiler/src/noir.rs index d6dd8ef..80862d2 100644 --- a/packages/compiler/src/noir.rs +++ b/packages/compiler/src/noir.rs @@ -179,8 +179,6 @@ global table = comptime {{ make_lookup_table() }}; pub fn regex_match(input: [u8; N]) -> Vec> {{ // regex: {regex_pattern} let mut substrings: Vec> = Vec::new(); - // Workaround for pop bug with Vec - let mut substr_count = 0; // "Previous" state let mut s: Field = 0; From cd14d891e9b74cd058942a9c7e19461b17c9f804 Mon Sep 17 00:00:00 2001 From: Elena Fuentes Bongenaar Date: Tue, 1 Oct 2024 17:26:23 -0600 Subject: [PATCH 09/10] Escape newline and carriage return characters in regex patterns --- packages/compiler/src/noir.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/compiler/src/noir.rs b/packages/compiler/src/noir.rs index 80862d2..3651895 100644 --- a/packages/compiler/src/noir.rs +++ b/packages/compiler/src/noir.rs @@ -218,7 +218,7 @@ pub fn regex_match(input: [u8; N]) -> Vec> {{ }} substrings }}"#, - regex_pattern = regex_and_dfa.regex_pattern + regex_pattern = regex_and_dfa.regex_pattern.replace('\n', "\\n").replace('\r', "\\r") ) } else { format!( @@ -233,7 +233,7 @@ pub fn regex_match(input: [u8; N]) {{ }} assert({final_states_condition_body}, f"no match: {{s}}"); }}"#, - regex_pattern = regex_and_dfa.regex_pattern, + regex_pattern = regex_and_dfa.regex_pattern.replace('\n', "\\n").replace('\r', "\\r"), ) }; From 01231c2855d60cbb5acde53451d006d0c6c6d7ff Mon Sep 17 00:00:00 2001 From: Elena Fuentes Bongenaar Date: Thu, 5 Dec 2024 16:29:44 -0600 Subject: [PATCH 10/10] Noir support edit --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 58c06b5..0a5f7e8 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # zk-regex -A library to compile regex verification in circom. Explained on [our blog post](https://prove.email/blog/zkregex). You can use regex to specify how to parse an email in a ZK Email proof when defining a new patterm on [the ZK Email SDK registry](https://sdk.prove.email/). Noir coming soon. +A library to compile regex verification in circom. Explained on [our blog post](https://prove.email/blog/zkregex). You can use regex to specify how to parse an email in a ZK Email proof when defining a new patterm on [the ZK Email SDK registry](https://sdk.prove.email/). Noir support is also available.