From b85bd8a761427786550cd77f0cb1e1f8824e3974 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:07:24 +0900 Subject: [PATCH 01/25] refactor: avoid char (partial) --- package/origlang-compiler/src/lexer.rs | 302 ++++++++----------- package/origlang-compiler/src/lexer/error.rs | 3 + 2 files changed, 130 insertions(+), 175 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 9e79ad8a..b7824177 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -83,40 +83,30 @@ impl Lexer { } fn drain_space(&self) { - while !self.reached_end() && matches!(self.current_char().expect("drain_space"), ' ' | '\t') { - self.consume_char().unwrap(); + while !self.reached_end() && (self.try_str(" ").unwrap() == Some(" ") || self.try_str("\t").unwrap() == Some("\t")) { + self.advance_bytes(1).expect("?!") } } - fn try_char(&self, t: char) -> Result, LexerError> { - trace!("lexer:try:{t:?}"); - if !self.reached_end() && self.current_char()? == t { - self.consume_char()?; - Ok(Some(t)) - } else { - Ok(None) - } - } - - fn try_char_peek(&self, t: char) -> Result, LexerError> { - trace!("lexer:try:{t}"); - if !self.reached_end() && self.current_char()? == t { - Ok(Some(t)) + fn try_str<'s>(&self, s: &'s str) -> Result, LexerError> { + trace!("lexer:try:{s:?}"); + let start = self.source_bytes_nth.get(); + let end_exclusive = start.as_usize() + s.len(); + if let Some(b) = self.source.get((start.as_usize())..end_exclusive) { + if s == b { + self.set_current_index(Utf8CharBoundaryStartByte::new(end_exclusive))?; + Ok(Some(s)) + } else { + Ok(None) + } } else { - Ok(None) + Err(LexerError::OutOfRange { + current: start, + max: self.source.len(), + }) } } - fn try_any(&self, t: &[char]) -> Result, LexerError> { - for c in t { - if let Some(x) = self.try_char_peek(*c)? { - return Ok(Some(x)) - } - } - - Ok(None) - } - #[allow(clippy::too_many_lines, clippy::unnecessary_wraps)] fn next_inner(&self) -> Result { macro_rules! fold { @@ -134,12 +124,12 @@ impl Lexer { } else { None } - .or_else(|| self.try_char('\n').expect("huh?").map(|_| Token::NewLine)) + .or_else(|| self.try_str(r#"\n"#).expect("huh?").map(|_| Token::NewLine)) .or_else(|| fold!( - self.try_char('=').expect("huh?"), + self.try_str(r#"="#).expect("huh?"), { - let double_eq = self.try_char('=').expect("huh?"); + let double_eq = self.try_str(r#"="#).expect("huh?"); if double_eq.is_some() { Some(Token::PartEqEq) } else { @@ -149,31 +139,31 @@ impl Lexer { None ) ) - .or_else(|| self.try_char('+').expect("huh?").map(|_| Token::SymPlus)) - .or_else(|| self.try_char('-').expect("huh?").map(|_| Token::SymMinus)) - .or_else(|| self.try_char('*').expect("huh?").map(|_| Token::SymAsterisk)) + .or_else(|| self.try_str(r#"+"#).expect("huh?").map(|_| Token::SymPlus)) + .or_else(|| self.try_str(r#"-"#).expect("huh?").map(|_| Token::SymMinus)) + .or_else(|| self.try_str(r#"*"#).expect("huh?").map(|_| Token::SymAsterisk)) .or_else(|| fold!( - self.try_char('/').expect("huh?"), + self.try_str(r#"/"#).expect("huh?"), fold!( - self.try_char('/').expect("huh?"), + self.try_str(r#"/"#).expect("huh?"), Some(self.scan_line_comment().expect("unable to parse comment")), Some(Token::SymSlash) ), None ) ) - .or_else(|| self.try_char('(').expect("huh?").map(|_| Token::SymLeftPar)) - .or_else(|| self.try_char(')').expect("huh?").map(|_| Token::SymRightPar)) + .or_else(|| self.try_str(r#"("#).expect("huh?").map(|_| Token::SymLeftPar)) + .or_else(|| self.try_str(r#")"#).expect("huh?").map(|_| Token::SymRightPar)) .or_else(|| { - if let Some(_) = self.try_char('<').expect("huh?") { - if let Some(_) = self.try_char('=').expect("huh?") { - if let Some(_) = self.try_char('>').expect("huh?") { + if let Some(_) = self.try_str(r#"<"#).expect("huh?") { + if let Some(_) = self.try_str(r#"="#).expect("huh?") { + if let Some(_) = self.try_str(r#">"#).expect("huh?") { Some(Token::PartLessEqMore) } else { Some(Token::PartLessEq) } - } else if let Some(_) = self.try_char('<').expect("huh?") { + } else if let Some(_) = self.try_str(r#"<"#).expect("huh?") { Some(Token::PartLessLess) } else { Some(Token::SymLess) @@ -183,10 +173,10 @@ impl Lexer { } }) .or_else(|| { - if let Some(_) = self.try_char('>').expect("huh?") { - if let Some(_) = self.try_char('=').expect("huh?") { + if let Some(_) = self.try_str(r#">"#).expect("huh?") { + if let Some(_) = self.try_str(r#"="#).expect("huh?") { Some(Token::PartMoreEq) - } else if let Some(_) = self.try_char('>').expect("huh?") { + } else if let Some(_) = self.try_str(r#">"#).expect("huh?") { Some(Token::PartMoreMore) } else { Some(Token::SymMore) @@ -197,9 +187,9 @@ impl Lexer { }) .or_else(|| fold!( - self.try_char('!').expect("huh?"), + self.try_str(r#"!"#).expect("huh?"), fold!( - self.try_char('=').expect("huh?"), + self.try_str(r#"="#).expect("huh?"), Some(Token::PartBangEq), Some(Token::SymBang) ), @@ -208,62 +198,34 @@ impl Lexer { ) .or_else(|| fold!( - self.try_char('"').expect("huh?"), + self.try_str(r#"""#).expect("huh?"), Some(self.scan_string_literal().expect("unable to parse string literal")), None ) ) - .or_else(|| - fold!( - self.try_any(&ASCII_NUMERIC_CHARS).expect("huh?"), - Some(self.scan_digits().expect("huh?")), - None - ) - ) + .or_else(|| self.scan_digits().expect("huh?")) .or_else(|| fold!( - self.try_char(',').expect("huh?"), + self.try_str(r#","#).expect("huh?"), Some(Token::SymComma), None ) ) .or_else(|| fold!( - self.try_char(':').expect("huh?"), + self.try_str(r#":"#).expect("huh?"), Some(Token::SymColon), None ) ) .or_else(|| { - self.one_or_many_accumulator( - String::new(), - (true, false), - |x, (first, exit_on_next_iteration)| { - if exit_on_next_iteration { - return ControlFlow::Break(()) - } - - let discarding = x == '_' && first; - let is_identifier = x.is_ascii_alphabetic() || (!first && x.is_ascii_digit()); - - if is_identifier { - ControlFlow::Continue((false, false)) - } else if discarding { - ControlFlow::Continue((false, true)) - } else { - ControlFlow::Break(()) - } - }, - |x, identifier| { - identifier.push(x); - debug!("identifier: {identifier:?}"); - } - ) + self.scan_identifier() .ok() + .flatten() .map(|scanned| { - let is_keyword = KEYWORDS.contains(&scanned.as_str()); + let is_keyword = KEYWORDS.contains(&scanned.as_name()); if is_keyword { - match scanned.as_str() { + match scanned.as_name() { "var" => Token::VarKeyword, "true" => Token::KeywordTrue, "false" => Token::KeywordFalse, @@ -281,7 +243,7 @@ impl Lexer { } } } else { - Token::Identifier { inner: Identifier::new(scanned) } + Token::Identifier { inner: scanned } } }) }) @@ -320,111 +282,54 @@ impl Lexer { } } - fn one_or_many(&self, scan_while: impl Fn(char) -> bool, ignore_trailing_char_on_exit: bool) -> Result { - let mut buf = String::new(); - loop { - if self.reached_end() { - break - } - - let c = self.current_char()?; - if !scan_while(c) { - if ignore_trailing_char_on_exit { - self.consume_char()?; - } + fn scan_digit_suffix_opt(&self) -> Result>, LexerError> { + for s in ["i8", "i16", "i32", "i64"] { + let a = self.try_str(s)?; - break + if let Some(a) = a { + self.advance_bytes((s.len() + 1))?; + return Ok(Some(a.to_string().into_boxed_str())) } - let c = self.consume_char()?; - - buf.push(c); } - Ok(buf) + Ok(None) } - fn one_or_many_accumulator( - &self, - scan_sequence_accumulator: Acc, - registers: R, - judge: impl Fn(char, R) -> ControlFlow<(), R>, - accumulate_before_next_iteration_after_break: impl Fn(char, &mut Acc) - ) -> Result { - let mut acc = scan_sequence_accumulator; - let mut registers = registers; + fn scan_digits(&self) -> Result, LexerError> { + debug!("lexer:digit"); + let b = self.current_byte()?; + let mut plus = 0; loop { - if self.reached_end() { - break - } + let r = self.byte_skip_n(plus); - let c = self.current_char()?; - let cf = judge(c, registers); - match cf { - ControlFlow::Continue(c) => { - registers = c; - self.consume_char()?; - } - ControlFlow::Break(_b) => { + if let Ok(b) = r { + if (b'0'..b'9').contains(&b) { + plus += 1; + } else { break } + } else { + break } - - accumulate_before_next_iteration_after_break(c, &mut acc); } - Ok(acc) - } - - fn scan_digit_suffix_opt(&self) -> Result>, LexerError> { - let v = if self.current_char()? == 'i' { - self.consume_char()?; - if self.current_char()? == '8' { - self.consume_char()?; - Some("i8".to_string().into_boxed_str()) - } else if self.current_char()? == '1' { - self.consume_char()?; - if self.current_char()? == '6' { - self.consume_char()?; - Some("i16".to_string().into_boxed_str()) - } else { - return Err(LexerError::InvalidSuffix); - } - } else if self.current_char()? == '3' { - self.consume_char()?; - if self.current_char()? == '2' { - self.consume_char()?; - Some("i32".to_string().into_boxed_str()) - } else { - return Err(LexerError::InvalidSuffix); - } - } else if self.current_char()? == '6' { - self.consume_char()?; - if self.current_char()? == '4' { - self.consume_char()?; - Some("i64".to_string().into_boxed_str()) - } else { - return Err(LexerError::InvalidSuffix); - } - } else { - return Err(LexerError::InvalidSuffix); - } + if b != 0 { + Ok(None) } else { - None - }; + let start = self.source_bytes_nth.get().as_usize(); + let end_inclusive = start + plus; + self.advance_bytes(end_inclusive)?; + + let builtin_suffix = self.scan_digit_suffix_opt()?; + Ok(Some( + Token::Digits { + sequence: self.source[start..end_inclusive].to_string(), + suffix: builtin_suffix, + } + )) + } - Ok(v) - } - - fn scan_digits(&self) -> Result { - debug!("lexer:digit"); - let buf = self.one_or_many(|c| ASCII_NUMERIC_CHARS.contains(&c), false)?; - let builtin_suffix = self.scan_digit_suffix_opt()?; - - Ok(Token::Digits { - sequence: buf, - suffix: builtin_suffix, - }) } fn scan_string_literal(&self) -> Result { @@ -494,6 +399,10 @@ impl Lexer { Ok(Token::StringLiteral(string_char_literal_content)) } + fn advance_bytes(&self, advance: usize) -> Result<(), LineComputationError> { + self.set_current_index(Utf8CharBoundaryStartByte::new(self.source_bytes_nth.get().as_usize() + advance)) + } + #[inline(never)] fn set_current_index(&self, future_index: Utf8CharBoundaryStartByte) -> Result<(), LineComputationError> { if future_index == self.source_bytes_nth.get() { @@ -538,12 +447,15 @@ impl Lexer { } fn scan_line_comment(&self) -> Result { - let content = self.one_or_many(|c| c != '\n', false)?; + let start = self.source_bytes_nth.get().as_usize(); + let pos = self.source[start..].find("\n").unwrap_or(self.source.len()); + self.advance_bytes((pos))?; + let content = self.source[start..pos].to_string(); Ok(Token::Comment { content: Comment { - content, - }, + content + } }) } @@ -656,4 +568,44 @@ impl Lexer { fn create_reset_token(&self) -> TemporalLexerUnwindToken { TemporalLexerUnwindToken::new(self.source_bytes_nth.get()) } + + fn scan_identifier(&self) -> Result, LexerError> { + let first = self.current_byte()?; + let mut plus = 0; + + if first.is_ascii_alphabetic() || first == b'_' { + plus += 1; + loop { + let b = self.byte_skip_n(plus)?; + if b.is_ascii_alphanumeric() || b == b'_' { + plus += 1; + } else { + break + } + } + + let start = self.source_bytes_nth.get().as_usize(); + let end_exclusive = start + plus; + self.set_current_index(Utf8CharBoundaryStartByte::new(end_exclusive + 1))?; + + Ok(Some(Identifier::new(self.source[start..end_exclusive].to_string()))) + } else { + Ok(None) + } + } + + fn current_byte(&self) -> Result { + self.source.bytes().nth(self.source_bytes_nth.get().as_usize()).ok_or_else(|| self.report_out_of_range_error()) + } + + fn byte_skip_n(&self, skip: usize) -> Result { + self.source.bytes().nth(self.source_bytes_nth.get().as_usize() + skip).ok_or_else(|| self.report_out_of_range_error()) + } + + fn report_out_of_range_error(&self) -> LexerError { + LexerError::OutOfRange { + current: self.source_bytes_nth.get(), + max: self.source.len(), + } + } } diff --git a/package/origlang-compiler/src/lexer/error.rs b/package/origlang-compiler/src/lexer/error.rs index a3c33e43..79defd8e 100644 --- a/package/origlang-compiler/src/lexer/error.rs +++ b/package/origlang-compiler/src/lexer/error.rs @@ -1,5 +1,6 @@ use thiserror::Error; use crate::chars::boundary::Utf8CharBoundaryStartByte; +use crate::chars::line::LineComputationError; #[derive(Error, Debug, Eq, PartialEq)] #[allow(clippy::module_name_repetitions)] @@ -17,4 +18,6 @@ pub enum LexerError { MalformedAsUtf8 { boundary: Utf8CharBoundaryStartByte, }, + #[error("fatal: internal bug: {0}")] + FatalLineComputationError(#[from] LineComputationError) } From 37dd2167c0cd6efea946a93d969cca518b7d4ea0 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:18:55 +0900 Subject: [PATCH 02/25] fix: early-return when tokens are exhausted prior to handle digit suffix --- package/origlang-compiler/src/lexer.rs | 6 +++++- package/origlang-compiler/src/lexer/tests.rs | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index b7824177..f8de0fe4 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -283,6 +283,10 @@ impl Lexer { } fn scan_digit_suffix_opt(&self) -> Result>, LexerError> { + if self.reached_end() { + return Ok(None) + } + for s in ["i8", "i16", "i32", "i64"] { let a = self.try_str(s)?; @@ -314,7 +318,7 @@ impl Lexer { } } - if b != 0 { + if b == 0 { Ok(None) } else { let start = self.source_bytes_nth.get().as_usize(); diff --git a/package/origlang-compiler/src/lexer/tests.rs b/package/origlang-compiler/src/lexer/tests.rs index 60cc549c..414a2a17 100644 --- a/package/origlang-compiler/src/lexer/tests.rs +++ b/package/origlang-compiler/src/lexer/tests.rs @@ -195,3 +195,17 @@ fn token_location() { } }); } + +#[test] +fn digit_regression() { + const D: &str = "123456"; + let lexer = Lexer::create(D); + assert_eq!(lexer.next().data, Token::Digits { + sequence: D.to_string(), + suffix: None, + }); + + const EMPTY: &str = ""; + let lexer = Lexer::create(EMPTY); + assert_eq!(lexer.next().data, Token::EndOfFile); +} From 36de6184a7db03d01ec84c3efe25c7d0caca7b8f Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:28:10 +0900 Subject: [PATCH 03/25] fix: give correct locals --- package/origlang-compiler/src/lexer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index f8de0fe4..f548cca2 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -318,12 +318,12 @@ impl Lexer { } } - if b == 0 { + if plus == 0 { Ok(None) } else { let start = self.source_bytes_nth.get().as_usize(); let end_inclusive = start + plus; - self.advance_bytes(end_inclusive)?; + self.set_current_index(Utf8CharBoundaryStartByte::new(end_inclusive))?; let builtin_suffix = self.scan_digit_suffix_opt()?; Ok(Some( From 693962df9dfa110791932fd99ad48703414e12b6 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:35:47 +0900 Subject: [PATCH 04/25] fix: undo incorrect bulk-replace --- package/origlang-compiler/src/lexer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index f548cca2..e9ae995f 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -124,7 +124,7 @@ impl Lexer { } else { None } - .or_else(|| self.try_str(r#"\n"#).expect("huh?").map(|_| Token::NewLine)) + .or_else(|| self.try_str("\n").expect("huh?").map(|_| Token::NewLine)) .or_else(|| fold!( self.try_str(r#"="#).expect("huh?"), From 4661bfcce3501d99e4888b27570600acdb9cdfe0 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:36:16 +0900 Subject: [PATCH 05/25] refactor: don't report error when if it will out-of-bounds --- package/origlang-compiler/src/lexer.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index e9ae995f..120d73eb 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -88,7 +88,7 @@ impl Lexer { } } - fn try_str<'s>(&self, s: &'s str) -> Result, LexerError> { + fn try_str<'s>(&self, s: &'s str) -> Result, LineComputationError> { trace!("lexer:try:{s:?}"); let start = self.source_bytes_nth.get(); let end_exclusive = start.as_usize() + s.len(); @@ -100,10 +100,7 @@ impl Lexer { Ok(None) } } else { - Err(LexerError::OutOfRange { - current: start, - max: self.source.len(), - }) + Ok(None) } } From f02a3056ad2040e6414f6540215cc81b5700fc7b Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:40:35 +0900 Subject: [PATCH 06/25] fix: fix off-by-one on set_current_index's argument --- package/origlang-compiler/src/lexer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 120d73eb..78072358 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -587,7 +587,7 @@ impl Lexer { let start = self.source_bytes_nth.get().as_usize(); let end_exclusive = start + plus; - self.set_current_index(Utf8CharBoundaryStartByte::new(end_exclusive + 1))?; + self.set_current_index(Utf8CharBoundaryStartByte::new(end_exclusive))?; Ok(Some(Identifier::new(self.source[start..end_exclusive].to_string()))) } else { From c97c3ce433ee67703bfea9213f691d3bf6466b06 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:50:08 +0900 Subject: [PATCH 07/25] refactor: delete unused local --- package/origlang-compiler/src/lexer.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 78072358..0eb6c3a0 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -298,7 +298,6 @@ impl Lexer { fn scan_digits(&self) -> Result, LexerError> { debug!("lexer:digit"); - let b = self.current_byte()?; let mut plus = 0; loop { From f85011a29358f590de54c523fa81dcff97f2e25a Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:50:57 +0900 Subject: [PATCH 08/25] fix: remove unnecessary advance --- package/origlang-compiler/src/lexer.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 0eb6c3a0..b0ee3a4b 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -83,9 +83,14 @@ impl Lexer { } fn drain_space(&self) { - while !self.reached_end() && (self.try_str(" ").unwrap() == Some(" ") || self.try_str("\t").unwrap() == Some("\t")) { - self.advance_bytes(1).expect("?!") + trace!("drain_space: start vvvvvvvvvvvvvvvvvvv"); + while !self.reached_end() { + if self.try_str(" ").unwrap() == Some(" ") || self.try_str("\t").unwrap() == Some("\t") { + } else { + break + } } + trace!("drain_space: end ^^^^^^^^^^^^^^^^^^^"); } fn try_str<'s>(&self, s: &'s str) -> Result, LineComputationError> { From 9f88c0d5f0704857bfb4fcb15cd5dabb82b35da1 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:51:24 +0900 Subject: [PATCH 09/25] refactor: split sub-expression to local --- package/origlang-compiler/src/lexer.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index b0ee3a4b..5eac7a4b 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -326,10 +326,14 @@ impl Lexer { let end_inclusive = start + plus; self.set_current_index(Utf8CharBoundaryStartByte::new(end_inclusive))?; + let scanned = self.source[start..end_inclusive].to_string(); let builtin_suffix = self.scan_digit_suffix_opt()?; + + debug!("digit: done ({scanned} {builtin_suffix:?})"); + Ok(Some( Token::Digits { - sequence: self.source[start..end_inclusive].to_string(), + sequence: scanned, suffix: builtin_suffix, } )) From 34420d10992834081183f78be06cca8ee2971ae9 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:54:33 +0900 Subject: [PATCH 10/25] refactor: rename confusing named method --- package/origlang-compiler/src/lexer.rs | 51 +++++++++++++------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 5eac7a4b..09843a7f 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -85,7 +85,7 @@ impl Lexer { fn drain_space(&self) { trace!("drain_space: start vvvvvvvvvvvvvvvvvvv"); while !self.reached_end() { - if self.try_str(" ").unwrap() == Some(" ") || self.try_str("\t").unwrap() == Some("\t") { + if self.try_and_eat_str(" ").unwrap() == Some(" ") || self.try_and_eat_str("\t").unwrap() == Some("\t") { } else { break } @@ -93,7 +93,7 @@ impl Lexer { trace!("drain_space: end ^^^^^^^^^^^^^^^^^^^"); } - fn try_str<'s>(&self, s: &'s str) -> Result, LineComputationError> { + fn try_and_eat_str<'s>(&self, s: &'s str) -> Result, LineComputationError> { trace!("lexer:try:{s:?}"); let start = self.source_bytes_nth.get(); let end_exclusive = start.as_usize() + s.len(); @@ -126,12 +126,12 @@ impl Lexer { } else { None } - .or_else(|| self.try_str("\n").expect("huh?").map(|_| Token::NewLine)) + .or_else(|| self.try_and_eat_str("\n").expect("huh?").map(|_| Token::NewLine)) .or_else(|| fold!( - self.try_str(r#"="#).expect("huh?"), + self.try_and_eat_str(r#"="#).expect("huh?"), { - let double_eq = self.try_str(r#"="#).expect("huh?"); + let double_eq = self.try_and_eat_str(r#"="#).expect("huh?"); if double_eq.is_some() { Some(Token::PartEqEq) } else { @@ -141,31 +141,31 @@ impl Lexer { None ) ) - .or_else(|| self.try_str(r#"+"#).expect("huh?").map(|_| Token::SymPlus)) - .or_else(|| self.try_str(r#"-"#).expect("huh?").map(|_| Token::SymMinus)) - .or_else(|| self.try_str(r#"*"#).expect("huh?").map(|_| Token::SymAsterisk)) + .or_else(|| self.try_and_eat_str(r#"+"#).expect("huh?").map(|_| Token::SymPlus)) + .or_else(|| self.try_and_eat_str(r#"-"#).expect("huh?").map(|_| Token::SymMinus)) + .or_else(|| self.try_and_eat_str(r#"*"#).expect("huh?").map(|_| Token::SymAsterisk)) .or_else(|| fold!( - self.try_str(r#"/"#).expect("huh?"), + self.try_and_eat_str(r#"/"#).expect("huh?"), fold!( - self.try_str(r#"/"#).expect("huh?"), + self.try_and_eat_str(r#"/"#).expect("huh?"), Some(self.scan_line_comment().expect("unable to parse comment")), Some(Token::SymSlash) ), None ) ) - .or_else(|| self.try_str(r#"("#).expect("huh?").map(|_| Token::SymLeftPar)) - .or_else(|| self.try_str(r#")"#).expect("huh?").map(|_| Token::SymRightPar)) + .or_else(|| self.try_and_eat_str(r#"("#).expect("huh?").map(|_| Token::SymLeftPar)) + .or_else(|| self.try_and_eat_str(r#")"#).expect("huh?").map(|_| Token::SymRightPar)) .or_else(|| { - if let Some(_) = self.try_str(r#"<"#).expect("huh?") { - if let Some(_) = self.try_str(r#"="#).expect("huh?") { - if let Some(_) = self.try_str(r#">"#).expect("huh?") { + if let Some(_) = self.try_and_eat_str(r#"<"#).expect("huh?") { + if let Some(_) = self.try_and_eat_str(r#"="#).expect("huh?") { + if let Some(_) = self.try_and_eat_str(r#">"#).expect("huh?") { Some(Token::PartLessEqMore) } else { Some(Token::PartLessEq) } - } else if let Some(_) = self.try_str(r#"<"#).expect("huh?") { + } else if let Some(_) = self.try_and_eat_str(r#"<"#).expect("huh?") { Some(Token::PartLessLess) } else { Some(Token::SymLess) @@ -175,10 +175,10 @@ impl Lexer { } }) .or_else(|| { - if let Some(_) = self.try_str(r#">"#).expect("huh?") { - if let Some(_) = self.try_str(r#"="#).expect("huh?") { + if let Some(_) = self.try_and_eat_str(r#">"#).expect("huh?") { + if let Some(_) = self.try_and_eat_str(r#"="#).expect("huh?") { Some(Token::PartMoreEq) - } else if let Some(_) = self.try_str(r#">"#).expect("huh?") { + } else if let Some(_) = self.try_and_eat_str(r#">"#).expect("huh?") { Some(Token::PartMoreMore) } else { Some(Token::SymMore) @@ -189,9 +189,9 @@ impl Lexer { }) .or_else(|| fold!( - self.try_str(r#"!"#).expect("huh?"), + self.try_and_eat_str(r#"!"#).expect("huh?"), fold!( - self.try_str(r#"="#).expect("huh?"), + self.try_and_eat_str(r#"="#).expect("huh?"), Some(Token::PartBangEq), Some(Token::SymBang) ), @@ -200,7 +200,7 @@ impl Lexer { ) .or_else(|| fold!( - self.try_str(r#"""#).expect("huh?"), + self.try_and_eat_str(r#"""#).expect("huh?"), Some(self.scan_string_literal().expect("unable to parse string literal")), None ) @@ -208,14 +208,14 @@ impl Lexer { .or_else(|| self.scan_digits().expect("huh?")) .or_else(|| fold!( - self.try_str(r#","#).expect("huh?"), + self.try_and_eat_str(r#","#).expect("huh?"), Some(Token::SymComma), None ) ) .or_else(|| fold!( - self.try_str(r#":"#).expect("huh?"), + self.try_and_eat_str(r#":"#).expect("huh?"), Some(Token::SymColon), None ) @@ -290,7 +290,7 @@ impl Lexer { } for s in ["i8", "i16", "i32", "i64"] { - let a = self.try_str(s)?; + let a = self.try_and_eat_str(s)?; if let Some(a) = a { self.advance_bytes((s.len() + 1))?; @@ -414,6 +414,7 @@ impl Lexer { #[inline(never)] fn set_current_index(&self, future_index: Utf8CharBoundaryStartByte) -> Result<(), LineComputationError> { + debug!("index: requested = {future_index:?}"); if future_index == self.source_bytes_nth.get() { // no computation is needed Ok(()) From 7676bdc27a359bc0182749f78316f6b9df4ce34b Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:56:21 +0900 Subject: [PATCH 11/25] fix: remove unnecessary advance --- package/origlang-compiler/src/lexer.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 09843a7f..d96c24ce 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -293,7 +293,6 @@ impl Lexer { let a = self.try_and_eat_str(s)?; if let Some(a) = a { - self.advance_bytes((s.len() + 1))?; return Ok(Some(a.to_string().into_boxed_str())) } } From 68e4e2f3c40af567b8d3c38ad8cec706ffba1745 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:56:35 +0900 Subject: [PATCH 12/25] docs: mark error-prone --- package/origlang-compiler/src/lexer.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index d96c24ce..32bd832a 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -93,6 +93,8 @@ impl Lexer { trace!("drain_space: end ^^^^^^^^^^^^^^^^^^^"); } + /// Note + /// calling [`Self::advance_bytes`], [`Self::advance`], or [`Self::set_current_index`] is error-prone. fn try_and_eat_str<'s>(&self, s: &'s str) -> Result, LineComputationError> { trace!("lexer:try:{s:?}"); let start = self.source_bytes_nth.get(); From 4c5f4f5454e98e3d3815f918e96e874fd7f75e22 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 21:58:13 +0900 Subject: [PATCH 13/25] fix: set proper range for sub-slicing --- package/origlang-compiler/src/lexer.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 32bd832a..d9a58a69 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -459,10 +459,10 @@ impl Lexer { fn scan_line_comment(&self) -> Result { let start = self.source_bytes_nth.get().as_usize(); - let pos = self.source[start..].find("\n").unwrap_or(self.source.len()); - self.advance_bytes((pos))?; + let rel_pos = self.source[start..].find("\n").unwrap_or(self.source.len()); + self.advance_bytes(rel_pos)?; - let content = self.source[start..pos].to_string(); + let content = self.source[start..(start + rel_pos)].to_string(); Ok(Token::Comment { content: Comment { content From 699cef76f224206be3b68d9dd5975dc2c273579c Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 22:07:45 +0900 Subject: [PATCH 14/25] fix: do not early-return on error --- package/origlang-compiler/src/lexer.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index d9a58a69..f026bd35 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -581,20 +581,27 @@ impl Lexer { } fn scan_identifier(&self) -> Result, LexerError> { + debug!("lexer:identifier"); + let first = self.current_byte()?; let mut plus = 0; if first.is_ascii_alphabetic() || first == b'_' { plus += 1; loop { - let b = self.byte_skip_n(plus)?; - if b.is_ascii_alphanumeric() || b == b'_' { - plus += 1; + trace!("lexer:identifier: {plus}"); + if let Ok(b) = self.byte_skip_n(plus) { + if b.is_ascii_alphanumeric() || b == b'_' { + plus += 1; + } else { + break + } } else { break } } + debug!("lexer:identifier: {plus}"); let start = self.source_bytes_nth.get().as_usize(); let end_exclusive = start + plus; self.set_current_index(Utf8CharBoundaryStartByte::new(end_exclusive))?; From 9402c41440123efa4ee08338096d99187f44f6be Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 22:26:12 +0900 Subject: [PATCH 15/25] refactor: simplify scan_string_literal --- package/origlang-compiler/src/lexer.rs | 68 ++------------------------ 1 file changed, 5 insertions(+), 63 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index f026bd35..7f671cb2 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -343,70 +343,12 @@ impl Lexer { } fn scan_string_literal(&self) -> Result { - fn calc_skip_byte_in_utf8(start: Utf8CharBoundaryStartByte, source: &str) -> Option { - // well, at least, this code accesses memory to sequential order. - const BATCH_SIZE: usize = 32; - let sub_slice = &source.as_bytes()[start.as_usize()..]; - for step in 0..(sub_slice.len() / BATCH_SIZE) { - let offset = step * BATCH_SIZE; - let chunk = &sub_slice[offset..(offset + BATCH_SIZE)]; - for (sub_offset, b) in chunk.iter().enumerate() { - if *b == b'"' { - return Some(Utf8CharBoundaryStartByte::new(offset + sub_offset)) - } - } - } - - let last_offset = sub_slice.len() / BATCH_SIZE * BATCH_SIZE; - let last_byte = sub_slice.len(); - - #[allow(clippy::needless_range_loop)] - for offset in last_offset..last_byte { - if sub_slice[offset] == b'"' { - return Some(Utf8CharBoundaryStartByte::new(offset)); - } - } - - None - } - debug!("lexer:lit:string"); - - // this search is exact at this point. - // However, once we introduce escape sequence or another delimiter for string literal, - // this code is likely to needed to be rewritten. - - let Some(skip_byte_in_utf8) = calc_skip_byte_in_utf8(self.source_bytes_nth.get(), &self.source) else { - return Err(LexerError::UnclosedStringLiteral) - }; - - let mut string_char_literal_content = { - // the starting quote is handled in `next_inner`, so this boundary is either first - // char in the literal, or ending quote. - let maybe_first_char_boundary = self.source_bytes_nth.get(); - let quote_end_boundary = Utf8CharBoundaryStartByte::new(maybe_first_char_boundary.as_usize() + skip_byte_in_utf8.as_usize()); - - // assert!(found_boundary_nth >= current_chars_nth, "{found_boundary_nth:?} >= {current_chars_nth:?}"); - - let s = &self.source[(maybe_first_char_boundary.as_usize())..(quote_end_boundary.as_usize())]; - self.source_bytes_nth.set(quote_end_boundary); - s.to_string() - }; - - loop { - if self.reached_end() { - break - } + let start = self.source_bytes_nth.get().as_usize(); + let rel_pos = self.source[start..].find('"').unwrap_or(self.source.len() - start); + self.advance_bytes(rel_pos + 1)?; - let c = self.current_char()?; - if c == '"' { - // 終わりのダブルクォーテーションは捨てる - self.consume_char()?; - break - } - let c = self.consume_char()?; - string_char_literal_content.push(c); - } - Ok(Token::StringLiteral(string_char_literal_content)) + let s = self.source[start..(start + rel_pos)].to_string(); + Ok(Token::StringLiteral(s)) } fn advance_bytes(&self, advance: usize) -> Result<(), LineComputationError> { From 6a2b56103264d97862a91223e899159f990f0b33 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 22:32:16 +0900 Subject: [PATCH 16/25] refactor: remove unused method --- package/origlang-compiler/src/lexer.rs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 7f671cb2..e89e247e 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -480,26 +480,11 @@ impl Lexer { Ok(c) } - pub(crate) fn consume_char(&self) -> Result { - let c = self.current_char()?; - // trace!("consume: `{c}` (\\U{{{k:06X}}})", k = c as u32); - self.advance(); - Ok(c) - } - fn reached_end(&self) -> bool { // <&str>::len() yields length of BYTES, not CHARS self.source_bytes_nth.get().as_usize() >= self.source.len() } - fn advance(&self) { - trace!("lexer:advance"); - let new = self.source_bytes_nth.get().stride(self.current_char_stride().unwrap()); - self.set_current_index(new).map_err(|e| { - warn!("discarding error: {e}"); - }).unwrap_or_default(); - } - /// パースに失敗するかも知れないものをパースしようと試みる。 /// 成功したならパースした値 /// 失敗したならNoneを返しつつ内部インデックスをこの関数を呼び出したときの値に戻す: From 9e4a11b459dc3c63fbe2acbebd21a992bf9d5178 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 22:32:36 +0900 Subject: [PATCH 17/25] perf: use current_byte on set_current_index --- package/origlang-compiler/src/lexer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index e89e247e..2fe57b9e 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -364,9 +364,9 @@ impl Lexer { } else { let b = self.source_bytes_nth.get().stride(Utf8CharStride::One); if future_index == b && self.current_char_stride() == Ok(Utf8CharStride::One) { - return if let Ok(c) = self.current_char() { + return if let Ok(c) = self.current_byte() { self.source_bytes_nth.set(b); - if c == '\n' { + if c == b'\n' { // new line, setting $(L + 1):C. self.current_line.set(NonZeroUsize::new(self.current_line.get().get() + 1).expect("we do not support this")); // SAFETY: 1 != 0 From bfb32fe1e6db93a2250f30aa71294a875df7ffc1 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 22:36:05 +0900 Subject: [PATCH 18/25] refactor: move unpopular method into local closure --- package/origlang-compiler/src/lexer.rs | 46 ++++++++++++++------------ 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 2fe57b9e..cae39862 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -252,10 +252,30 @@ impl Lexer { }) }) // dont eager evaluate - .unwrap_or_else(|| Token::UnexpectedChar { - // TODO: this is cold path, so may convert boundary to char_nth. - index: self.source_bytes_nth.get(), - char: self.current_char().expect("unexpected_char"), + .unwrap_or_else(|| { + fn current_char(this: &Lexer) -> Result { + let current_boundary = this.source_bytes_nth.get(); + let index = current_boundary.as_usize(); + let stride = this.current_char_stride()?; + + + let s = unsafe { this.source.get_unchecked(index..(index + stride.as_usize())) }; + + let c = s.chars().next().ok_or(LexerError::OutOfRange { + current: current_boundary, + // bytes in UTF-8 + max: this.source.len(), + })?; + + + Ok(c) + } + + Token::UnexpectedChar { + // TODO: this is cold path, so may convert boundary to char_nth. + index: self.source_bytes_nth.get(), + char: current_char(self).expect("unexpected_char"), + } }); Ok(v) } @@ -462,24 +482,6 @@ impl Lexer { Ok(stride) } - fn current_char(&self) -> Result { - let current_boundary = self.source_bytes_nth.get(); - let index = current_boundary.as_usize(); - let stride = self.current_char_stride()?; - - - let s = unsafe { self.source.get_unchecked(index..(index + stride.as_usize())) }; - - let c = s.chars().next().ok_or(LexerError::OutOfRange { - current: current_boundary, - // bytes in UTF-8 - max: self.source.len(), - })?; - - - Ok(c) - } - fn reached_end(&self) -> bool { // <&str>::len() yields length of BYTES, not CHARS self.source_bytes_nth.get().as_usize() >= self.source.len() From 3d8067322d59ed6852b3c7b5f4f9ce40657d8ae8 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Tue, 17 Oct 2023 22:47:03 +0900 Subject: [PATCH 19/25] test: delete unnecessary test --- package/origlang-compiler/src/lexer/tests.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/package/origlang-compiler/src/lexer/tests.rs b/package/origlang-compiler/src/lexer/tests.rs index 414a2a17..4fc610f7 100644 --- a/package/origlang-compiler/src/lexer/tests.rs +++ b/package/origlang-compiler/src/lexer/tests.rs @@ -93,17 +93,6 @@ fn parse_string_literal_mixed_4_3() { test("\u{10000}あ") } -#[test] -fn avoid_off_read() { - const S: &str = r#"var x = "4あ" -"#; - let lexer = Lexer::create(S); - let k = S.chars().count(); - for i in 0..k { - assert_eq!(lexer.consume_char().expect("oops"), S.chars().nth(i).expect("out of bounds from literal")) - } -} - use std::num::NonZeroUsize; use origlang_source_span::{Pointed, SourcePosition}; From aa2e63485e8b788672f6e6611bb119c5e64832eb Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Wed, 18 Oct 2023 00:09:07 +0900 Subject: [PATCH 20/25] fix: do not cache line number regress a few milliseconds on perf test --- package/origlang-compiler/src/lexer.rs | 96 ++++++++++---------------- 1 file changed, 35 insertions(+), 61 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index cae39862..db8318a7 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -39,12 +39,26 @@ impl AssociateWithPos for T { pub struct Lexer { source_bytes_nth: Cell, source: String, - current_line: Cell, - current_column: Cell, newline_codepoint_nth_index: OccurrenceSet, } impl Lexer { + // TODO: revisit line caching + pub fn line(&self) -> NonZeroUsize { + NonZeroUsize::new(self.newline_codepoint_nth_index.count_lowers_exclusive(&self.source_bytes_nth.get()) + 1).unwrap() + } + + pub fn column(&self) -> NonZeroUsize { + let start = self.source_bytes_nth.get().as_usize(); + let last = self.source[..start].rfind('\n'); + + if let Some(last) = last { + (self.source[last..start].chars().count()).try_into().unwrap() + } else { + (start + 1).try_into().unwrap() + } + } + #[must_use = "Lexer do nothing unless calling parsing function"] pub fn create(source: &str) -> Self { let src: Cow<'_, str> = if cfg!(windows) { @@ -69,14 +83,6 @@ impl Lexer { Self { source_bytes_nth: Cell::new(Utf8CharBoundaryStartByte::new(0)), - current_line: Cell::new( - // SAFETY: 1 != 0 - unsafe { NonZeroUsize::new_unchecked(1) } - ), - current_column: Cell::new( - // SAFETY: 1 != 0 - unsafe { NonZeroUsize::new_unchecked(1) } - ), source: src.to_string(), newline_codepoint_nth_index } @@ -301,8 +307,8 @@ impl Lexer { fn current_pos(&self) -> SourcePos { SourcePos { - line: self.current_line.get(), - column: self.current_column.get(), + line: self.line(), + column: self.column(), } } @@ -378,45 +384,9 @@ impl Lexer { #[inline(never)] fn set_current_index(&self, future_index: Utf8CharBoundaryStartByte) -> Result<(), LineComputationError> { debug!("index: requested = {future_index:?}"); - if future_index == self.source_bytes_nth.get() { - // no computation is needed - Ok(()) - } else { - let b = self.source_bytes_nth.get().stride(Utf8CharStride::One); - if future_index == b && self.current_char_stride() == Ok(Utf8CharStride::One) { - return if let Ok(c) = self.current_byte() { - self.source_bytes_nth.set(b); - if c == b'\n' { - // new line, setting $(L + 1):C. - self.current_line.set(NonZeroUsize::new(self.current_line.get().get() + 1).expect("we do not support this")); - // SAFETY: 1 != 0 - self.current_column.set(unsafe { NonZeroUsize::new_unchecked(1) }); - } else { - // not new line, setting L:$(C + 1). - self.current_column.set(NonZeroUsize::new(self.current_column.get().get() + 1).expect("we do not support this")); - } - Ok(()) - } else { - // ? - Err(LineComputationError::OutOfRange) - } - } else { - // trace!("set index to: {future_index}"); - let SourcePos { line, column } = - LineComputation::compute( - future_index.stride(Utf8CharStride::from('\n')), - &self.newline_codepoint_nth_index - )?; - - trace!("compute: {line}:{column}"); - self.source_bytes_nth.set(future_index); - self.current_line.set(line); - self.current_column.set(column); - - Ok(()) - // full computation - } - } + self.source_bytes_nth.set(future_index); + + Ok(()) } fn scan_line_comment(&self) -> Result { @@ -519,23 +489,27 @@ impl Lexer { plus += 1; loop { trace!("lexer:identifier: {plus}"); - if let Ok(b) = self.byte_skip_n(plus) { - if b.is_ascii_alphanumeric() || b == b'_' { - plus += 1; - } else { + match self.byte_skip_n(plus) { + Ok(b) => { + if b.is_ascii_alphanumeric() || b == b'_' { + plus += 1; + } else { + break + } + } + Err(e) => { + warn!("discarding error: {e}"); break } - } else { - break } } - debug!("lexer:identifier: {plus}"); + debug!("lexer:identifier: length of {plus}"); let start = self.source_bytes_nth.get().as_usize(); - let end_exclusive = start + plus; - self.set_current_index(Utf8CharBoundaryStartByte::new(end_exclusive))?; + let s = Identifier::new(self.source[start..(start + plus)].to_string()); + self.advance_bytes(plus)?; - Ok(Some(Identifier::new(self.source[start..end_exclusive].to_string()))) + Ok(Some(s)) } else { Ok(None) } From 65fda2b0fda76165c03510704dadef03b02eea96 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Wed, 18 Oct 2023 22:21:52 +0900 Subject: [PATCH 21/25] fix: revisit line number caching --- package/origlang-compiler/src/lexer.rs | 82 ++++++++++++++++++++------ 1 file changed, 63 insertions(+), 19 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index db8318a7..11117a7b 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -40,25 +40,11 @@ pub struct Lexer { source_bytes_nth: Cell, source: String, newline_codepoint_nth_index: OccurrenceSet, + line: Cell, + column: Cell, } impl Lexer { - // TODO: revisit line caching - pub fn line(&self) -> NonZeroUsize { - NonZeroUsize::new(self.newline_codepoint_nth_index.count_lowers_exclusive(&self.source_bytes_nth.get()) + 1).unwrap() - } - - pub fn column(&self) -> NonZeroUsize { - let start = self.source_bytes_nth.get().as_usize(); - let last = self.source[..start].rfind('\n'); - - if let Some(last) = last { - (self.source[last..start].chars().count()).try_into().unwrap() - } else { - (start + 1).try_into().unwrap() - } - } - #[must_use = "Lexer do nothing unless calling parsing function"] pub fn create(source: &str) -> Self { let src: Cow<'_, str> = if cfg!(windows) { @@ -84,7 +70,9 @@ impl Lexer { Self { source_bytes_nth: Cell::new(Utf8CharBoundaryStartByte::new(0)), source: src.to_string(), - newline_codepoint_nth_index + newline_codepoint_nth_index, + line: Cell::new(NonZeroUsize::new(1).unwrap()), + column: Cell::new(NonZeroUsize::new(1).unwrap()), } } @@ -307,8 +295,8 @@ impl Lexer { fn current_pos(&self) -> SourcePos { SourcePos { - line: self.line(), - column: self.column(), + line: self.line.get(), + column: self.column.get(), } } @@ -383,6 +371,62 @@ impl Lexer { #[inline(never)] fn set_current_index(&self, future_index: Utf8CharBoundaryStartByte) -> Result<(), LineComputationError> { + let old = self.source_bytes_nth.get().as_usize(); + let new = future_index.as_usize(); + + if old == new { + return Ok(()) + } + + let current_line = self.line.get().get(); + + let src = &self.source; + if old < new { + // forward + let new_line = current_line + src[old..new].bytes().filter(|x| *x == b'\n').count(); + let new_col = if let Some(old_relative) = src[old..new].rfind('\n') { + // .......................OLD.................NEW + // |<--------N------>| + new - (old + old_relative) + } else { + let mut c = self.column.get().get(); + c += (new - old); + + c + }; + + self.line.set(NonZeroUsize::new(new_line).expect("overflow")); + self.column.set(NonZeroUsize::new(new_col).expect("overflow")) + } else { + // back + let new_line = current_line - src[new..old].bytes().filter(|x| *x == b'\n').count(); + let new_col = if let Some(new_relative) = src[new..old].find('\n') { + // .......................NEW.................OLD + // |<--------N------>| + let nr = new + new_relative; + if let Some(most_recent_nl) = src[..nr].rfind('\n') { + // ..............NEW.................OLD + // |<--------N------>| + // |<-----MRN-------------->| + + // this is effectively static assertion, should not + // cost on runtime. + assert!(most_recent_nl < nr); + nr - most_recent_nl + } else { + nr + } + } else { + let mut c = self.column.get().get(); + c += old - new; + + c + }; + + self.line.set(NonZeroUsize::new(new_line).expect("overflow")); + self.column.set(NonZeroUsize::new(new_col).expect("overflow")) + } + debug!("index: requested = {future_index:?}"); self.source_bytes_nth.set(future_index); From 651346bc6dde1f57e2a65096d5f6d5d355636950 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Wed, 18 Oct 2023 22:28:29 +0900 Subject: [PATCH 22/25] refactor: revisit row-col --- package/origlang-compiler/src/chars.rs | 2 - package/origlang-compiler/src/chars/line.rs | 115 ------ .../origlang-compiler/src/chars/occurrence.rs | 334 ------------------ package/origlang-compiler/src/lexer.rs | 25 +- package/origlang-compiler/src/lexer/error.rs | 6 +- 5 files changed, 7 insertions(+), 475 deletions(-) delete mode 100644 package/origlang-compiler/src/chars/line.rs delete mode 100644 package/origlang-compiler/src/chars/occurrence.rs diff --git a/package/origlang-compiler/src/chars.rs b/package/origlang-compiler/src/chars.rs index 8f9f4f30..65608ccc 100644 --- a/package/origlang-compiler/src/chars.rs +++ b/package/origlang-compiler/src/chars.rs @@ -1,3 +1 @@ pub mod boundary; -pub mod occurrence; -pub mod line; diff --git a/package/origlang-compiler/src/chars/line.rs b/package/origlang-compiler/src/chars/line.rs deleted file mode 100644 index 328b30e1..00000000 --- a/package/origlang-compiler/src/chars/line.rs +++ /dev/null @@ -1,115 +0,0 @@ -use thiserror::Error; -use origlang_source_span::{SourcePosition as SourcePos}; -use crate::chars::boundary::Utf8CharBoundaryStartByte; -use crate::chars::occurrence::OccurrenceSet; - -#[allow(clippy::module_name_repetitions)] -pub struct LineComputation; - -impl LineComputation { - pub fn compute(future_index: Utf8CharBoundaryStartByte, new_line_occurrences: &OccurrenceSet) -> Result { - /* - // This may be an error, however this snippet leads to infinite loop. - if new_line_occurrences.contains(&future_index) { - return Err(LineComputationError::PointedOnNewLine) - } - */ - - let future_line = new_line_occurrences.count_lowers_exclusive(&future_index) + 1; - - let most_recent_new_line_occurrence_codepoint = new_line_occurrences - .max_upper_bounded_exclusive(&future_index) - // if future_index is still on first line, there's no such occurrence - substitute - // this value with zero to leave future_index as is. - .copied() - .unwrap_or(Utf8CharBoundaryStartByte::new(0)); - - assert!(future_index >= most_recent_new_line_occurrence_codepoint, "{future_index:?} >= {most_recent_new_line_occurrence_codepoint:?}"); - let future_line_column = future_index.as_usize() - most_recent_new_line_occurrence_codepoint.as_usize(); - - Ok(SourcePos { - line: future_line.try_into().map_err(|_| LineComputationError::LineIsZero)?, - column: future_line_column.try_into().map_err(|_| LineComputationError::ColumnIsZero)?, - }) - } -} - -#[derive(Error, Debug, Eq, PartialEq, Copy, Clone)] -#[allow(clippy::module_name_repetitions)] -pub enum LineComputationError { - #[error("line number is zero")] - LineIsZero, - #[error("column number is zero")] - ColumnIsZero, - #[error("out of range")] - OutOfRange, -} - -#[cfg(test)] -mod tests { - use origlang_source_span::{SourcePosition as SourcePos}; - use crate::chars::boundary::Utf8CharBoundaryStartByte; - use crate::chars::line::LineComputation; - use crate::chars::occurrence::OccurrenceSet; - - #[test] - fn no_newline() { - assert_eq!( - LineComputation::compute(Utf8CharBoundaryStartByte::new(12), &OccurrenceSet::default()), - Ok(SourcePos { - line: 1.try_into().unwrap(), - column: 12.try_into().unwrap(), - }) - ); - } - - #[test] - fn single_newline_pre() { - assert_eq!( - LineComputation::compute(Utf8CharBoundaryStartByte::new(1), &OccurrenceSet::new( - vec![Utf8CharBoundaryStartByte::new(100)] - ).unwrap()), - Ok(SourcePos { - line: 1.try_into().unwrap(), - column: 1.try_into().unwrap(), - }) - ) - } - - #[test] - fn single_newline_pre_99() { - assert_eq!( - LineComputation::compute(Utf8CharBoundaryStartByte::new(99), &OccurrenceSet::new( - vec![Utf8CharBoundaryStartByte::new(100)] - ).unwrap()), - Ok(SourcePos { - line: 1.try_into().unwrap(), - column: 99.try_into().unwrap(), - }) - ) - } - - #[test] - fn single_newline_post() { - assert_eq!( - LineComputation::compute(Utf8CharBoundaryStartByte::new(101), &OccurrenceSet::new( - vec![Utf8CharBoundaryStartByte::new(100)] - ).unwrap()), - Ok(SourcePos { - line: 2.try_into().unwrap(), - column: 1.try_into().unwrap(), - }) - ) - } - - #[test] - fn single_newline_point_is_not_an_error() { - assert_eq!( - LineComputation::compute(Utf8CharBoundaryStartByte::new(100), &OccurrenceSet::new(vec![Utf8CharBoundaryStartByte::new(100)]).unwrap()), - Ok(SourcePos { - line: 1.try_into().unwrap(), - column: 100.try_into().unwrap(), - }) - ) - } -} \ No newline at end of file diff --git a/package/origlang-compiler/src/chars/occurrence.rs b/package/origlang-compiler/src/chars/occurrence.rs deleted file mode 100644 index a2483a7e..00000000 --- a/package/origlang-compiler/src/chars/occurrence.rs +++ /dev/null @@ -1,334 +0,0 @@ -#[derive(Clone, Eq, PartialEq, Debug, Hash)] -#[allow(clippy::module_name_repetitions)] -/// Contains "sorted" values. Unlike [std::collections::BTreeSet], this collection has vector internally, -/// for performance optimization. -pub struct OccurrenceSet(Vec); - -// TODO(nightly): -// once the nightly feature called `is_sorted` become stable, replace call to this function -// with stabilized one. -fn is_sorted(slice: &[T]) -> bool { - if slice.len() <= 1 { - true - } else { - slice.iter().fold((true, &slice[0]), |(b, e), f| { - (b && e <= f, f) - }).0 - } -} - -impl OccurrenceSet { - pub fn new(v: Vec) -> Option { - if v.len() <= 1 { - Some(Self(v)) - } else if Self::invariant_was_satisfied(&v) { - // SAFETY: we've checked precondition. - unsafe { - Some(Self::new_unchecked(v)) - } - } else { - None - } - } - - fn invariant_was_satisfied(v: &[T]) -> bool { - is_sorted(v) - } - - pub unsafe fn new_unchecked(v: Vec) -> Self { - debug_assert!(Self::invariant_was_satisfied(&v), "invariant was violated"); - - Self(v) - } - - pub fn count_lowers_exclusive(&self, upper: &T) -> usize { - let mut i = 0; - let values: &[T] = &self.0; - let mut run_rest = true; - if values.len() >= 6400 { - // if values are too many to being cached in L1 storage, - // switch strategy to binary_search. - // This operation always return correct value, as underlying source - // is guaranteed to be sorted in ascending order. - return values.binary_search(upper).map_or_else(|x| x, |x| x); - } else if values.len() >= 8 { - while i < values.len() - 8 { - // SAFETY: above condition ensures that no OOB-reads happen. - let v1 = unsafe { values.get_unchecked(i) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v2 = unsafe { values.get_unchecked(i + 1) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v3 = unsafe { values.get_unchecked(i + 2) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v4 = unsafe { values.get_unchecked(i + 3) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v5 = unsafe { values.get_unchecked(i + 4) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v6 = unsafe { values.get_unchecked(i + 5) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v7 = unsafe { values.get_unchecked(i + 6) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v8 = unsafe { values.get_unchecked(i + 7) }; - - let upper = &upper; - if v8 < upper { - // let CPU to guess what is going on, manual _mm_prefetch is inefficient - i += 8; - } else { - // v8 >= upper - // partition point must be in v1..v8 - if v8 < upper { - i += 8; - } else if v7 < upper { - i += 7; - } else if v6 < upper { - i += 6; - } else if v5 < upper { - i += 5; - } else if v4 < upper { - i += 4; - } else if v3 < upper { - i += 3; - } else if v2 < upper { - i += 2; - } else if v1 < upper { - i += 1; - } - - run_rest = false; - break - } - } - } - - if run_rest { - let j = i; - for x in &values[j..] { - if x < upper { - i += 1; - } - } - } - - i - } - - pub fn max_upper_bounded_exclusive(&self, upper: &T) -> Option<&T> { - let values: &[T] = &self.0; - - let k = self.count_lowers_exclusive(upper); - if k == 0 { - None - } else { - values.get(k - 1) - } - } -} - -// You can construct empty OccurrenceSet even if T: !Default -impl Default for OccurrenceSet { - fn default() -> Self { - Self(vec![]) - } -} - -#[cfg(test)] -mod tests { - use crate::chars::occurrence::{is_sorted, OccurrenceSet}; - - #[test] - fn sorted_empty() { - assert!(is_sorted::(&[])); - } - - #[test] - fn sorted_single() { - assert!(is_sorted(&[1])); - } - - #[test] - fn sorted_double() { - assert!(is_sorted(&[1, 2])); - } - - #[test] - fn sorted_double_negative() { - assert!(!is_sorted(&[2, 1])); - } - - #[test] - fn occurrence_empty() { - let set = OccurrenceSet::::new(vec![]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&0), 0); - } - - #[test] - fn occurrence_single_less() { - let set = OccurrenceSet::::new(vec![0]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&1), 1); - } - - #[test] - fn occurrence_single_eq() { - let set = OccurrenceSet::::new(vec![0]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&0), 0); - } - - #[test] - fn occurrence_single_more() { - let set = OccurrenceSet::::new(vec![1]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&0), 0); - } - - #[test] - fn occurrence_8() { - let set = OccurrenceSet::new(vec![1, 2, 3, 4, 5, 6, 7, 8]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&10), 8); - } - - #[test] - fn occurrence_9() { - let set = OccurrenceSet::new(vec![1, 2, 3, 4, 5, 6, 7, 8, 9]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&10), 9); - } - - // #[test] - /* - fn bench() { - const N: usize = 16384; - - // avoids stack overflow in debug mode. - struct OnHeap(Box<[T; N]>); - - impl Distribution> for Standard where Standard: Distribution { - fn sample(&self, rng: &mut R) -> OnHeap { - OnHeap(Box::new(rng.gen::<[T; N]>())) - } - } - - impl Deref for OnHeap { - type Target = [T; N]; - - fn deref(&self) -> &Self::Target { - self.0.as_ref() - } - } - - impl IntoIterator for OnHeap { - type Item = T; - type IntoIter = core::array::IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } - } - - impl<'a, T: 'a, const N: usize> IntoIterator for &'a OnHeap { - type Item = &'a T; - type IntoIter = core::slice::Iter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - self.0.iter() - } - } - - let haystack = rand::random::>(); - let find = rand::random::>(); - println!("1"); - let occ_time = { - let now = Instant::now(); - let mut x = haystack.to_vec(); - x.sort(); - - let mut buf = Vec::with_capacity(N); - - // let now = Instant::now(); - let occ = OccurrenceSet::new(x).expect("it is not empty"); - for f in &find { - buf.push(occ.count_lowers_exclusive(f)); - } - - (buf, now.elapsed()) - }; - println!("1"); - - let bt_time = { - let mut buf = Vec::with_capacity(N); - - let now = Instant::now(); - let mut occ: BTreeSet = BTreeSet::new(); - occ.extend(&haystack); - - for upper in &find { - buf.push(occ.iter().filter(|x| *x < upper).count()); - } - - (buf, now.elapsed()) - }; - println!("1"); - - let bh_time = { - let mut buf = Vec::with_capacity(N); - - let now = Instant::now(); - let mut occ: BinaryHeap = BinaryHeap::new(); - occ.extend(&haystack); - - for upper in &find { - buf.push(occ.iter().filter(|x| *x < upper).count()); - } - - (buf, now.elapsed()) - }; - println!("1"); - - let vec_time = { - let mut buf = Vec::with_capacity(N); - - let now = Instant::now(); - let mut occ: Vec = Vec::with_capacity(N); - occ.extend(&haystack); - - for upper in &find { - buf.push(occ.iter().filter(|x| *x < upper).count()); - } - - (buf, now.elapsed()) - }; - - - let vec_ni_time = { - let mut buf = Vec::with_capacity(N); - - let now = Instant::now(); - let mut occ: Vec = Vec::with_capacity(N); - occ.extend(&haystack); - - for upper in find { - let mut count = 0usize; - for x in &occ { - if *x < upper { - count += 1; - } - } - - buf.push(count); - } - - (buf, now.elapsed()) - }; - - println!("1"); - - // let vec_ni_time = (vec![1], Duration::new(0, 0)); - assert_eq!(occ_time.0, bt_time.0); - assert_eq!(bh_time.0, bt_time.0); - assert_eq!(vec_time.0, bt_time.0); - assert_eq!(vec_ni_time.0, bt_time.0); - - println!("impl: {o:?} | bin tree:{b:?} | bin heap: {bh:?} | vec_iter: {v:?} | vec: {vi:?}", o = occ_time.1, b = bt_time.1, bh = bh_time.1, v = vec_time.1, vi = vec_ni_time.1); - } - - - */ -} \ No newline at end of file diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 11117a7b..1e142075 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -5,6 +5,7 @@ pub mod token; use std::borrow::Cow; use std::cell::Cell; +use std::convert::Infallible; use std::num::NonZeroUsize; use std::ops::ControlFlow; @@ -15,8 +16,6 @@ use origlang_ast::{Comment, Identifier}; use origlang_source_span::{SourcePosition as SourcePos, Pointed as WithPosition}; use crate::char_list::ASCII_NUMERIC_CHARS; use crate::chars::boundary::{Utf8CharBoundaryStartByte, Utf8CharStride}; -use crate::chars::line::{LineComputation, LineComputationError}; -use crate::chars::occurrence::OccurrenceSet; use crate::lexer::token::{TemporalLexerUnwindToken, Token}; static KEYWORDS: [&str; 12] = @@ -39,7 +38,6 @@ impl AssociateWithPos for T { pub struct Lexer { source_bytes_nth: Cell, source: String, - newline_codepoint_nth_index: OccurrenceSet, line: Cell, column: Cell, } @@ -53,24 +51,9 @@ impl Lexer { Cow::Borrowed(source) }; - let newline_codepoint_nth_index = src.bytes().enumerate() - .filter(|(_, x)| *x == b'\n') - .map(|(i, _)| Utf8CharBoundaryStartByte::new(i)) - // we can't use try_collect because it requires nightly compiler. - // we also can't have FromIterator for OccurrenceSet where T: Ord because doing so may - // break invariant of OccurrenceSet (i.e. the underlying iterator was not sorted.) - .collect::>(); - - // SAFETY: inner value has sorted, because: - // char_indices yields sorted index. - let newline_codepoint_nth_index = unsafe { - OccurrenceSet::new_unchecked(newline_codepoint_nth_index) - }; - Self { source_bytes_nth: Cell::new(Utf8CharBoundaryStartByte::new(0)), source: src.to_string(), - newline_codepoint_nth_index, line: Cell::new(NonZeroUsize::new(1).unwrap()), column: Cell::new(NonZeroUsize::new(1).unwrap()), } @@ -89,7 +72,7 @@ impl Lexer { /// Note /// calling [`Self::advance_bytes`], [`Self::advance`], or [`Self::set_current_index`] is error-prone. - fn try_and_eat_str<'s>(&self, s: &'s str) -> Result, LineComputationError> { + fn try_and_eat_str<'s>(&self, s: &'s str) -> Result, Infallible> { trace!("lexer:try:{s:?}"); let start = self.source_bytes_nth.get(); let end_exclusive = start.as_usize() + s.len(); @@ -365,12 +348,12 @@ impl Lexer { Ok(Token::StringLiteral(s)) } - fn advance_bytes(&self, advance: usize) -> Result<(), LineComputationError> { + fn advance_bytes(&self, advance: usize) -> Result<(), Infallible> { self.set_current_index(Utf8CharBoundaryStartByte::new(self.source_bytes_nth.get().as_usize() + advance)) } #[inline(never)] - fn set_current_index(&self, future_index: Utf8CharBoundaryStartByte) -> Result<(), LineComputationError> { + fn set_current_index(&self, future_index: Utf8CharBoundaryStartByte) -> Result<(), Infallible> { let old = self.source_bytes_nth.get().as_usize(); let new = future_index.as_usize(); diff --git a/package/origlang-compiler/src/lexer/error.rs b/package/origlang-compiler/src/lexer/error.rs index 79defd8e..fc53cbd9 100644 --- a/package/origlang-compiler/src/lexer/error.rs +++ b/package/origlang-compiler/src/lexer/error.rs @@ -1,6 +1,6 @@ +use std::convert::Infallible; use thiserror::Error; use crate::chars::boundary::Utf8CharBoundaryStartByte; -use crate::chars::line::LineComputationError; #[derive(Error, Debug, Eq, PartialEq)] #[allow(clippy::module_name_repetitions)] @@ -18,6 +18,6 @@ pub enum LexerError { MalformedAsUtf8 { boundary: Utf8CharBoundaryStartByte, }, - #[error("fatal: internal bug: {0}")] - FatalLineComputationError(#[from] LineComputationError) + #[error("never: {0}")] + Never(#[from] Infallible) } From 7af7bd71ecc246760c18406e3f18bb630e98c6b0 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Wed, 18 Oct 2023 22:34:47 +0900 Subject: [PATCH 23/25] refactor: extract OutOfRangeError as a single type --- package/origlang-compiler/src/lexer.rs | 21 ++++++++++---------- package/origlang-compiler/src/lexer/error.rs | 14 ++++++++----- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 1e142075..e9954817 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -16,6 +16,7 @@ use origlang_ast::{Comment, Identifier}; use origlang_source_span::{SourcePosition as SourcePos, Pointed as WithPosition}; use crate::char_list::ASCII_NUMERIC_CHARS; use crate::chars::boundary::{Utf8CharBoundaryStartByte, Utf8CharStride}; +use crate::lexer::error::OutOfRangeError; use crate::lexer::token::{TemporalLexerUnwindToken, Token}; static KEYWORDS: [&str; 12] = @@ -78,8 +79,10 @@ impl Lexer { let end_exclusive = start.as_usize() + s.len(); if let Some(b) = self.source.get((start.as_usize())..end_exclusive) { if s == b { - self.set_current_index(Utf8CharBoundaryStartByte::new(end_exclusive))?; - Ok(Some(s)) + match self.set_current_index(Utf8CharBoundaryStartByte::new(end_exclusive)) { + Ok(_) => Ok(Some(s)), + Err(OutOfRangeError { .. }) => Ok(None), + } } else { Ok(None) } @@ -238,11 +241,7 @@ impl Lexer { let s = unsafe { this.source.get_unchecked(index..(index + stride.as_usize())) }; - let c = s.chars().next().ok_or(LexerError::OutOfRange { - current: current_boundary, - // bytes in UTF-8 - max: this.source.len(), - })?; + let c = s.chars().next().ok_or(this.report_out_of_range_error())?; Ok(c) @@ -348,12 +347,12 @@ impl Lexer { Ok(Token::StringLiteral(s)) } - fn advance_bytes(&self, advance: usize) -> Result<(), Infallible> { + fn advance_bytes(&self, advance: usize) -> Result<(), OutOfRangeError> { self.set_current_index(Utf8CharBoundaryStartByte::new(self.source_bytes_nth.get().as_usize() + advance)) } #[inline(never)] - fn set_current_index(&self, future_index: Utf8CharBoundaryStartByte) -> Result<(), Infallible> { + fn set_current_index(&self, future_index: Utf8CharBoundaryStartByte) -> Result<(), OutOfRangeError> { let old = self.source_bytes_nth.get().as_usize(); let new = future_index.as_usize(); @@ -551,9 +550,9 @@ impl Lexer { } fn report_out_of_range_error(&self) -> LexerError { - LexerError::OutOfRange { + LexerError::OutOfRange(OutOfRangeError { current: self.source_bytes_nth.get(), max: self.source.len(), - } + }) } } diff --git a/package/origlang-compiler/src/lexer/error.rs b/package/origlang-compiler/src/lexer/error.rs index fc53cbd9..b9c4fa8e 100644 --- a/package/origlang-compiler/src/lexer/error.rs +++ b/package/origlang-compiler/src/lexer/error.rs @@ -7,11 +7,8 @@ use crate::chars::boundary::Utf8CharBoundaryStartByte; pub enum LexerError { #[error("Invalid suffix for integer literal. Supported suffixes are [`i8`, `i16`, `i32`, `i64`]")] InvalidSuffix, - #[error("Internal compiler error: lexer index overflow: {current:?} > {max}")] - OutOfRange { - current: Utf8CharBoundaryStartByte, - max: usize, - }, + #[error("Internal compiler error: {0}")] + OutOfRange(#[from] OutOfRangeError), #[error("Unclosed string literal was found")] UnclosedStringLiteral, #[error("Input is malformed UTF-8")] @@ -21,3 +18,10 @@ pub enum LexerError { #[error("never: {0}")] Never(#[from] Infallible) } + +#[derive(Debug, Error, Eq, PartialEq)] +#[error("lexer index overflow: {current:?} > {max}")] +pub struct OutOfRangeError { + pub current: Utf8CharBoundaryStartByte, + pub max: usize, +} From 8453dda8ff0a4aff43fb935020562b32b1a3b6b0 Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Wed, 18 Oct 2023 22:35:40 +0900 Subject: [PATCH 24/25] refactor: delete char_list --- package/origlang-compiler/src/char_list.rs | 4 ---- package/origlang-compiler/src/lexer.rs | 1 - package/origlang-compiler/src/lib.rs | 1 - 3 files changed, 6 deletions(-) delete mode 100644 package/origlang-compiler/src/char_list.rs diff --git a/package/origlang-compiler/src/char_list.rs b/package/origlang-compiler/src/char_list.rs deleted file mode 100644 index d3d0da1d..00000000 --- a/package/origlang-compiler/src/char_list.rs +++ /dev/null @@ -1,4 +0,0 @@ -pub static ASCII_NUMERIC_CHARS: [char; 10] = - ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']; -pub static ASCII_LOWERS: [char; 26] = - ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']; diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index e9954817..59037136 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -14,7 +14,6 @@ use log::{debug, trace, warn}; use self::error::LexerError; use origlang_ast::{Comment, Identifier}; use origlang_source_span::{SourcePosition as SourcePos, Pointed as WithPosition}; -use crate::char_list::ASCII_NUMERIC_CHARS; use crate::chars::boundary::{Utf8CharBoundaryStartByte, Utf8CharStride}; use crate::lexer::error::OutOfRangeError; use crate::lexer::token::{TemporalLexerUnwindToken, Token}; diff --git a/package/origlang-compiler/src/lib.rs b/package/origlang-compiler/src/lib.rs index beb4719a..7a27dab7 100644 --- a/package/origlang-compiler/src/lib.rs +++ b/package/origlang-compiler/src/lib.rs @@ -1,7 +1,6 @@ #![deny(clippy::all)] #![warn(clippy::pedantic, clippy::nursery)] -mod char_list; pub mod lexer; pub mod parser; pub mod type_check; From 77c1f7d18de3d3f1f792da7549710d942ed18f3f Mon Sep 17 00:00:00 2001 From: Kisaragi Marine Date: Wed, 18 Oct 2023 22:36:01 +0900 Subject: [PATCH 25/25] refactor: remove unnecessary imports --- package/origlang-compiler/src/lexer.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 59037136..e454f85e 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -8,8 +8,6 @@ use std::cell::Cell; use std::convert::Infallible; use std::num::NonZeroUsize; -use std::ops::ControlFlow; -use std::panic::RefUnwindSafe; use log::{debug, trace, warn}; use self::error::LexerError; use origlang_ast::{Comment, Identifier};