diff --git a/package/origlang-compiler/src/char_list.rs b/package/origlang-compiler/src/char_list.rs deleted file mode 100644 index d3d0da1d..00000000 --- a/package/origlang-compiler/src/char_list.rs +++ /dev/null @@ -1,4 +0,0 @@ -pub static ASCII_NUMERIC_CHARS: [char; 10] = - ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']; -pub static ASCII_LOWERS: [char; 26] = - ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']; diff --git a/package/origlang-compiler/src/chars.rs b/package/origlang-compiler/src/chars.rs index 8f9f4f30..65608ccc 100644 --- a/package/origlang-compiler/src/chars.rs +++ b/package/origlang-compiler/src/chars.rs @@ -1,3 +1 @@ pub mod boundary; -pub mod occurrence; -pub mod line; diff --git a/package/origlang-compiler/src/chars/line.rs b/package/origlang-compiler/src/chars/line.rs deleted file mode 100644 index 328b30e1..00000000 --- a/package/origlang-compiler/src/chars/line.rs +++ /dev/null @@ -1,115 +0,0 @@ -use thiserror::Error; -use origlang_source_span::{SourcePosition as SourcePos}; -use crate::chars::boundary::Utf8CharBoundaryStartByte; -use crate::chars::occurrence::OccurrenceSet; - -#[allow(clippy::module_name_repetitions)] -pub struct LineComputation; - -impl LineComputation { - pub fn compute(future_index: Utf8CharBoundaryStartByte, new_line_occurrences: &OccurrenceSet) -> Result { - /* - // This may be an error, however this snippet leads to infinite loop. - if new_line_occurrences.contains(&future_index) { - return Err(LineComputationError::PointedOnNewLine) - } - */ - - let future_line = new_line_occurrences.count_lowers_exclusive(&future_index) + 1; - - let most_recent_new_line_occurrence_codepoint = new_line_occurrences - .max_upper_bounded_exclusive(&future_index) - // if future_index is still on first line, there's no such occurrence - substitute - // this value with zero to leave future_index as is. 
- .copied() - .unwrap_or(Utf8CharBoundaryStartByte::new(0)); - - assert!(future_index >= most_recent_new_line_occurrence_codepoint, "{future_index:?} >= {most_recent_new_line_occurrence_codepoint:?}"); - let future_line_column = future_index.as_usize() - most_recent_new_line_occurrence_codepoint.as_usize(); - - Ok(SourcePos { - line: future_line.try_into().map_err(|_| LineComputationError::LineIsZero)?, - column: future_line_column.try_into().map_err(|_| LineComputationError::ColumnIsZero)?, - }) - } -} - -#[derive(Error, Debug, Eq, PartialEq, Copy, Clone)] -#[allow(clippy::module_name_repetitions)] -pub enum LineComputationError { - #[error("line number is zero")] - LineIsZero, - #[error("column number is zero")] - ColumnIsZero, - #[error("out of range")] - OutOfRange, -} - -#[cfg(test)] -mod tests { - use origlang_source_span::{SourcePosition as SourcePos}; - use crate::chars::boundary::Utf8CharBoundaryStartByte; - use crate::chars::line::LineComputation; - use crate::chars::occurrence::OccurrenceSet; - - #[test] - fn no_newline() { - assert_eq!( - LineComputation::compute(Utf8CharBoundaryStartByte::new(12), &OccurrenceSet::default()), - Ok(SourcePos { - line: 1.try_into().unwrap(), - column: 12.try_into().unwrap(), - }) - ); - } - - #[test] - fn single_newline_pre() { - assert_eq!( - LineComputation::compute(Utf8CharBoundaryStartByte::new(1), &OccurrenceSet::new( - vec![Utf8CharBoundaryStartByte::new(100)] - ).unwrap()), - Ok(SourcePos { - line: 1.try_into().unwrap(), - column: 1.try_into().unwrap(), - }) - ) - } - - #[test] - fn single_newline_pre_99() { - assert_eq!( - LineComputation::compute(Utf8CharBoundaryStartByte::new(99), &OccurrenceSet::new( - vec![Utf8CharBoundaryStartByte::new(100)] - ).unwrap()), - Ok(SourcePos { - line: 1.try_into().unwrap(), - column: 99.try_into().unwrap(), - }) - ) - } - - #[test] - fn single_newline_post() { - assert_eq!( - LineComputation::compute(Utf8CharBoundaryStartByte::new(101), &OccurrenceSet::new( - 
vec![Utf8CharBoundaryStartByte::new(100)] - ).unwrap()), - Ok(SourcePos { - line: 2.try_into().unwrap(), - column: 1.try_into().unwrap(), - }) - ) - } - - #[test] - fn single_newline_point_is_not_an_error() { - assert_eq!( - LineComputation::compute(Utf8CharBoundaryStartByte::new(100), &OccurrenceSet::new(vec![Utf8CharBoundaryStartByte::new(100)]).unwrap()), - Ok(SourcePos { - line: 1.try_into().unwrap(), - column: 100.try_into().unwrap(), - }) - ) - } -} \ No newline at end of file diff --git a/package/origlang-compiler/src/chars/occurrence.rs b/package/origlang-compiler/src/chars/occurrence.rs deleted file mode 100644 index a2483a7e..00000000 --- a/package/origlang-compiler/src/chars/occurrence.rs +++ /dev/null @@ -1,334 +0,0 @@ -#[derive(Clone, Eq, PartialEq, Debug, Hash)] -#[allow(clippy::module_name_repetitions)] -/// Contains "sorted" values. Unlike [std::collections::BTreeSet], this collection has vector internally, -/// for performance optimization. -pub struct OccurrenceSet(Vec); - -// TODO(nightly): -// once the nightly feature called `is_sorted` become stable, replace call to this function -// with stabilized one. -fn is_sorted(slice: &[T]) -> bool { - if slice.len() <= 1 { - true - } else { - slice.iter().fold((true, &slice[0]), |(b, e), f| { - (b && e <= f, f) - }).0 - } -} - -impl OccurrenceSet { - pub fn new(v: Vec) -> Option { - if v.len() <= 1 { - Some(Self(v)) - } else if Self::invariant_was_satisfied(&v) { - // SAFETY: we've checked precondition. 
- unsafe { - Some(Self::new_unchecked(v)) - } - } else { - None - } - } - - fn invariant_was_satisfied(v: &[T]) -> bool { - is_sorted(v) - } - - pub unsafe fn new_unchecked(v: Vec) -> Self { - debug_assert!(Self::invariant_was_satisfied(&v), "invariant was violated"); - - Self(v) - } - - pub fn count_lowers_exclusive(&self, upper: &T) -> usize { - let mut i = 0; - let values: &[T] = &self.0; - let mut run_rest = true; - if values.len() >= 6400 { - // if values are too many to being cached in L1 storage, - // switch strategy to binary_search. - // This operation always return correct value, as underlying source - // is guaranteed to be sorted in ascending order. - return values.binary_search(upper).map_or_else(|x| x, |x| x); - } else if values.len() >= 8 { - while i < values.len() - 8 { - // SAFETY: above condition ensures that no OOB-reads happen. - let v1 = unsafe { values.get_unchecked(i) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v2 = unsafe { values.get_unchecked(i + 1) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v3 = unsafe { values.get_unchecked(i + 2) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v4 = unsafe { values.get_unchecked(i + 3) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v5 = unsafe { values.get_unchecked(i + 4) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v6 = unsafe { values.get_unchecked(i + 5) }; - // SAFETY: above condition ensures that no OOB-reads happen. - let v7 = unsafe { values.get_unchecked(i + 6) }; - // SAFETY: above condition ensures that no OOB-reads happen. 
- let v8 = unsafe { values.get_unchecked(i + 7) }; - - let upper = &upper; - if v8 < upper { - // let CPU to guess what is going on, manual _mm_prefetch is inefficient - i += 8; - } else { - // v8 >= upper - // partition point must be in v1..v8 - if v8 < upper { - i += 8; - } else if v7 < upper { - i += 7; - } else if v6 < upper { - i += 6; - } else if v5 < upper { - i += 5; - } else if v4 < upper { - i += 4; - } else if v3 < upper { - i += 3; - } else if v2 < upper { - i += 2; - } else if v1 < upper { - i += 1; - } - - run_rest = false; - break - } - } - } - - if run_rest { - let j = i; - for x in &values[j..] { - if x < upper { - i += 1; - } - } - } - - i - } - - pub fn max_upper_bounded_exclusive(&self, upper: &T) -> Option<&T> { - let values: &[T] = &self.0; - - let k = self.count_lowers_exclusive(upper); - if k == 0 { - None - } else { - values.get(k - 1) - } - } -} - -// You can construct empty OccurrenceSet even if T: !Default -impl Default for OccurrenceSet { - fn default() -> Self { - Self(vec![]) - } -} - -#[cfg(test)] -mod tests { - use crate::chars::occurrence::{is_sorted, OccurrenceSet}; - - #[test] - fn sorted_empty() { - assert!(is_sorted::(&[])); - } - - #[test] - fn sorted_single() { - assert!(is_sorted(&[1])); - } - - #[test] - fn sorted_double() { - assert!(is_sorted(&[1, 2])); - } - - #[test] - fn sorted_double_negative() { - assert!(!is_sorted(&[2, 1])); - } - - #[test] - fn occurrence_empty() { - let set = OccurrenceSet::::new(vec![]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&0), 0); - } - - #[test] - fn occurrence_single_less() { - let set = OccurrenceSet::::new(vec![0]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&1), 1); - } - - #[test] - fn occurrence_single_eq() { - let set = OccurrenceSet::::new(vec![0]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&0), 0); - } - - #[test] - fn occurrence_single_more() { - let set = OccurrenceSet::::new(vec![1]); - 
assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&0), 0); - } - - #[test] - fn occurrence_8() { - let set = OccurrenceSet::new(vec![1, 2, 3, 4, 5, 6, 7, 8]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&10), 8); - } - - #[test] - fn occurrence_9() { - let set = OccurrenceSet::new(vec![1, 2, 3, 4, 5, 6, 7, 8, 9]); - assert_eq!(set.expect("must be constructed").count_lowers_exclusive(&10), 9); - } - - // #[test] - /* - fn bench() { - const N: usize = 16384; - - // avoids stack overflow in debug mode. - struct OnHeap(Box<[T; N]>); - - impl Distribution> for Standard where Standard: Distribution { - fn sample(&self, rng: &mut R) -> OnHeap { - OnHeap(Box::new(rng.gen::<[T; N]>())) - } - } - - impl Deref for OnHeap { - type Target = [T; N]; - - fn deref(&self) -> &Self::Target { - self.0.as_ref() - } - } - - impl IntoIterator for OnHeap { - type Item = T; - type IntoIter = core::array::IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } - } - - impl<'a, T: 'a, const N: usize> IntoIterator for &'a OnHeap { - type Item = &'a T; - type IntoIter = core::slice::Iter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - self.0.iter() - } - } - - let haystack = rand::random::>(); - let find = rand::random::>(); - println!("1"); - let occ_time = { - let now = Instant::now(); - let mut x = haystack.to_vec(); - x.sort(); - - let mut buf = Vec::with_capacity(N); - - // let now = Instant::now(); - let occ = OccurrenceSet::new(x).expect("it is not empty"); - for f in &find { - buf.push(occ.count_lowers_exclusive(f)); - } - - (buf, now.elapsed()) - }; - println!("1"); - - let bt_time = { - let mut buf = Vec::with_capacity(N); - - let now = Instant::now(); - let mut occ: BTreeSet = BTreeSet::new(); - occ.extend(&haystack); - - for upper in &find { - buf.push(occ.iter().filter(|x| *x < upper).count()); - } - - (buf, now.elapsed()) - }; - println!("1"); - - let bh_time = { - let mut buf = Vec::with_capacity(N); - - let 
now = Instant::now(); - let mut occ: BinaryHeap = BinaryHeap::new(); - occ.extend(&haystack); - - for upper in &find { - buf.push(occ.iter().filter(|x| *x < upper).count()); - } - - (buf, now.elapsed()) - }; - println!("1"); - - let vec_time = { - let mut buf = Vec::with_capacity(N); - - let now = Instant::now(); - let mut occ: Vec = Vec::with_capacity(N); - occ.extend(&haystack); - - for upper in &find { - buf.push(occ.iter().filter(|x| *x < upper).count()); - } - - (buf, now.elapsed()) - }; - - - let vec_ni_time = { - let mut buf = Vec::with_capacity(N); - - let now = Instant::now(); - let mut occ: Vec = Vec::with_capacity(N); - occ.extend(&haystack); - - for upper in find { - let mut count = 0usize; - for x in &occ { - if *x < upper { - count += 1; - } - } - - buf.push(count); - } - - (buf, now.elapsed()) - }; - - println!("1"); - - // let vec_ni_time = (vec![1], Duration::new(0, 0)); - assert_eq!(occ_time.0, bt_time.0); - assert_eq!(bh_time.0, bt_time.0); - assert_eq!(vec_time.0, bt_time.0); - assert_eq!(vec_ni_time.0, bt_time.0); - - println!("impl: {o:?} | bin tree:{b:?} | bin heap: {bh:?} | vec_iter: {v:?} | vec: {vi:?}", o = occ_time.1, b = bt_time.1, bh = bh_time.1, v = vec_time.1, vi = vec_ni_time.1); - } - - - */ -} \ No newline at end of file diff --git a/package/origlang-compiler/src/lexer.rs b/package/origlang-compiler/src/lexer.rs index 9e79ad8a..e454f85e 100644 --- a/package/origlang-compiler/src/lexer.rs +++ b/package/origlang-compiler/src/lexer.rs @@ -5,18 +5,15 @@ pub mod token; use std::borrow::Cow; use std::cell::Cell; +use std::convert::Infallible; use std::num::NonZeroUsize; -use std::ops::ControlFlow; -use std::panic::RefUnwindSafe; use log::{debug, trace, warn}; use self::error::LexerError; use origlang_ast::{Comment, Identifier}; use origlang_source_span::{SourcePosition as SourcePos, Pointed as WithPosition}; -use crate::char_list::ASCII_NUMERIC_CHARS; use crate::chars::boundary::{Utf8CharBoundaryStartByte, Utf8CharStride}; -use 
crate::chars::line::{LineComputation, LineComputationError}; -use crate::chars::occurrence::OccurrenceSet; +use crate::lexer::error::OutOfRangeError; use crate::lexer::token::{TemporalLexerUnwindToken, Token}; static KEYWORDS: [&str; 12] = @@ -39,9 +36,8 @@ impl AssociateWithPos for T { pub struct Lexer { source_bytes_nth: Cell, source: String, - current_line: Cell, - current_column: Cell, - newline_codepoint_nth_index: OccurrenceSet, + line: Cell, + column: Cell, } impl Lexer { @@ -53,70 +49,45 @@ impl Lexer { Cow::Borrowed(source) }; - let newline_codepoint_nth_index = src.bytes().enumerate() - .filter(|(_, x)| *x == b'\n') - .map(|(i, _)| Utf8CharBoundaryStartByte::new(i)) - // we can't use try_collect because it requires nightly compiler. - // we also can't have FromIterator for OccurrenceSet where T: Ord because doing so may - // break invariant of OccurrenceSet (i.e. the underlying iterator was not sorted.) - .collect::>(); - - // SAFETY: inner value has sorted, because: - // char_indices yields sorted index. - let newline_codepoint_nth_index = unsafe { - OccurrenceSet::new_unchecked(newline_codepoint_nth_index) - }; - Self { source_bytes_nth: Cell::new(Utf8CharBoundaryStartByte::new(0)), - current_line: Cell::new( - // SAFETY: 1 != 0 - unsafe { NonZeroUsize::new_unchecked(1) } - ), - current_column: Cell::new( - // SAFETY: 1 != 0 - unsafe { NonZeroUsize::new_unchecked(1) } - ), source: src.to_string(), - newline_codepoint_nth_index + line: Cell::new(NonZeroUsize::new(1).unwrap()), + column: Cell::new(NonZeroUsize::new(1).unwrap()), } } fn drain_space(&self) { - while !self.reached_end() && matches!(self.current_char().expect("drain_space"), ' ' | '\t') { - self.consume_char().unwrap(); - } - } - - fn try_char(&self, t: char) -> Result, LexerError> { - trace!("lexer:try:{t:?}"); - if !self.reached_end() && self.current_char()? 
== t { - self.consume_char()?; - Ok(Some(t)) - } else { - Ok(None) + trace!("drain_space: start vvvvvvvvvvvvvvvvvvv"); + while !self.reached_end() { + if self.try_and_eat_str(" ").unwrap() == Some(" ") || self.try_and_eat_str("\t").unwrap() == Some("\t") { + } else { + break + } } + trace!("drain_space: end ^^^^^^^^^^^^^^^^^^^"); } - fn try_char_peek(&self, t: char) -> Result, LexerError> { - trace!("lexer:try:{t}"); - if !self.reached_end() && self.current_char()? == t { - Ok(Some(t)) + /// Note + /// calling [`Self::advance_bytes`], [`Self::advance`], or [`Self::set_current_index`] is error-prone. + fn try_and_eat_str<'s>(&self, s: &'s str) -> Result, Infallible> { + trace!("lexer:try:{s:?}"); + let start = self.source_bytes_nth.get(); + let end_exclusive = start.as_usize() + s.len(); + if let Some(b) = self.source.get((start.as_usize())..end_exclusive) { + if s == b { + match self.set_current_index(Utf8CharBoundaryStartByte::new(end_exclusive)) { + Ok(_) => Ok(Some(s)), + Err(OutOfRangeError { .. }) => Ok(None), + } + } else { + Ok(None) + } } else { Ok(None) } } - fn try_any(&self, t: &[char]) -> Result, LexerError> { - for c in t { - if let Some(x) = self.try_char_peek(*c)? { - return Ok(Some(x)) - } - } - - Ok(None) - } - #[allow(clippy::too_many_lines, clippy::unnecessary_wraps)] fn next_inner(&self) -> Result { macro_rules! 
fold { @@ -134,12 +105,12 @@ impl Lexer { } else { None } - .or_else(|| self.try_char('\n').expect("huh?").map(|_| Token::NewLine)) + .or_else(|| self.try_and_eat_str("\n").expect("huh?").map(|_| Token::NewLine)) .or_else(|| fold!( - self.try_char('=').expect("huh?"), + self.try_and_eat_str(r#"="#).expect("huh?"), { - let double_eq = self.try_char('=').expect("huh?"); + let double_eq = self.try_and_eat_str(r#"="#).expect("huh?"); if double_eq.is_some() { Some(Token::PartEqEq) } else { @@ -149,31 +120,31 @@ impl Lexer { None ) ) - .or_else(|| self.try_char('+').expect("huh?").map(|_| Token::SymPlus)) - .or_else(|| self.try_char('-').expect("huh?").map(|_| Token::SymMinus)) - .or_else(|| self.try_char('*').expect("huh?").map(|_| Token::SymAsterisk)) + .or_else(|| self.try_and_eat_str(r#"+"#).expect("huh?").map(|_| Token::SymPlus)) + .or_else(|| self.try_and_eat_str(r#"-"#).expect("huh?").map(|_| Token::SymMinus)) + .or_else(|| self.try_and_eat_str(r#"*"#).expect("huh?").map(|_| Token::SymAsterisk)) .or_else(|| fold!( - self.try_char('/').expect("huh?"), + self.try_and_eat_str(r#"/"#).expect("huh?"), fold!( - self.try_char('/').expect("huh?"), + self.try_and_eat_str(r#"/"#).expect("huh?"), Some(self.scan_line_comment().expect("unable to parse comment")), Some(Token::SymSlash) ), None ) ) - .or_else(|| self.try_char('(').expect("huh?").map(|_| Token::SymLeftPar)) - .or_else(|| self.try_char(')').expect("huh?").map(|_| Token::SymRightPar)) + .or_else(|| self.try_and_eat_str(r#"("#).expect("huh?").map(|_| Token::SymLeftPar)) + .or_else(|| self.try_and_eat_str(r#")"#).expect("huh?").map(|_| Token::SymRightPar)) .or_else(|| { - if let Some(_) = self.try_char('<').expect("huh?") { - if let Some(_) = self.try_char('=').expect("huh?") { - if let Some(_) = self.try_char('>').expect("huh?") { + if let Some(_) = self.try_and_eat_str(r#"<"#).expect("huh?") { + if let Some(_) = self.try_and_eat_str(r#"="#).expect("huh?") { + if let Some(_) = 
self.try_and_eat_str(r#">"#).expect("huh?") { Some(Token::PartLessEqMore) } else { Some(Token::PartLessEq) } - } else if let Some(_) = self.try_char('<').expect("huh?") { + } else if let Some(_) = self.try_and_eat_str(r#"<"#).expect("huh?") { Some(Token::PartLessLess) } else { Some(Token::SymLess) @@ -183,10 +154,10 @@ impl Lexer { } }) .or_else(|| { - if let Some(_) = self.try_char('>').expect("huh?") { - if let Some(_) = self.try_char('=').expect("huh?") { + if let Some(_) = self.try_and_eat_str(r#">"#).expect("huh?") { + if let Some(_) = self.try_and_eat_str(r#"="#).expect("huh?") { Some(Token::PartMoreEq) - } else if let Some(_) = self.try_char('>').expect("huh?") { + } else if let Some(_) = self.try_and_eat_str(r#">"#).expect("huh?") { Some(Token::PartMoreMore) } else { Some(Token::SymMore) @@ -197,9 +168,9 @@ impl Lexer { }) .or_else(|| fold!( - self.try_char('!').expect("huh?"), + self.try_and_eat_str(r#"!"#).expect("huh?"), fold!( - self.try_char('=').expect("huh?"), + self.try_and_eat_str(r#"="#).expect("huh?"), Some(Token::PartBangEq), Some(Token::SymBang) ), @@ -208,62 +179,34 @@ impl Lexer { ) .or_else(|| fold!( - self.try_char('"').expect("huh?"), + self.try_and_eat_str(r#"""#).expect("huh?"), Some(self.scan_string_literal().expect("unable to parse string literal")), None ) ) - .or_else(|| - fold!( - self.try_any(&ASCII_NUMERIC_CHARS).expect("huh?"), - Some(self.scan_digits().expect("huh?")), - None - ) - ) + .or_else(|| self.scan_digits().expect("huh?")) .or_else(|| fold!( - self.try_char(',').expect("huh?"), + self.try_and_eat_str(r#","#).expect("huh?"), Some(Token::SymComma), None ) ) .or_else(|| fold!( - self.try_char(':').expect("huh?"), + self.try_and_eat_str(r#":"#).expect("huh?"), Some(Token::SymColon), None ) ) .or_else(|| { - self.one_or_many_accumulator( - String::new(), - (true, false), - |x, (first, exit_on_next_iteration)| { - if exit_on_next_iteration { - return ControlFlow::Break(()) - } - - let discarding = x == '_' && first; - let 
is_identifier = x.is_ascii_alphabetic() || (!first && x.is_ascii_digit()); - - if is_identifier { - ControlFlow::Continue((false, false)) - } else if discarding { - ControlFlow::Continue((false, true)) - } else { - ControlFlow::Break(()) - } - }, - |x, identifier| { - identifier.push(x); - debug!("identifier: {identifier:?}"); - } - ) + self.scan_identifier() .ok() + .flatten() .map(|scanned| { - let is_keyword = KEYWORDS.contains(&scanned.as_str()); + let is_keyword = KEYWORDS.contains(&scanned.as_name()); if is_keyword { - match scanned.as_str() { + match scanned.as_name() { "var" => Token::VarKeyword, "true" => Token::KeywordTrue, "false" => Token::KeywordFalse, @@ -281,15 +224,31 @@ impl Lexer { } } } else { - Token::Identifier { inner: Identifier::new(scanned) } + Token::Identifier { inner: scanned } } }) }) // dont eager evaluate - .unwrap_or_else(|| Token::UnexpectedChar { - // TODO: this is cold path, so may convert boundary to char_nth. - index: self.source_bytes_nth.get(), - char: self.current_char().expect("unexpected_char"), + .unwrap_or_else(|| { + fn current_char(this: &Lexer) -> Result { + let current_boundary = this.source_bytes_nth.get(); + let index = current_boundary.as_usize(); + let stride = this.current_char_stride()?; + + + let s = unsafe { this.source.get_unchecked(index..(index + stride.as_usize())) }; + + let c = s.chars().next().ok_or(this.report_out_of_range_error())?; + + + Ok(c) + } + + Token::UnexpectedChar { + // TODO: this is cold path, so may convert boundary to char_nth. 
+ index: self.source_bytes_nth.get(), + char: current_char(self).expect("unexpected_char"), + } }); Ok(v) } @@ -315,235 +274,154 @@ impl Lexer { fn current_pos(&self) -> SourcePos { SourcePos { - line: self.current_line.get(), - column: self.current_column.get(), + line: self.line.get(), + column: self.column.get(), } } - fn one_or_many(&self, scan_while: impl Fn(char) -> bool, ignore_trailing_char_on_exit: bool) -> Result { - let mut buf = String::new(); - loop { - if self.reached_end() { - break - } + fn scan_digit_suffix_opt(&self) -> Result>, LexerError> { + if self.reached_end() { + return Ok(None) + } - let c = self.current_char()?; - if !scan_while(c) { - if ignore_trailing_char_on_exit { - self.consume_char()?; - } + for s in ["i8", "i16", "i32", "i64"] { + let a = self.try_and_eat_str(s)?; - break + if let Some(a) = a { + return Ok(Some(a.to_string().into_boxed_str())) } - let c = self.consume_char()?; - - buf.push(c); } - Ok(buf) + Ok(None) } - fn one_or_many_accumulator( - &self, - scan_sequence_accumulator: Acc, - registers: R, - judge: impl Fn(char, R) -> ControlFlow<(), R>, - accumulate_before_next_iteration_after_break: impl Fn(char, &mut Acc) - ) -> Result { - let mut acc = scan_sequence_accumulator; - let mut registers = registers; + fn scan_digits(&self) -> Result, LexerError> { + debug!("lexer:digit"); + let mut plus = 0; loop { - if self.reached_end() { - break - } + let r = self.byte_skip_n(plus); - let c = self.current_char()?; - let cf = judge(c, registers); - match cf { - ControlFlow::Continue(c) => { - registers = c; - self.consume_char()?; - } - ControlFlow::Break(_b) => { + if let Ok(b) = r { + if (b'0'..=b'9').contains(&b) { + plus += 1; + } else { break } + } else { + break } - - accumulate_before_next_iteration_after_break(c, &mut acc); } - Ok(acc) - } - - fn scan_digit_suffix_opt(&self) -> Result>, LexerError> { - let v = if self.current_char()? == 'i' { - self.consume_char()?; - if self.current_char()?
== '8' { - self.consume_char()?; - Some("i8".to_string().into_boxed_str()) - } else if self.current_char()? == '1' { - self.consume_char()?; - if self.current_char()? == '6' { - self.consume_char()?; - Some("i16".to_string().into_boxed_str()) - } else { - return Err(LexerError::InvalidSuffix); - } - } else if self.current_char()? == '3' { - self.consume_char()?; - if self.current_char()? == '2' { - self.consume_char()?; - Some("i32".to_string().into_boxed_str()) - } else { - return Err(LexerError::InvalidSuffix); - } - } else if self.current_char()? == '6' { - self.consume_char()?; - if self.current_char()? == '4' { - self.consume_char()?; - Some("i64".to_string().into_boxed_str()) - } else { - return Err(LexerError::InvalidSuffix); - } - } else { - return Err(LexerError::InvalidSuffix); - } + if plus == 0 { + Ok(None) } else { - None - }; + let start = self.source_bytes_nth.get().as_usize(); + let end_inclusive = start + plus; + self.set_current_index(Utf8CharBoundaryStartByte::new(end_inclusive))?; - Ok(v) - } + let scanned = self.source[start..end_inclusive].to_string(); + let builtin_suffix = self.scan_digit_suffix_opt()?; - fn scan_digits(&self) -> Result { - debug!("lexer:digit"); - let buf = self.one_or_many(|c| ASCII_NUMERIC_CHARS.contains(&c), false)?; - let builtin_suffix = self.scan_digit_suffix_opt()?; + debug!("digit: done ({scanned} {builtin_suffix:?})"); - Ok(Token::Digits { - sequence: buf, - suffix: builtin_suffix, - }) - } - - fn scan_string_literal(&self) -> Result { - fn calc_skip_byte_in_utf8(start: Utf8CharBoundaryStartByte, source: &str) -> Option { - // well, at least, this code accesses memory to sequential order. 
- const BATCH_SIZE: usize = 32; - let sub_slice = &source.as_bytes()[start.as_usize()..]; - for step in 0..(sub_slice.len() / BATCH_SIZE) { - let offset = step * BATCH_SIZE; - let chunk = &sub_slice[offset..(offset + BATCH_SIZE)]; - for (sub_offset, b) in chunk.iter().enumerate() { - if *b == b'"' { - return Some(Utf8CharBoundaryStartByte::new(offset + sub_offset)) - } + Ok(Some( + Token::Digits { + sequence: scanned, + suffix: builtin_suffix, } - } - - let last_offset = sub_slice.len() / BATCH_SIZE * BATCH_SIZE; - let last_byte = sub_slice.len(); + )) + } - #[allow(clippy::needless_range_loop)] - for offset in last_offset..last_byte { - if sub_slice[offset] == b'"' { - return Some(Utf8CharBoundaryStartByte::new(offset)); - } - } + } - None - } - debug!("lexer:lit:string"); + fn scan_string_literal(&self) -> Result { + let start = self.source_bytes_nth.get().as_usize(); + let rel_pos = self.source[start..].find('"').ok_or(LexerError::UnclosedStringLiteral)?; + self.advance_bytes(rel_pos + 1)?; - // this search is exact at this point. - // However, once we introduce escape sequence or another delimiter for string literal, - // this code is likely to needed to be rewritten. + let s = self.source[start..(start + rel_pos)].to_string(); + Ok(Token::StringLiteral(s)) + } - let Some(skip_byte_in_utf8) = calc_skip_byte_in_utf8(self.source_bytes_nth.get(), &self.source) else { - return Err(LexerError::UnclosedStringLiteral) - }; + fn advance_bytes(&self, advance: usize) -> Result<(), OutOfRangeError> { + self.set_current_index(Utf8CharBoundaryStartByte::new(self.source_bytes_nth.get().as_usize() + advance)) + } - let mut string_char_literal_content = { - // the starting quote is handled in `next_inner`, so this boundary is either first - // char in the literal, or ending quote.
- let maybe_first_char_boundary = self.source_bytes_nth.get(); - let quote_end_boundary = Utf8CharBoundaryStartByte::new(maybe_first_char_boundary.as_usize() + skip_byte_in_utf8.as_usize()); + #[inline(never)] + fn set_current_index(&self, future_index: Utf8CharBoundaryStartByte) -> Result<(), OutOfRangeError> { + let old = self.source_bytes_nth.get().as_usize(); + let new = future_index.as_usize(); - // assert!(found_boundary_nth >= current_chars_nth, "{found_boundary_nth:?} >= {current_chars_nth:?}"); + if old == new { + return Ok(()) + } - let s = &self.source[(maybe_first_char_boundary.as_usize())..(quote_end_boundary.as_usize())]; - self.source_bytes_nth.set(quote_end_boundary); - s.to_string() - }; + let current_line = self.line.get().get(); - loop { - if self.reached_end() { - break - } + let src = &self.source; + if old < new { + // forward + let new_line = current_line + src[old..new].bytes().filter(|x| *x == b'\n').count(); + let new_col = if let Some(old_relative) = src[old..new].rfind('\n') { + // .......................OLD.................NEW + // |<--------N------>| + new - (old + old_relative) + } else { + let mut c = self.column.get().get(); + c += (new - old); - let c = self.current_char()?; - if c == '"' { - // 終わりのダブルクォーテーションは捨てる - self.consume_char()?; - break - } - let c = self.consume_char()?; - string_char_literal_content.push(c); - } - Ok(Token::StringLiteral(string_char_literal_content)) - } + c + }; - #[inline(never)] - fn set_current_index(&self, future_index: Utf8CharBoundaryStartByte) -> Result<(), LineComputationError> { - if future_index == self.source_bytes_nth.get() { - // no computation is needed - Ok(()) + self.line.set(NonZeroUsize::new(new_line).expect("overflow")); + self.column.set(NonZeroUsize::new(new_col).expect("overflow")) } else { - let b = self.source_bytes_nth.get().stride(Utf8CharStride::One); - if future_index == b && self.current_char_stride() == Ok(Utf8CharStride::One) { - return if let Ok(c) = self.current_char() 
{ - self.source_bytes_nth.set(b); - if c == '\n' { - // new line, setting $(L + 1):C. - self.current_line.set(NonZeroUsize::new(self.current_line.get().get() + 1).expect("we do not support this")); - // SAFETY: 1 != 0 - self.current_column.set(unsafe { NonZeroUsize::new_unchecked(1) }); - } else { - // not new line, setting L:$(C + 1). - self.current_column.set(NonZeroUsize::new(self.current_column.get().get() + 1).expect("we do not support this")); - } - Ok(()) + // back + let new_line = current_line - src[new..old].bytes().filter(|x| *x == b'\n').count(); + let new_col = if let Some(new_relative) = src[new..old].find('\n') { + // .......................NEW.................OLD + // |<--------N------>| + let nr = new + new_relative; + if let Some(most_recent_nl) = src[..nr].rfind('\n') { + // ..............NEW.................OLD + // |<--------N------>| + // |<-----MRN-------------->| + + // this is effectively static assertion, should not + // cost on runtime. + assert!(most_recent_nl < nr); + new - most_recent_nl } else { - // ?
- Err(LineComputationError::OutOfRange) + new + 1 } } else { - // trace!("set index to: {future_index}"); - let SourcePos { line, column } = - LineComputation::compute( - future_index.stride(Utf8CharStride::from('\n')), - &self.newline_codepoint_nth_index - )?; - - trace!("compute: {line}:{column}"); - self.source_bytes_nth.set(future_index); - self.current_line.set(line); - self.current_column.set(column); - - Ok(()) - // full computation - } + let mut c = self.column.get().get(); + c -= old - new; + + c + }; + + self.line.set(NonZeroUsize::new(new_line).expect("overflow")); + self.column.set(NonZeroUsize::new(new_col).expect("overflow")) } + + debug!("index: requested = {future_index:?}"); + self.source_bytes_nth.set(future_index); + + Ok(()) } fn scan_line_comment(&self) -> Result { - let content = self.one_or_many(|c| c != '\n', false)?; + let start = self.source_bytes_nth.get().as_usize(); + let rel_pos = self.source[start..].find("\n").unwrap_or(self.source.len() - start); + self.advance_bytes(rel_pos)?; + let content = self.source[start..(start + rel_pos)].to_string(); Ok(Token::Comment { content: Comment { - content, - }, + content + } }) } @@ -597,44 +475,11 @@ impl Lexer { Ok(stride) } - fn current_char(&self) -> Result { - let current_boundary = self.source_bytes_nth.get(); - let index = current_boundary.as_usize(); - let stride = self.current_char_stride()?; - - - let s = unsafe { self.source.get_unchecked(index..(index + stride.as_usize())) }; - - let c = s.chars().next().ok_or(LexerError::OutOfRange { - current: current_boundary, - // bytes in UTF-8 - max: self.source.len(), - })?; - - - Ok(c) - } - - pub(crate) fn consume_char(&self) -> Result { - let c = self.current_char()?; - // trace!("consume: `{c}` (\\U{{{k:06X}}})", k = c as u32); - self.advance(); - Ok(c) - } - fn reached_end(&self) -> bool { // <&str>::len() yields length of BYTES, not CHARS self.source_bytes_nth.get().as_usize() >= self.source.len() } - fn advance(&self) { - trace!("lexer:advance"); -
let new = self.source_bytes_nth.get().stride(self.current_char_stride().unwrap()); - self.set_current_index(new).map_err(|e| { - warn!("discarding error: {e}"); - }).unwrap_or_default(); - } - /// パースに失敗するかも知れないものをパースしようと試みる。 /// 成功したならパースした値 /// 失敗したならNoneを返しつつ内部インデックスをこの関数を呼び出したときの値に戻す: @@ -656,4 +501,55 @@ impl Lexer { fn create_reset_token(&self) -> TemporalLexerUnwindToken { TemporalLexerUnwindToken::new(self.source_bytes_nth.get()) } + + fn scan_identifier(&self) -> Result, LexerError> { + debug!("lexer:identifier"); + + let first = self.current_byte()?; + let mut plus = 0; + + if first.is_ascii_alphabetic() || first == b'_' { + plus += 1; + loop { + trace!("lexer:identifier: {plus}"); + match self.byte_skip_n(plus) { + Ok(b) => { + if b.is_ascii_alphanumeric() || b == b'_' { + plus += 1; + } else { + break + } + } + Err(e) => { + warn!("discarding error: {e}"); + break + } + } + } + + debug!("lexer:identifier: length of {plus}"); + let start = self.source_bytes_nth.get().as_usize(); + let s = Identifier::new(self.source[start..(start + plus)].to_string()); + self.advance_bytes(plus)?; + + Ok(Some(s)) + } else { + Ok(None) + } + } + + fn current_byte(&self) -> Result { + self.source.bytes().nth(self.source_bytes_nth.get().as_usize()).ok_or_else(|| self.report_out_of_range_error()) + } + + fn byte_skip_n(&self, skip: usize) -> Result { + self.source.bytes().nth(self.source_bytes_nth.get().as_usize() + skip).ok_or_else(|| self.report_out_of_range_error()) + } + + fn report_out_of_range_error(&self) -> LexerError { + LexerError::OutOfRange(OutOfRangeError { + current: self.source_bytes_nth.get(), + max: self.source.len(), + }) + } } diff --git a/package/origlang-compiler/src/lexer/error.rs b/package/origlang-compiler/src/lexer/error.rs index a3c33e43..b9c4fa8e 100644 --- a/package/origlang-compiler/src/lexer/error.rs +++ b/package/origlang-compiler/src/lexer/error.rs @@ -1,3 +1,4 @@ +use std::convert::Infallible; use thiserror::Error; use 
crate::chars::boundary::Utf8CharBoundaryStartByte; @@ -6,15 +7,21 @@ use crate::chars::boundary::Utf8CharBoundaryStartByte; pub enum LexerError { #[error("Invalid suffix for integer literal. Supported suffixes are [`i8`, `i16`, `i32`, `i64`]")] InvalidSuffix, - #[error("Internal compiler error: lexer index overflow: {current:?} > {max}")] - OutOfRange { - current: Utf8CharBoundaryStartByte, - max: usize, - }, + #[error("Internal compiler error: {0}")] + OutOfRange(#[from] OutOfRangeError), #[error("Unclosed string literal was found")] UnclosedStringLiteral, #[error("Input is malformed UTF-8")] MalformedAsUtf8 { boundary: Utf8CharBoundaryStartByte, }, + #[error("never: {0}")] + Never(#[from] Infallible) +} + +#[derive(Debug, Error, Eq, PartialEq)] +#[error("lexer index overflow: {current:?} > {max}")] +pub struct OutOfRangeError { + pub current: Utf8CharBoundaryStartByte, + pub max: usize, } diff --git a/package/origlang-compiler/src/lexer/tests.rs b/package/origlang-compiler/src/lexer/tests.rs index 60cc549c..4fc610f7 100644 --- a/package/origlang-compiler/src/lexer/tests.rs +++ b/package/origlang-compiler/src/lexer/tests.rs @@ -93,17 +93,6 @@ fn parse_string_literal_mixed_4_3() { test("\u{10000}あ") } -#[test] -fn avoid_off_read() { - const S: &str = r#"var x = "4あ" -"#; - let lexer = Lexer::create(S); - let k = S.chars().count(); - for i in 0..k { - assert_eq!(lexer.consume_char().expect("oops"), S.chars().nth(i).expect("out of bounds from literal")) - } -} - use std::num::NonZeroUsize; use origlang_source_span::{Pointed, SourcePosition}; @@ -195,3 +184,17 @@ fn token_location() { } }); } + +#[test] +fn digit_regression() { + const D: &str = "123456"; + let lexer = Lexer::create(D); + assert_eq!(lexer.next().data, Token::Digits { + sequence: D.to_string(), + suffix: None, + }); + + const EMPTY: &str = ""; + let lexer = Lexer::create(EMPTY); + assert_eq!(lexer.next().data, Token::EndOfFile); +} diff --git a/package/origlang-compiler/src/lib.rs 
b/package/origlang-compiler/src/lib.rs index beb4719a..7a27dab7 100644 --- a/package/origlang-compiler/src/lib.rs +++ b/package/origlang-compiler/src/lib.rs @@ -1,7 +1,6 @@ #![deny(clippy::all)] #![warn(clippy::pedantic, clippy::nursery)] -mod char_list; pub mod lexer; pub mod parser; pub mod type_check;