From 82c958277605876f9f7ebd3c520769afcc5ad6fe Mon Sep 17 00:00:00 2001 From: Thierry Arnoux <5831830+tirix@users.noreply.github.com> Date: Sun, 5 Jan 2025 17:49:06 +0100 Subject: [PATCH] Fix for #174, more parser checks and more parser tests --- metamath-rs/src/diag.rs | 14 +++ metamath-rs/src/parser.rs | 18 +++- metamath-rs/src/parser_tests.rs | 182 +++++++++++++++++++++++++++++++- metamath-rs/src/statement.rs | 2 +- 4 files changed, 210 insertions(+), 6 deletions(-) diff --git a/metamath-rs/src/diag.rs b/metamath-rs/src/diag.rs index 7bab1e1..6ba290c 100644 --- a/metamath-rs/src/diag.rs +++ b/metamath-rs/src/diag.rs @@ -145,6 +145,7 @@ pub enum Diagnostic { MissingLabel, MissingMarkupDef([bool; 3], Span), MissingProof(Span), + MissingSpaceAfterCommandToken(Span), MMReservedLabel(Span), NestedComment(Span, Span), NotActiveSymbol(TokenIndex), @@ -183,6 +184,7 @@ pub enum Diagnostic { UnclosedBeforeInclude(StatementIndex), UnclosedCommandComment(Span), UnclosedCommandString(Span), + UnclosedCommand(Span), UnclosedComment(Span), UnclosedHtml(u32, u32), UnclosedInclude, @@ -734,6 +736,12 @@ impl Diagnostic { stmt, *math_end, )]), + MissingSpaceAfterCommandToken(span) => ("Missing space after command token".into(), vec![( + Level::Error, + "A quoted command token must be followed by a space or a semicolon (;)".into(), + stmt, + *span, + )]), MMReservedLabel(span) => ("Reserved label".into(), vec![( Level::Warning, "Labels beginning with 'mm' are reserved for Metamath file names".into(), @@ -1017,6 +1025,12 @@ impl Diagnostic { stmt, *span, )]), + UnclosedCommand(span) => ("Unclosed command".into(), vec![( + Level::Error, + "A command must be closed with a semicolon (;)".into(), + stmt, + *span, + )]), UnclosedComment(comment) => ("Unclosed comment".into(), vec![( Level::Error, "Comment requires closing $) before end of file".into(), diff --git a/metamath-rs/src/parser.rs b/metamath-rs/src/parser.rs index 0d1e944..1af03b9 100644 --- a/metamath-rs/src/parser.rs +++ b/metamath-rs/src/parser.rs @@ -801,8 +801,8 @@ fn get_heading_name(buffer: &[u8], pos: FilePos) -> TokenPtr<'_> { } /// Extract the parser commands out of a $t or $j special comment -fn commands(buffer: &[u8], ch: u8, pos: FilePos) -> Result, Diagnostic> { - let mut iter = CommandIter { +pub(crate) fn commands(buffer: &[u8], ch: u8, pos: FilePos) -> Result, Diagnostic> { + let mut iter: CommandIter<'_> = CommandIter { buffer, index: pos as usize, }; @@ -815,7 +815,7 @@ fn commands(buffer: &[u8], ch: u8, pos: FilePos) -> Result, Diag Ok(iter) } -struct CommandIter<'a> { +pub(crate) struct CommandIter<'a> { buffer: &'a [u8], index: usize, } @@ -904,6 +904,12 @@ impl Iterator for CommandIter<'_> { } } } + if self.has_more() + && !matches!(self.next_char(), b' ' | b'\t' | b'\n' | b'\r' | b';') + { + let cspan = Span::new(token_start, self.index); + return Some(Err(Diagnostic::MissingSpaceAfterCommandToken(cspan))); + } CommandToken::String(Span::new(token_start, self.index - 1)) } else { let token_start = self.index; @@ -920,7 +926,11 @@ impl Iterator for CommandIter<'_> { if let Err(diag) = self.skip_white_spaces() { return Some(Err(diag)); } - if self.has_more() && self.next_char() == b';' { + if !self.has_more() { + let cspan = Span::new(command_start, self.index); + return Some(Err(Diagnostic::UnclosedCommand(cspan))); + } + if self.next_char() == b';' { // Stop if unquoted semicolon $) self.index += 1; return Some(Ok((Span::new(command_start, self.index), command))); diff --git a/metamath-rs/src/parser_tests.rs b/metamath-rs/src/parser_tests.rs index 2127c77..d8b933b 100644 --- a/metamath-rs/src/parser_tests.rs +++ b/metamath-rs/src/parser_tests.rs @@ -1,8 +1,9 @@ use crate::database::Database; use crate::diag::Diagnostic; +use crate::parser::commands; use crate::segment::{Comparer, SegmentOrder}; use crate::statement::{Span, StatementAddress, TokenRef, NO_STATEMENT}; -use crate::StatementType; +use crate::{as_str, StatementType}; use std::cmp::Ordering; #[test] @@ -150,3 +151,182 @@ parse_test!( b"$c X Y\x7F $.", [(0, Diagnostic::BadCharacter(6, 0x7F))] ); + +parse_test!( + test_j_valid, + b"$( $j usage 'exel' avoids 'ax-5' 'ax-7' 'ax-8' 'ax-9' 'ax-10' 'ax-11' + 'ax-12' 'ax-13' 'ax-ext' 'ax-sep' 'ax-nul' 'ax-pow'; $)", + [] +); + +parse_test!( + test_j_missing_closing_quote, + b"$( $j usage 'exel $)", + [( + 0, + Diagnostic::UnclosedCommandString(Span { start: 13, end: 20 }) + )] +); + +parse_test!( + test_j_unclosed_comment, + b"$( $j usage /* $)", + [( + 0, + Diagnostic::UnclosedCommandComment(Span { start: 12, end: 17 }) + )] +); + +parse_test!( + test_j_missing_whitespace, + b"$( $j usage 'exel' avoids 'ax-5' 'ax-7' 'ax-8' 'ax-9' 'ax-10' 'ax-11' + 'ax-12' 'ax-13''ax-ext' 'ax-sep' 'ax-nul' 'ax-pow'; $)", + [] +); + +macro_rules! parse_command_test { + ($name:ident, $text:expr, $commands:expr) => { + #[test] + fn $name() { + let commands = commands($text, b'j', 0); + let parsed = commands.and_then(|iter| { + Ok(iter + .map(|command| { + command.and_then(|(span, tokens)| { + Ok(( + span, + tokens + .into_iter() + .map(|token| as_str(&token.value($text)).to_owned()) + .collect::>(), + )) + }) + }) + .collect::>()) + }); + let expected = Ok($commands + .into_iter() + .map(|item| { + item.and_then(|(start, end, tokens)| { + Ok(( + Span::new(start, end), + tokens + .into_iter() + .map(|token| token.to_string()) + .collect::>(), + )) + }) + }) + .collect::>()); + assert_eq!(parsed, expected); + } + }; +} + +parse_command_test!( + test_command_valid, + b"$( $j usage 'exel' avoids 'ax-5' 'ax-7' 'ax-8' 'ax-9' 'ax-10' 'ax-11' + 'ax-12' 'ax-13' 'ax-ext' 'ax-sep' 'ax-nul' 'ax-pow'; $)", + [Ok(( + 6, + 129, + [ + "usage", "exel", "avoids", "ax-5", "ax-7", "ax-8", "ax-9", "ax-10", "ax-11", "ax-12", + "ax-13", "ax-ext", "ax-sep", "ax-nul", "ax-pow" + ] + ))] +); + +parse_command_test!( + test_command_comment, + b"$( $j usage /* comment, may contain ; * / and ' - ignored */ 'exel' avoids 'ax-5'; $)", + [Ok((6, 82, ["usage", "exel", "avoids", "ax-5"]))] +); + +parse_command_test!( + test_command_stop_at_end_tag, + b"$( $j usage 'exel' avoids 'ax-5'; $) Nothing is parsed here", + [Ok((6, 33, ["usage", "exel", "avoids", "ax-5"]))] +); + +type NoCommandType = [Result<(usize, usize, [&'static str; 0]), Diagnostic>; 0]; +const NO_COMMANDS: NoCommandType = []; +parse_command_test!(test_command_empty_list, b"$( $j $)", NO_COMMANDS); + +parse_command_test!( + test_command_more_commands, + b"$( $j + syntax 'wff'; + syntax '|-' as 'wff'; + unambiguous 'klr 5'; + $)", + [ + Ok((18, 31, vec!["syntax", "wff"])), + Ok((44, 65, vec!["syntax", "|-", "as", "wff"])), + Ok((78, 98, vec!["unambiguous", "klr 5"])) + ] +); + +parse_command_test!( + test_command_missing_semicolon, + b"$( $j usage 'exel' avoids 'ax-5' $)", + [Err::<(usize, usize, Vec<&str>), Diagnostic>( + Diagnostic::UnclosedCommand(Span::new(6, 35)) + )] +); + +parse_command_test!( + test_command_double_quote_escape, + b"$( $j abc \"de\"\"f\" 'ghi'; $)", + [Ok((6, 24, ["abc", "de\"f", "ghi"]))] +); + +parse_command_test!( + test_command_unclosed_comment, + b"$( $j usage /* unclosed comment", + [Err::<(usize, usize, Vec<&str>), Diagnostic>( + Diagnostic::UnclosedCommandComment(Span::new(12, 31)) + )] +); + +parse_command_test!( + test_command_missing_whitespace, + b"$( $j usage 'exel' avoids 'ax-5' 'ax-7' 'ax-8' 'ax-9' 'ax-10' 'ax-11' + 'ax-12' 'ax-13''ax-ext' 'ax-sep' 'ax-nul' 'ax-pow'; $)", + [Ok(( + 6, + 128, + [ + "usage", + "exel", + "avoids", + "ax-5", + "ax-7", + "ax-8", + "ax-9", + "ax-10", + "ax-11", + "ax-12", + "ax-13'ax-ext", + "ax-sep", + "ax-nul", + "ax-pow" + ] + ))] +); + +parse_command_test!( + test_command_missing_closing_quote, + b"$( $j usage 'exel' avoids 'ax-5' 'ax-7' 'ax-8' 'ax-9' 'ax-10 'ax-11' + 'ax-12' 'ax-13' 'ax-ext' 'ax-sep' 'ax-nul' 'ax-pow'; $)", + [ + Err::<(usize, usize, Vec<&str>), Diagnostic>(Diagnostic::MissingSpaceAfterCommandToken( + Span::new(55, 62) + )), + Ok(( + 62, + 128, + vec!["ax-11'", "ax-12", "ax-13", "ax-ext", "ax-sep", "ax-nul", "ax-pow"] + )) + ] +); diff --git a/metamath-rs/src/statement.rs b/metamath-rs/src/statement.rs index ada8e3b..33dbf90 100644 --- a/metamath-rs/src/statement.rs +++ b/metamath-rs/src/statement.rs @@ -362,7 +362,7 @@ impl CommandToken { let buf = span.as_ref(buf); if buf.contains("e) { let mut out = vec![]; - unescape(span.as_ref(buf), &mut out, |c| quote == c); + unescape(buf, &mut out, |c| quote == c); out.into() } else { Cow::Borrowed(buf)