diff --git a/sources/dfmc/reader/lexer.dylan b/sources/dfmc/reader/lexer.dylan index ed5ddd886..eb96b2cce 100644 --- a/sources/dfmc/reader/lexer.dylan +++ b/sources/dfmc/reader/lexer.dylan @@ -861,8 +861,9 @@ end method hex-escape-character; define method decode-string (source-location :: , bpos :: , epos :: , escapes? :: ) - => (string :: ) + => (string :: , multi-line? :: ) let contents = source-location.source-location-record.contents; + let multi-line? = #f; local method skip-hex-escape (pos) // TODO(cgay): signal better error if '>' not found. @@ -903,9 +904,11 @@ define method decode-string loop(new-position, len + 1, #f, string); end if; as(, '\r') => + multi-line? := #t; string & (string[len] := '\n'); loop(pos + 1, len + 1, #t, string); as(, '\n') => + multi-line? := #t; let increment = if (prev-was-cr?) 0 // already stored a LF else @@ -922,9 +925,56 @@ define method decode-string let length = loop(bpos, 0, #f, #f); let string = make(, size: length); loop(bpos, 0, #f, string); - string + values(string, multi-line?) end method decode-string; +// https://opendylan.org/proposals/dep-0012-string-literals.html#the-rectangle-rule +// +// When this is called `string` is known to contain at least one literal newline +// character, the EOL sequence has already been canonicalized to just '\n', escape +// sequences have been processed, and the start/end delimiters have been removed. +define function trim-multi-line-prefix + (string :: ) => (maybe-trimmed :: ) + let lines = split(string, '\n'); + let junk = first(lines); + let prefix = last(lines); + if (~empty?(junk) & ~whitespace?(junk)) + error("invalid multi-line string literal - only whitespace may" + " follow the start delimiter \"\"\" on the same line"); + end; + if (~empty?(prefix) & ~whitespace?(prefix)) + error("invalid multi-line string literal - only whitespace may" + " precede the end delimiter \"\"\" on the same line"); + end; + local method remove-prefix (line) + if (line = "") + line + elseif (~starts-with?(line, prefix)) + error("invalid multi-line string literal - each line must begin" + " with the same whitespace that precedes the end" + " delimiter (got %=, want %=)", + copy-sequence(line, end: prefix.size), prefix); + else + copy-sequence(line, start: prefix.size) + end + end method; + select (lines.size) + 1 => error("compiler bug while trimming multi-line string prefix"); + 2 => ""; + otherwise => + let keep = copy-sequence(lines, start: 1, end: lines.size - 1); + let trimmed = map(remove-prefix, keep); + if (every?(empty?, trimmed)) + // If all lines are empty the last line needs to be handled specially because of + // the exceptional case of ``abc\n"""`` (where we don't want the final newline) + // vs ``\n\n"""`` (where we do want the final newline). + join(concatenate(trimmed, #("")), "\n") + else + join(trimmed, "\n") + end + end select +end function; + // Make a when confronted with the #"foo" syntax. // These are referred to as "unique strings" in the DRM Lexical Syntax. // @@ -1119,7 +1169,11 @@ define method %make-string-literal => (res :: ) let bpos = source-location.start-posn + start-offset; let epos = source-location.end-posn - end-offset; - let string = decode-string(source-location, bpos, epos, allow-escapes?); + let (string, multi-line?) + = decode-string(source-location, bpos, epos, allow-escapes?); + if (multi-line?) + string := trim-multi-line-prefix(string); + end; make(, record: source-location.source-location-record, source-position: source-location.source-location-source-position, diff --git a/sources/dfmc/reader/reader-library.dylan b/sources/dfmc/reader/reader-library.dylan index 0fe2bb88e..902cd686c 100644 --- a/sources/dfmc/reader/reader-library.dylan +++ b/sources/dfmc/reader/reader-library.dylan @@ -12,6 +12,7 @@ define library dfmc-reader use dfmc-common; use dfmc-conditions; use source-records; + use strings; export dfmc-reader; end library dfmc-reader; @@ -29,6 +30,7 @@ define module dfmc-reader use dfmc-imports; use dfmc-conditions; use source-records; + use strings; //// Token classes used externally. diff --git a/sources/dfmc/reader/tests/literal-test-suite.dylan b/sources/dfmc/reader/tests/literal-test-suite.dylan index 375a7b32e..7fdc84ca5 100644 --- a/sources/dfmc/reader/tests/literal-test-suite.dylan +++ b/sources/dfmc/reader/tests/literal-test-suite.dylan @@ -224,39 +224,224 @@ define test string-literal-test () assert-signals(, read-fragment(#:string:{"\1"})); end test; -define test string-literal-multi-line-test () - let f = read-fragment(#:string:{""""""}); - verify-literal(f, "", ); - // Make sure the reader didn't stop at the first pair of double quotes... - let source = source-location-string(fragment-source-location(f)); - assert-equal(#:string:{""""""}, source); +// verify multi-line string +define function verify-mls + (name, source, want) + assert-no-errors(read-fragment(source), "%s - parses without error", name); + let frag = read-fragment(source); + assert-instance?(, frag, "%s - is string fragment", name); + assert-equal(frag.fragment-value, want, "%s - has expected value", name); +end function; - verify-literal(read-fragment(#:string:{"""abc"""}), "abc", ); - verify-literal(read-fragment(#:string:{"""a\nc"""}), "a\nc", ); +define test string-literal-one-line-test () + verify-mls("empty string", #:string:{""""""}, ""); - // EOL canonicalization - verify-literal(read-fragment("\"\"\"a\nc\"\"\""), "a\nc", ); - verify-literal(read-fragment("\"\"\"a\r\nc\"\"\""), "a\nc", ); - verify-literal(read-fragment("\"\"\"a\rc\"\"\""), "a\nc", ); - verify-literal(read-fragment("\"\"\"a\n\rc\"\"\""), "a\n\nc", ); + // Make sure the reader didn't stop at the first pair of double quotes... + let empty-string-fragment = read-fragment(#:string:{""""""}); + assert-equal(#:string:{""""""}, + source-location-string(fragment-source-location(empty-string-fragment)), + "entire empty string consumed"); + + verify-mls("simple abc", + #:string:{"""abc"""}, "abc"); + verify-mls("abc with spaces", + #:string:{""" abc """}, " abc "); +end test; + +define test string-literal-multi-line-test () + verify-mls("multi-line empty string, no prefix", + #:string:{""" +"""}, + ""); + verify-mls("multi-line empty string, with prefix", + #:string:{""" +"""}, + ""); + verify-mls("multi-line one blank line, no prefix", + #:string:{""" + +"""}, + "\n"); + verify-mls("leading whitespace relative to end delim retained", + #:string:{""" + abc +def +"""}, + " abc\ndef"); + verify-mls("end delim to right of start delim", + #:string:{""" + abc + def + """}, + " abc\ndef"); + verify-mls("whitespace on first line ignored?", // 0x20 = space + #:string:{"""\<20>\<20> + abc +def +"""}, + " abc\ndef"); + // The first blank line below is truly empty and the second one has only the prefix + // (written as \<20> to avoid editors removing trailing whitespace). + verify-mls("blank lines retained", + #:string:{""" + + def +\<20>\<20>\<20> + """}, + "\ndef\n"); + assert-signals(, + read-fragment(#:string:{"""a (only whitespace allowed after start delim) +abc +"""}), + "junk on first line"); + assert-signals(, + read-fragment(#:string:{""" +abc +xxx"""}), + "junk on last line"); + assert-signals(, + read-fragment(#:string:{""" + abc + xxx (this line not indented enough) + """}), + "prefix mismatch non-white"); + // Prefix should be " " but one line has a literal tab in prefix. + /* TODO: the literal tab causes a failure due (I presume) to + https://github.com/dylan-lang/opendylan/issues/425 + check-condition("prefix mismatch whitespace", + , + read-fragment("\"\"\"\n aaa\n \t bbb\n \"\"\"")); + */ + + // Check that CRLF and CR are converted to LF. + verify-mls("eol canonicalized 1", + "\"\"\"\na\r\nc\n\"\"\"", + "a\nc"); + verify-mls("eol canonicalized 2", + "\"\"\"\na\rc\n\"\"\"", + "a\nc"); + verify-mls("eol canonicalized 3", + "\"\"\"\r\na\n\rc\r\n\"\"\"", + "a\n\nc"); let char = curry(as, ); - // One of every escape sequence. "\a\b\e\f\n\r\t\0\'\"\\" - verify-literal(read-fragment(#:string:{"""\a\b\e\f\n\r\t\0\'\"\\"""}), - map-as(, char, #('\a', '\b', '\e', '\f', '\n', '\r', - '\t', '\0', '\'', '\"', '\\')), - ); - // Basic hex escaping. - verify-literal(read-fragment(#:string:{"""z\<9f>z"""}), - map-as(, char, #('z', #x9f, 'z')), - ); + verify-mls("all escape sequences", + #:string:{"""\a\b\e\f\n\r\t\0\'\"\\"""}, + map-as(, char, + #('\a', '\b', '\e', '\f', '\n', '\r', '\t', '\0', '\'', '\"', '\\'))); + verify-mls("basic hex escaping", + #:string:{"""z\<9f>z"""}, + map-as(, char, #('z', #x9f, 'z'))); // We can't handle character codes > 255 yet, but the leading zeros shouldn't // confuse the reader. - verify-literal(read-fragment(#:string:{"""z\<009f>z"""}), - map-as(, char, #('z', #x9f, 'z')), - ); - - assert-signals(, read-fragment(#:string:{"""\1"""})); + verify-mls("hex escape with leading zeros", + #:string:{"""z\<009f>z"""}, + map-as(, char, #('z', #x9f, 'z'))); + + assert-signals(, + read-fragment(#:string:{"""\1"""}), + "invalid escape sequence"); + + verify-mls("one line", + #:string:{ +""" +abc +"""}, + "abc"); + verify-mls("one line with prefix", + #:string:{ + """ + abc + """}, + "abc"); + verify-mls("two lines", + #:string:{ +""" +abc +def +"""}, + "abc\ndef"); + verify-mls("two lines with prefix", + #:string:{ + """ + abc + def + """}, + "abc\ndef"); + verify-mls("empty line at start", + #:string:{ +""" + +abc +"""}, + "\nabc"); + verify-mls("two empty lines at start", + #:string:{ +""" + + +abc +"""}, + "\n\nabc"); + verify-mls("one empty line", + #:string:{ +""" + +"""}, + "\n"); + verify-mls("one empty line with prefix", + #:string:{ + """ +\<20>\<20> + """}, + "\n"); + verify-mls("empty line at end", + #:string:{ +""" +abc + +"""}, + "abc\n"); + verify-mls("two empty lines at end", + #:string:{ +""" +abc + + +"""}, + "abc\n\n"); + verify-mls("empty lines at start and end", + #:string:{ +""" + +abc + +"""}, + "\nabc\n"); + verify-mls("two empty lines", + #:string:{ +""" + + +"""}, + "\n\n"); + verify-mls("three empty lines", + #:string:{ +""" + + + +"""}, + "\n\n\n"); + verify-mls("three empty lines at end", + #:string:{ +""" +abc + + + +"""}, + "abc\n\n\n"); end test; define test string-literal-raw-one-line-test () @@ -362,6 +547,7 @@ define suite literal-test-suite () test pair-literal-test; test ratio-literal-test; test string-literal-multi-line-test; + test string-literal-one-line-test; test string-literal-raw-multi-line-test; test string-literal-raw-one-line-test; test string-literal-test;