Skip to content

Commit

Permalink
Implement DEP 12, string literals supporting Rectangle Rule
Browse files Browse the repository at this point in the history
  • Loading branch information
cgay committed Sep 24, 2023
1 parent a63ef63 commit 3b11f5e
Show file tree
Hide file tree
Showing 3 changed files with 272 additions and 30 deletions.
60 changes: 57 additions & 3 deletions sources/dfmc/reader/lexer.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -861,8 +861,9 @@ end method hex-escape-character;
define method decode-string
(source-location :: <lexer-source-location>, bpos :: <integer>,
epos :: <integer>, escapes? :: <boolean>)
=> (string :: <byte-string>)
=> (string :: <byte-string>, multi-line? :: <boolean>)
let contents = source-location.source-location-record.contents;
let multi-line? = #f;
local
method skip-hex-escape (pos)
// TODO(cgay): signal better error if '>' not found.
Expand Down Expand Up @@ -903,9 +904,11 @@ define method decode-string
loop(new-position, len + 1, #f, string);
end if;
as(<integer>, '\r') =>
multi-line? := #t;
string & (string[len] := '\n');
loop(pos + 1, len + 1, #t, string);
as(<integer>, '\n') =>
multi-line? := #t;
let increment = if (prev-was-cr?)
0 // already stored a LF
else
Expand All @@ -922,9 +925,56 @@ define method decode-string
let length = loop(bpos, 0, #f, #f);
let string = make(<string>, size: length);
loop(bpos, 0, #f, string);
string
values(string, multi-line?)
end method decode-string;

// https://opendylan.org/proposals/dep-0012-string-literals.html#the-rectangle-rule
//
// When this is called `string` is known to contain at least one literal newline
// character, the EOL sequence has already been canonicalized to just '\n', escape
// sequences have been processed, and the start/end delimiters have been removed.
define function trim-multi-line-prefix
(string :: <string>) => (maybe-trimmed :: <string>)
let lines = split(string, '\n');
let junk = first(lines);
let prefix = last(lines);
if (~empty?(junk) & ~whitespace?(junk))
error("invalid multi-line string literal - only whitespace may"
" follow the start delimiter \"\"\" on the same line");
end;
if (~empty?(prefix) & ~whitespace?(prefix))
error("invalid multi-line string literal - only whitespace may"
" precede the end delimiter \"\"\" on the same line");
end;
local method remove-prefix (line)
if (line = "")
line
elseif (~starts-with?(line, prefix))
error("invalid multi-line string literal - each line must begin"
" with the same whitespace that precedes the end"
" delimiter (got %=, want %=)",
copy-sequence(line, end: prefix.size), prefix);
else
copy-sequence(line, start: prefix.size)
end
end method;
select (lines.size)
1 => error("compiler bug while trimming multi-line string prefix");
2 => "";
otherwise =>
let keep = copy-sequence(lines, start: 1, end: lines.size - 1);
let trimmed = map(remove-prefix, keep);
if (every?(empty?, trimmed))
// If all lines are empty the last line needs to be handled specially because of
// the exceptional case of ``abc\n"""`` (where we don't want the final newline)
// vs ``\n\n"""`` (where we do want the final newline).
join(concatenate(trimmed, #("")), "\n")
else
join(trimmed, "\n")
end
end select
end function;

// Make a <literal-token> when confronted with the #"foo" syntax.
// These are referred to as "unique strings" in the DRM Lexical Syntax.
//
Expand Down Expand Up @@ -1119,7 +1169,11 @@ define method %make-string-literal
=> (res :: <string-fragment>)
let bpos = source-location.start-posn + start-offset;
let epos = source-location.end-posn - end-offset;
let string = decode-string(source-location, bpos, epos, allow-escapes?);
let (string, multi-line?)
= decode-string(source-location, bpos, epos, allow-escapes?);
if (multi-line?)
string := trim-multi-line-prefix(string);
end;
make(<string-fragment>,
record: source-location.source-location-record,
source-position: source-location.source-location-source-position,
Expand Down
2 changes: 2 additions & 0 deletions sources/dfmc/reader/reader-library.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ define library dfmc-reader
use dfmc-common;
use dfmc-conditions;
use source-records;
use strings;
export dfmc-reader;
end library dfmc-reader;

Expand All @@ -29,6 +30,7 @@ define module dfmc-reader
use dfmc-imports;
use dfmc-conditions;
use source-records;
use strings;

//// Token classes used externally.

Expand Down
240 changes: 213 additions & 27 deletions sources/dfmc/reader/tests/literal-test-suite.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -224,39 +224,224 @@ define test string-literal-test ()
assert-signals(<invalid-token>, read-fragment(#:string:{"\1<b>"}));
end test;

define test string-literal-multi-line-test ()
let f = read-fragment(#:string:{""""""});
verify-literal(f, "", <string-fragment>);
// Make sure the reader didn't stop at the first pair of double quotes...
let source = source-location-string(fragment-source-location(f));
assert-equal(#:string:{""""""}, source);
// verify multi-line string
define function verify-mls
(name, source, want)
assert-no-errors(read-fragment(source), "%s - parses without error", name);
let frag = read-fragment(source);
assert-instance?(<string-fragment>, frag, "%s - is string fragment", name);
assert-equal(frag.fragment-value, want, "%s - has expected value", name);
end function;

verify-literal(read-fragment(#:string:{"""abc"""}), "abc", <string-fragment>);
verify-literal(read-fragment(#:string:{"""a\nc"""}), "a\nc", <string-fragment>);
define test string-literal-one-line-test ()
verify-mls("empty string", #:string:{""""""}, "");

// EOL canonicalization
verify-literal(read-fragment("\"\"\"a\nc\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"a\r\nc\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"a\rc\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"a\n\rc\"\"\""), "a\n\nc", <string-fragment>);
// Make sure the reader didn't stop at the first pair of double quotes...
let empty-string-fragment = read-fragment(#:string:{""""""});
assert-equal(#:string:{""""""},
source-location-string(fragment-source-location(empty-string-fragment)),
"entire empty string consumed");

verify-mls("simple abc",
#:string:{"""abc"""}, "abc");
verify-mls("abc with spaces",
#:string:{""" abc """}, " abc ");
end test;

define test string-literal-multi-line-test ()
verify-mls("multi-line empty string, no prefix",
#:string:{"""
"""},
"");
verify-mls("multi-line empty string, with prefix",
#:string:{"""
"""},
"");
verify-mls("multi-line one blank line, no prefix",
#:string:{"""
"""},
"\n");
verify-mls("leading whitespace relative to end delim retained",
#:string:{"""
abc
def
"""},
" abc\ndef");
verify-mls("end delim to right of start delim",
#:string:{"""
abc
def
"""},
" abc\ndef");
verify-mls("whitespace on first line ignored?", // 0x20 = space
#:string:{"""\<20>\<20>
abc
def
"""},
" abc\ndef");
// The first blank line below is truly empty and the second one has only the prefix
// (written as \<20> to avoid editors removing trailing whitespace).
verify-mls("blank lines retained",
#:string:{"""
def
\<20>\<20>\<20>
"""},
"\ndef\n");
assert-signals(<error>,
read-fragment(#:string:{"""a (only whitespace allowed after start delim)
abc
"""}),
"junk on first line");
assert-signals(<error>,
read-fragment(#:string:{"""
abc
xxx"""}),
"junk on last line");
assert-signals(<error>,
read-fragment(#:string:{"""
abc
xxx (this line not indented enough)
"""}),
"prefix mismatch non-white");
// Prefix should be " " but one line has a literal tab in prefix.
/* TODO: the literal tab causes a failure due (I presume) to
https://github.com/dylan-lang/opendylan/issues/425
check-condition("prefix mismatch whitespace",
<error>,
read-fragment("\"\"\"\n aaa\n \t bbb\n \"\"\""));
*/

// Check that CRLF and CR are converted to LF.
verify-mls("eol canonicalized 1",
"\"\"\"\na\r\nc\n\"\"\"",
"a\nc");
verify-mls("eol canonicalized 2",
"\"\"\"\na\rc\n\"\"\"",
"a\nc");
verify-mls("eol canonicalized 3",
"\"\"\"\r\na\n\rc\r\n\"\"\"",
"a\n\nc");

let char = curry(as, <character>);
// One of every escape sequence. "\a\b\e\f\n\r\t\0\'\"\\"
verify-literal(read-fragment(#:string:{"""\a\b\e\f\n\r\t\0\'\"\\"""}),
map-as(<string>, char, #('\a', '\b', '\e', '\f', '\n', '\r',
'\t', '\0', '\'', '\"', '\\')),
<string-fragment>);
// Basic hex escaping.
verify-literal(read-fragment(#:string:{"""z\<9f>z"""}),
map-as(<string>, char, #('z', #x9f, 'z')),
<string-fragment>);
verify-mls("all escape sequences",
#:string:{"""\a\b\e\f\n\r\t\0\'\"\\"""},
map-as(<string>, char,
#('\a', '\b', '\e', '\f', '\n', '\r', '\t', '\0', '\'', '\"', '\\')));
verify-mls("basic hex escaping",
#:string:{"""z\<9f>z"""},
map-as(<string>, char, #('z', #x9f, 'z')));
// We can't handle character codes > 255 yet, but the leading zeros shouldn't
// confuse the reader.
verify-literal(read-fragment(#:string:{"""z\<009f>z"""}),
map-as(<string>, char, #('z', #x9f, 'z')),
<string-fragment>);

assert-signals(<invalid-token>, read-fragment(#:string:{"""\1<b>"""}));
verify-mls("hex escape with leading zeros",
#:string:{"""z\<009f>z"""},
map-as(<string>, char, #('z', #x9f, 'z')));

assert-signals(<invalid-token>,
read-fragment(#:string:{"""\1<b>"""}),
"invalid escape sequence");

verify-mls("one line",
#:string:{
"""
abc
"""},
"abc");
verify-mls("one line with prefix",
#:string:{
"""
abc
"""},
"abc");
verify-mls("two lines",
#:string:{
"""
abc
def
"""},
"abc\ndef");
verify-mls("two lines with prefix",
#:string:{
"""
abc
def
"""},
"abc\ndef");
verify-mls("empty line at start",
#:string:{
"""
abc
"""},
"\nabc");
verify-mls("two empty lines at start",
#:string:{
"""
abc
"""},
"\n\nabc");
verify-mls("one empty line",
#:string:{
"""
"""},
"\n");
verify-mls("one empty line with prefix",
#:string:{
"""
\<20>\<20>
"""},
"\n");
verify-mls("empty line at end",
#:string:{
"""
abc
"""},
"abc\n");
verify-mls("two empty lines at end",
#:string:{
"""
abc
"""},
"abc\n\n");
verify-mls("empty lines at start and end",
#:string:{
"""
abc
"""},
"\nabc\n");
verify-mls("two empty lines",
#:string:{
"""
"""},
"\n\n");
verify-mls("three empty lines",
#:string:{
"""
"""},
"\n\n\n");
verify-mls("three empty lines at end",
#:string:{
"""
abc
"""},
"abc\n\n\n");
end test;

define test string-literal-raw-one-line-test ()
Expand Down Expand Up @@ -362,6 +547,7 @@ define suite literal-test-suite ()
test pair-literal-test;
test ratio-literal-test;
test string-literal-multi-line-test;
test string-literal-one-line-test;
test string-literal-raw-multi-line-test;
test string-literal-raw-one-line-test;
test string-literal-test;
Expand Down

0 comments on commit 3b11f5e

Please sign in to comment.