Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement DEP 12, string literals supporting Rectangle Rule #1534

Merged
merged 1 commit into from
Oct 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 57 additions & 3 deletions sources/dfmc/reader/lexer.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -861,8 +861,9 @@ end method hex-escape-character;
define method decode-string
(source-location :: <lexer-source-location>, bpos :: <integer>,
epos :: <integer>, escapes? :: <boolean>)
=> (string :: <byte-string>)
=> (string :: <byte-string>, multi-line? :: <boolean>)
let contents = source-location.source-location-record.contents;
let multi-line? = #f;
local
method skip-hex-escape (pos)
// TODO(cgay): signal better error if '>' not found.
Expand Down Expand Up @@ -903,9 +904,11 @@ define method decode-string
loop(new-position, len + 1, #f, string);
end if;
as(<integer>, '\r') =>
multi-line? := #t;
string & (string[len] := '\n');
loop(pos + 1, len + 1, #t, string);
as(<integer>, '\n') =>
multi-line? := #t;
let increment = if (prev-was-cr?)
0 // already stored a LF
else
Expand All @@ -922,9 +925,56 @@ define method decode-string
let length = loop(bpos, 0, #f, #f);
let string = make(<string>, size: length);
loop(bpos, 0, #f, string);
string
values(string, multi-line?)
end method decode-string;

// https://opendylan.org/proposals/dep-0012-string-literals.html#the-rectangle-rule
//
// When this is called `string` is known to contain at least one literal newline
// character, the EOL sequence has already been canonicalized to just '\n', escape
// sequences have been processed, and the start/end delimiters have been removed.
define function trim-multi-line-prefix
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are probably efficiency improvements that could be made to this function, but I guess it won't be a bottleneck in compiling most programs so it should be fine.

(string :: <string>) => (maybe-trimmed :: <string>)
let lines = split(string, '\n');
let junk = first(lines);
let prefix = last(lines);
if (~empty?(junk) & ~whitespace?(junk))
error("invalid multi-line string literal - only whitespace may"
" follow the start delimiter \"\"\" on the same line");
end;
if (~empty?(prefix) & ~whitespace?(prefix))
error("invalid multi-line string literal - only whitespace may"
" precede the end delimiter \"\"\" on the same line");
end;
local method remove-prefix (line)
if (line = "")
line
elseif (~starts-with?(line, prefix))
error("invalid multi-line string literal - each line must begin"
" with the same whitespace that precedes the end"
" delimiter (got %=, want %=)",
copy-sequence(line, end: prefix.size), prefix);
else
copy-sequence(line, start: prefix.size)
end
end method;
select (lines.size)
1 => error("compiler bug while trimming multi-line string prefix");
2 => "";
otherwise =>
let keep = copy-sequence(lines, start: 1, end: lines.size - 1);
let trimmed = map(remove-prefix, keep);
if (every?(empty?, trimmed))
// If all lines are empty the last line needs to be handled specially because of
// the exceptional case of ``abc\n"""`` (where we don't want the final newline)
// vs ``\n\n"""`` (where we do want the final newline).
join(concatenate(trimmed, #("")), "\n")
else
join(trimmed, "\n")
end
end select
end function;

// Make a <literal-token> when confronted with the #"foo" syntax.
// These are referred to as "unique strings" in the DRM Lexical Syntax.
//
Expand Down Expand Up @@ -1119,7 +1169,11 @@ define method %make-string-literal
=> (res :: <string-fragment>)
let bpos = source-location.start-posn + start-offset;
let epos = source-location.end-posn - end-offset;
let string = decode-string(source-location, bpos, epos, allow-escapes?);
let (string, multi-line?)
= decode-string(source-location, bpos, epos, allow-escapes?);
if (multi-line?)
string := trim-multi-line-prefix(string);
end;
make(<string-fragment>,
record: source-location.source-location-record,
source-position: source-location.source-location-source-position,
Expand Down
2 changes: 2 additions & 0 deletions sources/dfmc/reader/reader-library.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ define library dfmc-reader
use dfmc-common;
use dfmc-conditions;
use source-records;
use strings;
export dfmc-reader;
end library dfmc-reader;

Expand All @@ -29,6 +30,7 @@ define module dfmc-reader
use dfmc-imports;
use dfmc-conditions;
use source-records;
use strings;

//// Token classes used externally.

Expand Down
240 changes: 213 additions & 27 deletions sources/dfmc/reader/tests/literal-test-suite.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -224,39 +224,224 @@ define test string-literal-test ()
assert-signals(<invalid-token>, read-fragment(#:string:{"\1<b>"}));
end test;

define test string-literal-multi-line-test ()
let f = read-fragment(#:string:{""""""});
verify-literal(f, "", <string-fragment>);
// Make sure the reader didn't stop at the first pair of double quotes...
let source = source-location-string(fragment-source-location(f));
assert-equal(#:string:{""""""}, source);
// verify multi-line string
define function verify-mls
(name, source, want)
assert-no-errors(read-fragment(source), "%s - parses without error", name);
let frag = read-fragment(source);
assert-instance?(<string-fragment>, frag, "%s - is string fragment", name);
assert-equal(frag.fragment-value, want, "%s - has expected value", name);
end function;

verify-literal(read-fragment(#:string:{"""abc"""}), "abc", <string-fragment>);
verify-literal(read-fragment(#:string:{"""a\nc"""}), "a\nc", <string-fragment>);
define test string-literal-one-line-test ()
verify-mls("empty string", #:string:{""""""}, "");

// EOL canonicalization
verify-literal(read-fragment("\"\"\"a\nc\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"a\r\nc\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"a\rc\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"a\n\rc\"\"\""), "a\n\nc", <string-fragment>);
// Make sure the reader didn't stop at the first pair of double quotes...
let empty-string-fragment = read-fragment(#:string:{""""""});
assert-equal(#:string:{""""""},
source-location-string(fragment-source-location(empty-string-fragment)),
"entire empty string consumed");

verify-mls("simple abc",
#:string:{"""abc"""}, "abc");
verify-mls("abc with spaces",
#:string:{""" abc """}, " abc ");
end test;

define test string-literal-multi-line-test ()
verify-mls("multi-line empty string, no prefix",
#:string:{"""
"""},
"");
verify-mls("multi-line empty string, with prefix",
#:string:{"""
"""},
"");
verify-mls("multi-line one blank line, no prefix",
#:string:{"""

"""},
"\n");
verify-mls("leading whitespace relative to end delim retained",
#:string:{"""
abc
def
"""},
" abc\ndef");
verify-mls("end delim to right of start delim",
#:string:{"""
abc
def
"""},
" abc\ndef");
verify-mls("whitespace on first line ignored?", // 0x20 = space
#:string:{"""\<20>\<20>
abc
def
"""},
" abc\ndef");
// The first blank line below is truly empty and the second one has only the prefix
// (written as \<20> to avoid editors removing trailing whitespace).
verify-mls("blank lines retained",
#:string:{"""

def
\<20>\<20>\<20>
"""},
"\ndef\n");
assert-signals(<error>,
read-fragment(#:string:{"""a (only whitespace allowed after start delim)
abc
"""}),
"junk on first line");
assert-signals(<error>,
read-fragment(#:string:{"""
abc
xxx"""}),
"junk on last line");
assert-signals(<error>,
read-fragment(#:string:{"""
abc
xxx (this line not indented enough)
"""}),
"prefix mismatch non-white");
// Prefix should be " " but one line has a literal tab in prefix.
/* TODO: the literal tab causes a failure due (I presume) to
https://github.com/dylan-lang/opendylan/issues/425
check-condition("prefix mismatch whitespace",
<error>,
read-fragment("\"\"\"\n aaa\n \t bbb\n \"\"\""));
*/

// Check that CRLF and CR are converted to LF.
verify-mls("eol canonicalized 1",
"\"\"\"\na\r\nc\n\"\"\"",
"a\nc");
verify-mls("eol canonicalized 2",
"\"\"\"\na\rc\n\"\"\"",
"a\nc");
verify-mls("eol canonicalized 3",
"\"\"\"\r\na\n\rc\r\n\"\"\"",
"a\n\nc");

let char = curry(as, <character>);
// One of every escape sequence. "\a\b\e\f\n\r\t\0\'\"\\"
verify-literal(read-fragment(#:string:{"""\a\b\e\f\n\r\t\0\'\"\\"""}),
map-as(<string>, char, #('\a', '\b', '\e', '\f', '\n', '\r',
'\t', '\0', '\'', '\"', '\\')),
<string-fragment>);
// Basic hex escaping.
verify-literal(read-fragment(#:string:{"""z\<9f>z"""}),
map-as(<string>, char, #('z', #x9f, 'z')),
<string-fragment>);
verify-mls("all escape sequences",
#:string:{"""\a\b\e\f\n\r\t\0\'\"\\"""},
map-as(<string>, char,
#('\a', '\b', '\e', '\f', '\n', '\r', '\t', '\0', '\'', '\"', '\\')));
verify-mls("basic hex escaping",
#:string:{"""z\<9f>z"""},
map-as(<string>, char, #('z', #x9f, 'z')));
// We can't handle character codes > 255 yet, but the leading zeros shouldn't
// confuse the reader.
verify-literal(read-fragment(#:string:{"""z\<009f>z"""}),
map-as(<string>, char, #('z', #x9f, 'z')),
<string-fragment>);

assert-signals(<invalid-token>, read-fragment(#:string:{"""\1<b>"""}));
verify-mls("hex escape with leading zeros",
#:string:{"""z\<009f>z"""},
map-as(<string>, char, #('z', #x9f, 'z')));

assert-signals(<invalid-token>,
read-fragment(#:string:{"""\1<b>"""}),
"invalid escape sequence");

verify-mls("one line",
#:string:{
"""
abc
"""},
"abc");
verify-mls("one line with prefix",
#:string:{
"""
abc
"""},
"abc");
verify-mls("two lines",
#:string:{
"""
abc
def
"""},
"abc\ndef");
verify-mls("two lines with prefix",
#:string:{
"""
abc
def
"""},
"abc\ndef");
verify-mls("empty line at start",
#:string:{
"""

abc
"""},
"\nabc");
verify-mls("two empty lines at start",
#:string:{
"""


abc
"""},
"\n\nabc");
verify-mls("one empty line",
#:string:{
"""

"""},
"\n");
verify-mls("one empty line with prefix",
#:string:{
"""
\<20>\<20>
"""},
"\n");
verify-mls("empty line at end",
#:string:{
"""
abc

"""},
"abc\n");
verify-mls("two empty lines at end",
#:string:{
"""
abc


"""},
"abc\n\n");
verify-mls("empty lines at start and end",
#:string:{
"""

abc

"""},
"\nabc\n");
verify-mls("two empty lines",
#:string:{
"""


"""},
"\n\n");
verify-mls("three empty lines",
#:string:{
"""



"""},
"\n\n\n");
verify-mls("three empty lines at end",
#:string:{
"""
abc



"""},
"abc\n\n\n");
end test;

define test string-literal-raw-one-line-test ()
Expand Down Expand Up @@ -362,6 +547,7 @@ define suite literal-test-suite ()
test pair-literal-test;
test ratio-literal-test;
test string-literal-multi-line-test;
test string-literal-one-line-test;
test string-literal-raw-multi-line-test;
test string-literal-raw-one-line-test;
test string-literal-test;
Expand Down