dylan-lang · cgay · Oct 1, 2023 · Sep 15, 2023 · housel · Oct 1, 2023
diff --git a/sources/dfmc/reader/lexer.dylan b/sources/dfmc/reader/lexer.dylan
@@ -861,8 +861,9 @@ end method hex-escape-character;
 define method decode-string
     (source-location :: <lexer-source-location>, bpos :: <integer>,
      epos :: <integer>, escapes? :: <boolean>)
- => (string :: <byte-string>)
+ => (string :: <byte-string>, multi-line? :: <boolean>)
   let contents = source-location.source-location-record.contents;
+  let multi-line? = #f;
   local
     method skip-hex-escape (pos)
       // TODO(cgay): signal better error if '>' not found.
@@ -903,9 +904,11 @@ define method decode-string
               loop(new-position, len + 1, #f, string);
             end if;
           as(<integer>, '\r') =>
+            multi-line? := #t;
             string & (string[len] := '\n');
             loop(pos + 1, len + 1, #t, string);
           as(<integer>, '\n') =>
+            multi-line? := #t;
             let increment = if (prev-was-cr?)
                               0 // already stored a LF
                             else
@@ -922,9 +925,56 @@ define method decode-string
   let length = loop(bpos, 0, #f, #f);
   let string = make(<string>, size: length);
   loop(bpos, 0, #f, string);
-  string
+  values(string, multi-line?)
 end method decode-string;
 
+// https://opendylan.org/proposals/dep-0012-string-literals.html#the-rectangle-rule
+//
+// When this is called `string` is known to contain at least one literal newline
+// character, the EOL sequence has already been canonicalized to just '\n', escape
+// sequences have been processed, and the start/end delimiters have been removed.
+define function trim-multi-line-prefix
+    (string :: <string>) => (maybe-trimmed :: <string>)
+  let lines = split(string, '\n');
+  let junk = first(lines);
+  let prefix = last(lines);
+  if (~empty?(junk) & ~whitespace?(junk))
+    error("invalid multi-line string literal - only whitespace may"
+            " follow the start delimiter \"\"\" on the same line");
+  end;
+  if (~empty?(prefix) & ~whitespace?(prefix))
+    error("invalid multi-line string literal - only whitespace may"
+            " precede the end delimiter \"\"\" on the same line");
+  end;
+  local method remove-prefix (line)
+          if (line = "")
+            line
+          elseif (~starts-with?(line, prefix))
+            error("invalid multi-line string literal - each line must begin"
+                    " with the same whitespace that precedes the end"
+                    " delimiter (got %=, want %=)",
+                  copy-sequence(line, end: prefix.size), prefix);
+          else
+            copy-sequence(line, start: prefix.size)
+          end
+        end method;
+  select (lines.size)
+    1 => error("compiler bug while trimming multi-line string prefix");
+    2 => "";
+    otherwise =>
+      let keep = copy-sequence(lines, start: 1, end: lines.size - 1);
+      let trimmed = map(remove-prefix, keep);
+      if (every?(empty?, trimmed))
+        // If all lines are empty the last line needs to be handled specially because of
+        // the exceptional case of ``abc\n"""`` (where we don't want the final newline)
+        // vs ``\n\n"""`` (where we do want the final newline).
+        join(concatenate(trimmed, #("")), "\n")
+      else
+        join(trimmed, "\n")
+      end
+  end select
+end function;
+
 // Make a <literal-token> when confronted with the #"foo" syntax.
 // These are referred to as "unique strings" in the DRM Lexical Syntax.
 //
@@ -1119,7 +1169,11 @@ define method %make-string-literal
  => (res :: <string-fragment>)
   let bpos = source-location.start-posn + start-offset;
   let epos = source-location.end-posn - end-offset;
-  let string = decode-string(source-location, bpos, epos, allow-escapes?);
+  let (string, multi-line?)
+    = decode-string(source-location, bpos, epos, allow-escapes?);
+  if (multi-line?)
+    string := trim-multi-line-prefix(string);
+  end;
   make(<string-fragment>,
        record: source-location.source-location-record,
        source-position: source-location.source-location-source-position,

diff --git a/sources/dfmc/reader/reader-library.dylan b/sources/dfmc/reader/reader-library.dylan
@@ -12,6 +12,7 @@ define library dfmc-reader
   use dfmc-common;
   use dfmc-conditions;
   use source-records;
+  use strings;
   export dfmc-reader;
 end library dfmc-reader;
 
@@ -29,6 +30,7 @@ define module dfmc-reader
   use dfmc-imports;
   use dfmc-conditions;
   use source-records;
+  use strings;
 
   //// Token classes used externally.
 

diff --git a/sources/dfmc/reader/tests/literal-test-suite.dylan b/sources/dfmc/reader/tests/literal-test-suite.dylan
@@ -224,39 +224,224 @@ define test string-literal-test ()
   assert-signals(<invalid-token>, read-fragment(#:string:{"\1<b>"}));
 end test;
 
-define test string-literal-multi-line-test ()
-  let f = read-fragment(#:string:{""""""});
-  verify-literal(f, "", <string-fragment>);
-  // Make sure the reader didn't stop at the first pair of double quotes...
-  let source = source-location-string(fragment-source-location(f));
-  assert-equal(#:string:{""""""}, source);
+// verify multi-line string
+define function verify-mls
+    (name, source, want)
+  assert-no-errors(read-fragment(source), "%s - parses without error", name);
+  let frag = read-fragment(source);
+  assert-instance?(<string-fragment>, frag, "%s - is string fragment", name);
+  assert-equal(frag.fragment-value, want, "%s - has expected value", name);
+end function;
 
-  verify-literal(read-fragment(#:string:{"""abc"""}),  "abc", <string-fragment>);
-  verify-literal(read-fragment(#:string:{"""a\nc"""}), "a\nc", <string-fragment>);
+define test string-literal-one-line-test ()
+  verify-mls("empty string", #:string:{""""""}, "");
 
-  // EOL canonicalization
-  verify-literal(read-fragment("\"\"\"a\nc\"\"\""),   "a\nc", <string-fragment>);
-  verify-literal(read-fragment("\"\"\"a\r\nc\"\"\""), "a\nc", <string-fragment>);
-  verify-literal(read-fragment("\"\"\"a\rc\"\"\""),   "a\nc", <string-fragment>);
-  verify-literal(read-fragment("\"\"\"a\n\rc\"\"\""), "a\n\nc", <string-fragment>);
+  // Make sure the reader didn't stop at the first pair of double quotes...
+  let empty-string-fragment = read-fragment(#:string:{""""""});
+  assert-equal(#:string:{""""""},
+               source-location-string(fragment-source-location(empty-string-fragment)),
+               "entire empty string consumed");
+
+  verify-mls("simple abc",
+             #:string:{"""abc"""},  "abc");
+  verify-mls("abc with spaces",
+             #:string:{""" abc """},  " abc ");
+end test;
+
+define test string-literal-multi-line-test ()
+  verify-mls("multi-line empty string, no prefix",
+             #:string:{"""
+"""},
+             "");
+  verify-mls("multi-line empty string, with prefix",
+             #:string:{"""
+"""},
+             "");
+  verify-mls("multi-line one blank line, no prefix",
+             #:string:{"""
+
+"""},
+             "\n");
+  verify-mls("leading whitespace relative to end delim retained",
+             #:string:{"""
+  abc
+def
+"""},
+             "  abc\ndef");
+  verify-mls("end delim to right of start delim",
+             #:string:{"""
+                           abc
+                         def
+                         """},
+             "  abc\ndef");
+  verify-mls("whitespace on first line ignored?", // 0x20 = space
+             #:string:{"""\<20>\<20>
+  abc
+def
+"""},
+             "  abc\ndef");
+  // The first blank line below is truly empty and the second one has only the prefix
+  // (written as \<20> to avoid editors removing trailing whitespace).
+  verify-mls("blank lines retained",
+             #:string:{"""
+
+   def
+\<20>\<20>\<20>
+   """},
+             "\ndef\n");
+  assert-signals(<error>,
+                 read-fragment(#:string:{"""a  (only whitespace allowed after start delim)
+abc
+"""}),
+                 "junk on first line");
+  assert-signals(<error>,
+                 read-fragment(#:string:{"""
+abc
+xxx"""}),
+                  "junk on last line");
+  assert-signals(<error>,
+                 read-fragment(#:string:{"""
+   abc
+  xxx  (this line not indented enough)
+   """}),
+                 "prefix mismatch non-white");
+  // Prefix should be "   " but one line has a literal tab in prefix.
+  /* TODO: the literal tab causes a failure due (I presume) to
+           https://github.com/dylan-lang/opendylan/issues/425
+  check-condition("prefix mismatch whitespace",
+                  <error>,
+                  read-fragment("\"\"\"\n   aaa\n \t bbb\n   \"\"\""));
+  */
+
+  // Check that CRLF and CR are converted to LF.
+  verify-mls("eol canonicalized 1",
+             "\"\"\"\na\r\nc\n\"\"\"",
+             "a\nc");
+  verify-mls("eol canonicalized 2",
+             "\"\"\"\na\rc\n\"\"\"",
+             "a\nc");
+  verify-mls("eol canonicalized 3",
+             "\"\"\"\r\na\n\rc\r\n\"\"\"",
+             "a\n\nc");
 
   let char = curry(as, <character>);
-  // One of every escape sequence. "\a\b\e\f\n\r\t\0\'\"\\"
-  verify-literal(read-fragment(#:string:{"""\a\b\e\f\n\r\t\0\'\"\\"""}),
-                 map-as(<string>, char, #('\a', '\b', '\e', '\f', '\n', '\r',
-                                          '\t', '\0', '\'', '\"', '\\')),
-                 <string-fragment>);
-  // Basic hex escaping.
-  verify-literal(read-fragment(#:string:{"""z\<9f>z"""}),
-                 map-as(<string>, char, #('z', #x9f, 'z')),
-                 <string-fragment>);
+  verify-mls("all escape sequences",
+             #:string:{"""\a\b\e\f\n\r\t\0\'\"\\"""},
+             map-as(<string>, char,
+                    #('\a', '\b', '\e', '\f', '\n', '\r', '\t', '\0', '\'', '\"', '\\')));
+  verify-mls("basic hex escaping",
+             #:string:{"""z\<9f>z"""},
+             map-as(<string>, char, #('z', #x9f, 'z')));
   // We can't handle character codes > 255 yet, but the leading zeros shouldn't
   // confuse the reader.
-  verify-literal(read-fragment(#:string:{"""z\<009f>z"""}),
-                 map-as(<string>, char, #('z', #x9f, 'z')),
-                 <string-fragment>);
-
-  assert-signals(<invalid-token>, read-fragment(#:string:{"""\1<b>"""}));
+  verify-mls("hex escape with leading zeros",
+             #:string:{"""z\<009f>z"""},
+             map-as(<string>, char, #('z', #x9f, 'z')));
+
+  assert-signals(<invalid-token>,
+                 read-fragment(#:string:{"""\1<b>"""}),
+                 "invalid escape sequence");
+
+  verify-mls("one line",
+             #:string:{
+"""
+abc
+"""},
+                  "abc");
+  verify-mls("one line with prefix",
+             #:string:{
+  """
+  abc
+  """},
+             "abc");
+  verify-mls("two lines",
+             #:string:{
+"""
+abc
+def
+"""},
+             "abc\ndef");
+  verify-mls("two lines with prefix",
+             #:string:{
+  """
+  abc
+  def
+  """},
+             "abc\ndef");
+  verify-mls("empty line at start",
+             #:string:{
+"""
+
+abc
+"""},
+             "\nabc");
+  verify-mls("two empty lines at start",
+             #:string:{
+"""
+
+
+abc
+"""},
+             "\n\nabc");
+  verify-mls("one empty line",
+             #:string:{
+"""
+
+"""},
+             "\n");
+  verify-mls("one empty line with prefix",
+             #:string:{
+  """
+\<20>\<20>
+  """},
+             "\n");
+  verify-mls("empty line at end",
+             #:string:{
+"""
+abc
+
+"""},
+             "abc\n");
+  verify-mls("two empty lines at end",
+             #:string:{
+"""
+abc
+
+
+"""},
+                  "abc\n\n");
+  verify-mls("empty lines at start and end",
+             #:string:{
+"""
+
+abc
+
+"""},
+             "\nabc\n");
+  verify-mls("two empty lines",
+             #:string:{
+"""
+
+
+"""},
+             "\n\n");
+  verify-mls("three empty lines",
+             #:string:{
+"""
+
+
+
+"""},
+             "\n\n\n");
+  verify-mls("three empty lines at end",
+             #:string:{
+"""
+abc
+
+
+
+"""},
+             "abc\n\n\n");
 end test;
 
 define test string-literal-raw-one-line-test ()
@@ -362,6 +547,7 @@ define suite literal-test-suite ()
   test pair-literal-test;
   test ratio-literal-test;
   test string-literal-multi-line-test;
+  test string-literal-one-line-test;
   test string-literal-raw-multi-line-test;
   test string-literal-raw-one-line-test;
   test string-literal-test;