From 6b0c68e62a7238ec54faa45f188e165412464277 Mon Sep 17 00:00:00 2001
From: Masaaki Goshima <goccy54@gmail.com>
Date: Sat, 9 Nov 2024 13:00:12 +0900
Subject: [PATCH] Fix parsing of document and string (#513)

* fix invalid test case
* fix parsing of document
* fix validation
---
 encode_test.go            |   3 -
 lexer/lexer_test.go       | 210 ++++++++++++++++++++++++++++++++++++++
 parser/parser.go          |   6 +-
 parser/parser_test.go     |  47 +++++++++
 scanner/context.go        |  33 +++---
 scanner/scanner.go        |  30 +++---
 testdata/validate_test.go |  16 +--
 7 files changed, 306 insertions(+), 39 deletions(-)

diff --git a/encode_test.go b/encode_test.go
index 7c55269c..2917c857 100644
--- a/encode_test.go
+++ b/encode_test.go
@@ -922,9 +922,6 @@ func TestEncodeWithNestedYAML(t *testing.T) {
 			value:           map[string]interface{}{"v": "# comment\n"},
 			expectDifferent: true,
 		},
-		{
-			value: map[string]interface{}{"v": "\n"},
-		},
 	}
 
 	for _, test := range tests {
diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go
index bd2bd24c..658ff88b 100644
--- a/lexer/lexer_test.go
+++ b/lexer/lexer_test.go
@@ -1877,6 +1877,216 @@ a: !!binary |
 		},
 		{
 			YAML: `
+a:
+ b
+
+ c
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b\nc",
+					Origin:        "\n b\n\n c",
+				},
+			},
+		},
+		{
+			YAML: `
+a:   
+ b   
+
+  
+ c
+ d 
+e: f
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b\nc d",
+					Origin:        "\n b\n\n\n c\n d\n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "e",
+					Origin:        "e",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "f",
+					Origin:        " f",
+				},
+			},
+		},
+		{
+			YAML: `
+a: |
+ b   
+
+  
+ c
+ d 
+e: f
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.LiteralType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockScalarIndicator,
+					Value:         "|",
+					Origin:        " |\n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b   \n\n \nc\nd \n",
+					Origin:        " b   \n\n  \n c\n d \n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "e",
+					Origin:        "e",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "f",
+					Origin:        " f",
+				},
+			},
+		},
+		{
+			YAML: `
+a: >
+ b   
+
+  
+ c
+ d 
+e: f
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.FoldedType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockScalarIndicator,
+					Value:         ">",
+					Origin:        " >\n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b   \n\n \nc d \n",
+					Origin:        " b   \n\n  \n c\n d \n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "e",
+					Origin:        "e",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "f",
+					Origin:        " f",
+				},
+			},
+		},
+		{
+			YAML: `
 a: >
   Text`,
 			Tokens: token.Tokens{
diff --git a/parser/parser.go b/parser/parser.go
index 65bff398..36840565 100644
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -481,7 +481,11 @@ func (p *parser) parseMappingValue(ctx *context) (ast.Node, error) {
 		ntk = p.nextNotCommentToken()
 		antk = p.afterNextNotCommentToken()
 	}
-	if tk := p.nextNotCommentToken(); tk != nil && tk.Position.Line > node.Start.Position.Line && tk.Position.Column > node.Start.Position.Column {
+	validationTk := node.Start
+	if len(node.Values) != 0 {
+		validationTk = node.Values[len(node.Values)-1].Key.GetToken()
+	}
+	if tk := p.nextNotCommentToken(); tk != nil && tk.Position.Line > validationTk.Position.Line && tk.Position.Column > validationTk.Position.Column {
 		// a: b
 		//   c <= this token is invalid.
 		return nil, errors.ErrSyntax("value is not allowed in this context", tk)
diff --git a/parser/parser_test.go b/parser/parser_test.go
index ae2fe635..281cd945 100644
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -608,6 +608,20 @@ i: 'j'
 "e": "f"
 g: "h"
 i: 'j'
+`,
+		},
+		{
+			`
+a:
+  - |2
+        b
+    c: d
+`,
+			`
+a:
+  - |2
+        b
+    c: d
 `,
 		},
 	}
@@ -1165,6 +1179,39 @@ b: - 2
          ^
 `,
 		},
+		{
+			`
+a:
+  - |
+        b
+    c: d
+`,
+			`
+[5:5] value is not allowed in this context
+   2 | a:
+   3 |   - |
+   4 |         b
+>  5 |     c: d
+           ^
+`,
+		},
+		{
+			`
+a:
+  - |
+        b
+    c:
+      d: e
+`,
+			`
+[5:5] value is not allowed in this context
+   2 | a:
+   3 |   - |
+   4 |         b
+>  5 |     c:
+           ^
+   6 |       d: e`,
+		},
 	}
 	for _, test := range tests {
 		t.Run(test.source, func(t *testing.T) {
diff --git a/scanner/context.go b/scanner/context.go
index 5263f3e4..f6c06e19 100644
--- a/scanner/context.go
+++ b/scanner/context.go
@@ -145,13 +145,8 @@ func (c *Context) addDocumentIndent(column int) {
 
 	// If the first line of the document has already been evaluated, the number is treated as the threshold, since the `docFirstLineIndentColumn` is a positive number.
 	if c.docFirstLineIndentColumn <= column {
-		// In the folded state, new-line-char is normally treated as space,
-		// but if the number of indents is different from the number of indents in the first line,
-		// new-line-char is used as is instead of space.
-		// Therefore, it is necessary to replace the space already added to buf.
 		// `c.docFoldedNewLine` is a variable that is set to true for every newline.
-		if c.isFolded && c.docFoldedNewLine {
-			c.buf[len(c.buf)-1] = '\n'
+		if (c.isFolded || c.isRawFolded) && c.docFoldedNewLine {
 			c.docFoldedNewLine = false
 		}
 		// Since addBuf ignore space character, add to the buffer directly.
@@ -159,19 +154,23 @@ func (c *Context) addDocumentIndent(column int) {
 	}
 }
 
-func (c *Context) addDocumentNewLineInFolded(column int) {
-	if !c.isFolded {
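+// updateDocumentNewLineInFolded handles the pending new-line-char in a Folded or RawFolded context (it does nothing in a Literal context).
+// If the content on the current line starts at the same column as the previous line, the trailing new-line-char in the buffer is replaced with a space.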
+func (c *Context) updateDocumentNewLineInFolded(column int) {
+	if c.isLiteral {
 		return
 	}
+
+	// Folded or RawFolded.
+
 	if !c.docFoldedNewLine {
 		return
 	}
-	if c.docFirstLineIndentColumn == c.docLineIndentColumn &&
-		c.docLineIndentColumn == c.docPrevLineIndentColumn {
-		// use space as a new line delimiter.
-		return
+	if c.docLineIndentColumn == c.docPrevLineIndentColumn {
+		if c.buf[len(c.buf)-1] == '\n' {
+			c.buf[len(c.buf)-1] = ' '
+		}
 	}
-	c.buf[len(c.buf)-1] = '\n'
 	c.docFoldedNewLine = false
 }
 
@@ -298,12 +297,18 @@ func (c *Context) bufferedSrc() []rune {
 
 		// If the text ends with a space character, remove all of them.
 		src = []rune(strings.TrimRight(string(src), " "))
+		if string(src) == "\n" {
+			// If the content consists only of a newline,
+			// it can be considered as the document ending without any specified value,
+			// so it is treated as an empty string.
+			src = []rune{}
+		}
 	}
 	return src
 }
 
 func (c *Context) hasTrimAllEndNewlineOpt() bool {
-	return strings.HasPrefix(c.docOpt, "-") || strings.HasSuffix(c.docOpt, "-")
+	return strings.HasPrefix(c.docOpt, "-") || strings.HasSuffix(c.docOpt, "-") || c.isRawFolded
 }
 
 func (c *Context) bufferedToken(pos *token.Position) *token.Token {
diff --git a/scanner/scanner.go b/scanner/scanner.go
index e89290ad..7d7cbbd2 100644
--- a/scanner/scanner.go
+++ b/scanner/scanner.go
@@ -176,7 +176,7 @@ func (s *Scanner) indentStateFromIndentNumDifference() IndentState {
 }
 
 func (s *Scanner) updateIndent(ctx *Context, c rune) {
-	if s.isFirstCharAtLine && s.isNewLineChar(c) && ctx.isDocument() {
+	if s.isFirstCharAtLine && s.isNewLineChar(c) {
 		return
 	}
 	if s.isFirstCharAtLine && c == ' ' {
@@ -557,21 +557,13 @@ func (s *Scanner) scanDocument(ctx *Context, c rune) error {
 			s.progressColumn(ctx, 1)
 			return ErrInvalidToken(err.Error(), invalidTk)
 		}
-		if ctx.isLiteral {
-			ctx.addBuf(c)
-		} else if ctx.isFolded {
-			ctx.addBuf(c)
-		}
+		ctx.addBuf(c)
 		value := ctx.bufferedSrc()
 		ctx.addToken(token.String(string(value), string(ctx.obuf), s.pos()))
 		ctx.resetBuffer()
 		s.progressColumn(ctx, 1)
 	} else if s.isNewLineChar(c) {
-		if ctx.isLiteral {
-			ctx.addBuf(c)
-		} else {
-			ctx.addBuf(' ')
-		}
+		ctx.addBuf(c)
 		ctx.updateDocumentNewLineState()
 		s.progressLine(ctx)
 	} else if s.isFirstCharAtLine && c == ' ' {
@@ -579,12 +571,15 @@ func (s *Scanner) scanDocument(ctx *Context, c rune) error {
 		s.progressColumn(ctx, 1)
 	} else {
 		ctx.updateDocumentLineIndentColumn(s.column)
+		if ctx.docFirstLineIndentColumn > 0 {
+			s.lastDelimColumn = ctx.docFirstLineIndentColumn - 1
+		}
 		if err := ctx.validateDocumentLineIndentColumn(); err != nil {
 			invalidTk := token.Invalid(string(ctx.obuf), s.pos())
 			s.progressColumn(ctx, 1)
 			return ErrInvalidToken(err.Error(), invalidTk)
 		}
-		ctx.addDocumentNewLineInFolded(s.column)
+		ctx.updateDocumentNewLineInFolded(s.column)
 		ctx.addBuf(c)
 		s.progressColumn(ctx, 1)
 	}
@@ -626,7 +621,15 @@ func (s *Scanner) scanNewLine(ctx *Context, c rune) {
 	} else if s.isAnchor {
 		s.addBufferedTokenIfExists(ctx)
 	}
-	ctx.addBuf(' ')
+	if ctx.existsBuffer() && s.isFirstCharAtLine {
+		if ctx.buf[len(ctx.buf)-1] == ' ' {
+			ctx.buf[len(ctx.buf)-1] = '\n'
+		} else {
+			ctx.buf = append(ctx.buf, '\n')
+		}
+	} else {
+		ctx.addBuf(' ')
+	}
 	ctx.addOriginBuf(c)
 	s.progressLine(ctx)
 }
@@ -789,6 +792,7 @@ func (s *Scanner) scanRawFoldedChar(ctx *Context) bool {
 		return false
 	}
 
+	ctx.updateDocumentLineIndentColumn(s.column)
 	ctx.isRawFolded = true
 	ctx.addBuf('-')
 	ctx.addOriginBuf('-')
diff --git a/testdata/validate_test.go b/testdata/validate_test.go
index d73ee3a3..d33f95a2 100644
--- a/testdata/validate_test.go
+++ b/testdata/validate_test.go
@@ -117,11 +117,11 @@ name: myDocument
 roles:
   name: myRole
   permissions:
-	- hello
-	- how
-	- are
-	- you
-	`,
+    - hello
+    - how
+    - are
+    - you
+`,
 			ExpectedErr: `[4:7] mapping was used where sequence is expected
    1 | ---
    2 | name: myDocument
@@ -129,9 +129,9 @@ roles:
 >  4 |   name: myRole
              ^
    5 |   permissions:
-   6 | 	- hello
-   7 | 	- how
-   8 | `,
+   6 |     - hello
+   7 |     - how
+   8 |     `,
 			Instance: &struct {
 				Name  string `yaml:"name"`
 				Roles []struct {