Fix parsing of document and string (#513)

* fix invalid test case * fix parsing of document * fix validation
goccy · Nov 9, 2024 · 6b0c68e · 6b0c68e
1 parent e1bab38
commit 6b0c68e
Show file tree

Hide file tree

Showing 7 changed files with 306 additions and 39 deletions.
diff --git a/encode_test.go b/encode_test.go
@@ -922,9 +922,6 @@ func TestEncodeWithNestedYAML(t *testing.T) {
 			value:           map[string]interface{}{"v": "# comment\n"},
 			expectDifferent: true,
 		},
-		{
-			value: map[string]interface{}{"v": "\n"},
-		},
 	}
 
 	for _, test := range tests {

diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go
@@ -1877,6 +1877,216 @@ a: !!binary |
 		},
 		{
 			YAML: `
+a:
+ b
+
+ c
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b\nc",
+					Origin:        "\n b\n\n c",
+				},
+			},
+		},
+		{
+			YAML: `
+a:   
+ b   
+
+  
+ c
+ d 
+e: f
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b\nc d",
+					Origin:        "\n b\n\n\n c\n d\n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "e",
+					Origin:        "e",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "f",
+					Origin:        " f",
+				},
+			},
+		},
+		{
+			YAML: `
+a: |
+ b   
+
+  
+ c
+ d 
+e: f
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.LiteralType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockScalarIndicator,
+					Value:         "|",
+					Origin:        " |\n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b   \n\n \nc\nd \n",
+					Origin:        " b   \n\n  \n c\n d \n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "e",
+					Origin:        "e",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "f",
+					Origin:        " f",
+				},
+			},
+		},
+		{
+			YAML: `
+a: >
+ b   
+
+  
+ c
+ d 
+e: f
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.FoldedType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockScalarIndicator,
+					Value:         ">",
+					Origin:        " >\n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b   \n\n \nc d \n",
+					Origin:        " b   \n\n  \n c\n d \n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "e",
+					Origin:        "e",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "f",
+					Origin:        " f",
+				},
+			},
+		},
+		{
+			YAML: `
 a: >
   Text`,
 			Tokens: token.Tokens{

diff --git a/parser/parser.go b/parser/parser.go
@@ -481,7 +481,11 @@ func (p *parser) parseMappingValue(ctx *context) (ast.Node, error) {
 		ntk = p.nextNotCommentToken()
 		antk = p.afterNextNotCommentToken()
 	}
-	if tk := p.nextNotCommentToken(); tk != nil && tk.Position.Line > node.Start.Position.Line && tk.Position.Column > node.Start.Position.Column {
+	validationTk := node.Start
+	if len(node.Values) != 0 {
+		validationTk = node.Values[len(node.Values)-1].Key.GetToken()
+	}
+	if tk := p.nextNotCommentToken(); tk != nil && tk.Position.Line > validationTk.Position.Line && tk.Position.Column > validationTk.Position.Column {
 		// a: b
 		//   c <= this token is invalid.
 		return nil, errors.ErrSyntax("value is not allowed in this context", tk)

diff --git a/parser/parser_test.go b/parser/parser_test.go
@@ -608,6 +608,20 @@ i: 'j'
 "e": "f"
 g: "h"
 i: 'j'
+`,
+		},
+		{
+			`
+a:
+  - |2
+        b
+    c: d
+`,
+			`
+a:
+  - |2
+        b
+    c: d
 `,
 		},
 	}
@@ -1165,6 +1179,39 @@ b: - 2
          ^
 `,
 		},
+		{
+			`
+a:
+  - |
+        b
+    c: d
+`,
+			`
+[5:5] value is not allowed in this context
+   2 | a:
+   3 |   - |
+   4 |         b
+>  5 |     c: d
+           ^
+`,
+		},
+		{
+			`
+a:
+  - |
+        b
+    c:
+      d: e
+`,
+			`
+[5:5] value is not allowed in this context
+   2 | a:
+   3 |   - |
+   4 |         b
+>  5 |     c:
+           ^
+   6 |       d: e`,
+		},
 	}
 	for _, test := range tests {
 		t.Run(test.source, func(t *testing.T) {

diff --git a/scanner/context.go b/scanner/context.go
@@ -145,33 +145,32 @@ func (c *Context) addDocumentIndent(column int) {
 
 	// If the first line of the document has already been evaluated, the number is treated as the threshold, since the `docFirstLineIndentColumn` is a positive number.
 	if c.docFirstLineIndentColumn <= column {
-		// In the folded state, new-line-char is normally treated as space,
-		// but if the number of indents is different from the number of indents in the first line,
-		// new-line-char is used as is instead of space.
-		// Therefore, it is necessary to replace the space already added to buf.
 		// `c.docFoldedNewLine` is a variable that is set to true for every newline.
-		if c.isFolded && c.docFoldedNewLine {
-			c.buf[len(c.buf)-1] = '\n'
+		if (c.isFolded || c.isRawFolded) && c.docFoldedNewLine {
 			c.docFoldedNewLine = false
 		}
 		// Since addBuf ignore space character, add to the buffer directly.
 		c.buf = append(c.buf, ' ')
 	}
 }
 
-func (c *Context) addDocumentNewLineInFolded(column int) {
-	if !c.isFolded {
+// updateDocumentNewLineInFolded handles the pending new-line-char in a Folded or RawFolded context:
+// if the content on the current line starts at the same column as the previous line, the new-line-char is treated as a space.
+func (c *Context) updateDocumentNewLineInFolded(column int) {
+	if c.isLiteral {
 		return
 	}
+
+	// Folded or RawFolded.
+
 	if !c.docFoldedNewLine {
 		return
 	}
-	if c.docFirstLineIndentColumn == c.docLineIndentColumn &&
-		c.docLineIndentColumn == c.docPrevLineIndentColumn {
-		// use space as a new line delimiter.
-		return
+	if c.docLineIndentColumn == c.docPrevLineIndentColumn {
+		if c.buf[len(c.buf)-1] == '\n' {
+			c.buf[len(c.buf)-1] = ' '
+		}
 	}
-	c.buf[len(c.buf)-1] = '\n'
 	c.docFoldedNewLine = false
 }
 
@@ -298,12 +297,18 @@ func (c *Context) bufferedSrc() []rune {
 
 		// If the text ends with a space character, remove all of them.
 		src = []rune(strings.TrimRight(string(src), " "))
+		if string(src) == "\n" {
+			// If the content consists only of a newline,
+			// it can be considered as the document ending without any specified value,
+			// so it is treated as an empty string.
+			src = []rune{}
+		}
 	}
 	return src
 }
 
 func (c *Context) hasTrimAllEndNewlineOpt() bool {
-	return strings.HasPrefix(c.docOpt, "-") || strings.HasSuffix(c.docOpt, "-")
+	return strings.HasPrefix(c.docOpt, "-") || strings.HasSuffix(c.docOpt, "-") || c.isRawFolded
 }
 
 func (c *Context) bufferedToken(pos *token.Position) *token.Token {