From 6b0c68e62a7238ec54faa45f188e165412464277 Mon Sep 17 00:00:00 2001 From: Masaaki Goshima Date: Sat, 9 Nov 2024 13:00:12 +0900 Subject: [PATCH] Fix parsing of document and string (#513) * fix invalid test case * fix parsing of document * fix validation --- encode_test.go | 3 - lexer/lexer_test.go | 210 ++++++++++++++++++++++++++++++++++++++ parser/parser.go | 6 +- parser/parser_test.go | 47 +++++++++ scanner/context.go | 33 +++--- scanner/scanner.go | 30 +++--- testdata/validate_test.go | 16 +-- 7 files changed, 306 insertions(+), 39 deletions(-) diff --git a/encode_test.go b/encode_test.go index 7c55269c..2917c857 100644 --- a/encode_test.go +++ b/encode_test.go @@ -922,9 +922,6 @@ func TestEncodeWithNestedYAML(t *testing.T) { value: map[string]interface{}{"v": "# comment\n"}, expectDifferent: true, }, - { - value: map[string]interface{}{"v": "\n"}, - }, } for _, test := range tests { diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go index bd2bd24c..658ff88b 100644 --- a/lexer/lexer_test.go +++ b/lexer/lexer_test.go @@ -1877,6 +1877,216 @@ a: !!binary | }, { YAML: ` +a: + b + + c +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "a", + Origin: "\na", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "b\nc", + Origin: "\n b\n\n c", + }, + }, + }, + { + YAML: ` +a: + b + + + c + d +e: f +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "a", + Origin: "\na", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "b\nc d", + Origin: "\n b\n\n\n c\n d\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "e", + Origin: "e", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "f", + Origin: " f", + }, + }, + }, + { + YAML: ` +a: | + b + + + c + d +e: f +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "a", + Origin: "\na", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.LiteralType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockScalarIndicator, + Value: "|", + Origin: " |\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "b \n\n \nc\nd \n", + Origin: " b \n\n \n c\n d \n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "e", + Origin: "e", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "f", + Origin: " f", + }, + }, + }, + { + YAML: ` +a: > + b + + + c + d +e: f +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "a", + Origin: "\na", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.FoldedType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockScalarIndicator, + Value: ">", + Origin: " >\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "b \n\n \nc d \n", + Origin: " b \n\n \n c\n d \n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "e", + Origin: "e", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "f", + Origin: " f", + }, + }, + }, + { + YAML: ` a: > Text`, Tokens: token.Tokens{ diff --git a/parser/parser.go b/parser/parser.go index 65bff398..36840565 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -481,7 +481,11 @@ func (p *parser) parseMappingValue(ctx *context) (ast.Node, error) { ntk = p.nextNotCommentToken() antk = p.afterNextNotCommentToken() } - if tk := p.nextNotCommentToken(); tk != nil && tk.Position.Line > node.Start.Position.Line && tk.Position.Column > node.Start.Position.Column { + validationTk := node.Start + if len(node.Values) != 0 { + validationTk = node.Values[len(node.Values)-1].Key.GetToken() + } + if tk := p.nextNotCommentToken(); tk != nil && tk.Position.Line > validationTk.Position.Line && tk.Position.Column > validationTk.Position.Column { // a: b // c <= this token is invalid. return nil, errors.ErrSyntax("value is not allowed in this context", tk) diff --git a/parser/parser_test.go b/parser/parser_test.go index ae2fe635..281cd945 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -608,6 +608,20 @@ i: 'j' "e": "f" g: "h" i: 'j' +`, + }, + { + ` +a: + - |2 + b + c: d +`, + ` +a: + - |2 + b + c: d `, }, } @@ -1165,6 +1179,39 @@ b: - 2 ^ `, }, + { + ` +a: + - | + b + c: d +`, + ` +[5:5] value is not allowed in this context + 2 | a: + 3 | - | + 4 | b +> 5 | c: d + ^ +`, + }, + { + ` +a: + - | + b + c: + d: e +`, + ` +[5:5] value is not allowed in this context + 2 | a: + 3 | - | + 4 | b +> 5 | c: + ^ + 6 | d: e`, + }, } for _, test := range tests { t.Run(test.source, func(t *testing.T) { diff --git a/scanner/context.go b/scanner/context.go index 5263f3e4..f6c06e19 100644 --- a/scanner/context.go +++ b/scanner/context.go @@ -145,13 +145,8 @@ func (c *Context) addDocumentIndent(column int) { // If the first line of the document has already been evaluated, the number is treated as the threshold, since the `docFirstLineIndentColumn` is a positive number. if c.docFirstLineIndentColumn <= column { - // In the folded state, new-line-char is normally treated as space, - // but if the number of indents is different from the number of indents in the first line, - // new-line-char is used as is instead of space. - // Therefore, it is necessary to replace the space already added to buf. // `c.docFoldedNewLine` is a variable that is set to true for every newline. - if c.isFolded && c.docFoldedNewLine { - c.buf[len(c.buf)-1] = '\n' + if (c.isFolded || c.isRawFolded) && c.docFoldedNewLine { c.docFoldedNewLine = false } // Since addBuf ignore space character, add to the buffer directly. @@ -159,19 +154,23 @@ func (c *Context) addDocumentIndent(column int) { } } -func (c *Context) addDocumentNewLineInFolded(column int) { - if !c.isFolded { +// updateDocumentNewLineInFolded if Folded or RawFolded context and the content on the current line starts at the same column as the previous line, +// treat the new-line-char as a space. +func (c *Context) updateDocumentNewLineInFolded(column int) { + if c.isLiteral { return } + + // Folded or RawFolded. + if !c.docFoldedNewLine { return } - if c.docFirstLineIndentColumn == c.docLineIndentColumn && - c.docLineIndentColumn == c.docPrevLineIndentColumn { - // use space as a new line delimiter. - return + if c.docLineIndentColumn == c.docPrevLineIndentColumn { + if c.buf[len(c.buf)-1] == '\n' { + c.buf[len(c.buf)-1] = ' ' + } } - c.buf[len(c.buf)-1] = '\n' c.docFoldedNewLine = false } @@ -298,12 +297,18 @@ func (c *Context) bufferedSrc() []rune { // If the text ends with a space character, remove all of them. src = []rune(strings.TrimRight(string(src), " ")) + if string(src) == "\n" { + // If the content consists only of a newline, + // it can be considered as the document ending without any specified value, + // so it is treated as an empty string. + src = []rune{} + } } return src } func (c *Context) hasTrimAllEndNewlineOpt() bool { - return strings.HasPrefix(c.docOpt, "-") || strings.HasSuffix(c.docOpt, "-") + return strings.HasPrefix(c.docOpt, "-") || strings.HasSuffix(c.docOpt, "-") || c.isRawFolded } func (c *Context) bufferedToken(pos *token.Position) *token.Token { diff --git a/scanner/scanner.go b/scanner/scanner.go index e89290ad..7d7cbbd2 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -176,7 +176,7 @@ func (s *Scanner) indentStateFromIndentNumDifference() IndentState { } func (s *Scanner) updateIndent(ctx *Context, c rune) { - if s.isFirstCharAtLine && s.isNewLineChar(c) && ctx.isDocument() { + if s.isFirstCharAtLine && s.isNewLineChar(c) { return } if s.isFirstCharAtLine && c == ' ' { @@ -557,21 +557,13 @@ func (s *Scanner) scanDocument(ctx *Context, c rune) error { s.progressColumn(ctx, 1) return ErrInvalidToken(err.Error(), invalidTk) } - if ctx.isLiteral { - ctx.addBuf(c) - } else if ctx.isFolded { - ctx.addBuf(c) - } + ctx.addBuf(c) value := ctx.bufferedSrc() ctx.addToken(token.String(string(value), string(ctx.obuf), s.pos())) ctx.resetBuffer() s.progressColumn(ctx, 1) } else if s.isNewLineChar(c) { - if ctx.isLiteral { - ctx.addBuf(c) - } else { - ctx.addBuf(' ') - } + ctx.addBuf(c) ctx.updateDocumentNewLineState() s.progressLine(ctx) } else if s.isFirstCharAtLine && c == ' ' { @@ -579,12 +571,15 @@ func (s *Scanner) scanDocument(ctx *Context, c rune) error { s.progressColumn(ctx, 1) } else { ctx.updateDocumentLineIndentColumn(s.column) + if ctx.docFirstLineIndentColumn > 0 { + s.lastDelimColumn = ctx.docFirstLineIndentColumn - 1 + } if err := ctx.validateDocumentLineIndentColumn(); err != nil { invalidTk := token.Invalid(string(ctx.obuf), s.pos()) s.progressColumn(ctx, 1) return ErrInvalidToken(err.Error(), invalidTk) } - ctx.addDocumentNewLineInFolded(s.column) + ctx.updateDocumentNewLineInFolded(s.column) ctx.addBuf(c) s.progressColumn(ctx, 1) } @@ -626,7 +621,15 @@ func (s *Scanner) scanNewLine(ctx *Context, c rune) { } else if s.isAnchor { s.addBufferedTokenIfExists(ctx) } - ctx.addBuf(' ') + if ctx.existsBuffer() && s.isFirstCharAtLine { + if ctx.buf[len(ctx.buf)-1] == ' ' { + ctx.buf[len(ctx.buf)-1] = '\n' + } else { + ctx.buf = append(ctx.buf, '\n') + } + } else { + ctx.addBuf(' ') + } ctx.addOriginBuf(c) s.progressLine(ctx) } @@ -789,6 +792,7 @@ func (s *Scanner) scanRawFoldedChar(ctx *Context) bool { return false } + ctx.updateDocumentLineIndentColumn(s.column) ctx.isRawFolded = true ctx.addBuf('-') ctx.addOriginBuf('-') diff --git a/testdata/validate_test.go b/testdata/validate_test.go index d73ee3a3..d33f95a2 100644 --- a/testdata/validate_test.go +++ b/testdata/validate_test.go @@ -117,11 +117,11 @@ name: myDocument roles: name: myRole permissions: - - hello - - how - - are - - you - `, + - hello + - how + - are + - you +`, ExpectedErr: `[4:7] mapping was used where sequence is expected 1 | --- 2 | name: myDocument @@ -129,9 +129,9 @@ roles: > 4 | name: myRole ^ 5 | permissions: - 6 | - hello - 7 | - how - 8 | `, + 6 | - hello + 7 | - how + 8 | `, Instance: &struct { Name string `yaml:"name"` Roles []struct {