From 17ae08e94b80dfffbc99c6a5e741009230162789 Mon Sep 17 00:00:00 2001 From: Masaaki Goshima Date: Wed, 13 Nov 2024 14:57:25 +0900 Subject: [PATCH 1/2] fix tab character handling --- lexer/lexer_test.go | 269 ++++++++++++++++++++++-------------------- parser/parser_test.go | 2 +- path_test.go | 3 +- scanner/context.go | 2 +- scanner/scanner.go | 32 ++++- 5 files changed, 171 insertions(+), 137 deletions(-) diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go index b7392d20..c7be8aa1 100644 --- a/lexer/lexer_test.go +++ b/lexer/lexer_test.go @@ -15,14 +15,14 @@ func TestTokenize(t *testing.T) { }{ { YAML: `null - `, + `, Tokens: token.Tokens{ { Type: token.NullType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "null", - Origin: "null\n\t\t", + Origin: "null\n ", }, }, }, @@ -100,7 +100,7 @@ func TestTokenize(t *testing.T) { }, { YAML: `{} - `, + `, Tokens: token.Tokens{ { Type: token.MappingStartType, @@ -119,8 +119,7 @@ func TestTokenize(t *testing.T) { }, }, { - YAML: `v: hi - `, + YAML: `v: hi`, Tokens: token.Tokens{ { Type: token.StringType, @@ -141,13 +140,38 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "hi", - Origin: " hi\n", + Origin: " hi", }, }, }, { - YAML: `v: "true" - `, + YAML: `v: a`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "v", + Origin: "v", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "a", + Origin: " a", + }, + }, + }, + { + YAML: `v: "true"`, Tokens: token.Tokens{ { Type: token.StringType, @@ -173,8 +197,7 @@ func TestTokenize(t *testing.T) { }, }, { - YAML: `v: "false" - `, + YAML: `v: "false"`, Tokens: token.Tokens{ { Type: token.StringType, @@ -200,8 +223,7 @@ func TestTokenize(t *testing.T) { }, }, { - YAML: `v: true - `, + YAML: `v: true`, Tokens: token.Tokens{ { Type: token.StringType, @@ -222,13 +244,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "true", - Origin: " true\n", + Origin: " true", }, }, }, { - YAML: `v: false - `, + YAML: `v: false`, Tokens: token.Tokens{ { Type: token.StringType, @@ -249,13 +270,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "false", - Origin: " false\n", + Origin: " false", }, }, }, { - YAML: `v: 10 - `, + YAML: `v: 10`, Tokens: token.Tokens{ { Type: token.StringType, @@ -276,13 +296,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "10", - Origin: " 10\n", + Origin: " 10", }, }, }, { - YAML: `v: -10 - `, + YAML: `v: -10`, Tokens: token.Tokens{ { Type: token.StringType, @@ -303,13 +322,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "-10", - Origin: " -10\n", + Origin: " -10", }, }, }, { - YAML: `v: 42 - `, + YAML: `v: 42`, Tokens: token.Tokens{ { Type: token.StringType, @@ -330,13 +348,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "42", - Origin: " 42\n", + Origin: " 42", }, }, }, { - YAML: `v: 4294967296 - `, + YAML: `v: 4294967296`, Tokens: token.Tokens{ { Type: token.StringType, @@ -357,13 +374,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "4294967296", - Origin: " 4294967296\n", + Origin: " 4294967296", }, }, }, { - YAML: `v: "10" - `, + YAML: `v: "10"`, Tokens: token.Tokens{ { Type: token.StringType, @@ -389,8 +405,7 @@ func TestTokenize(t *testing.T) { }, }, { - YAML: `v: 0.1 - `, + YAML: `v: 0.1`, Tokens: token.Tokens{ { Type: token.StringType, @@ -411,13 +426,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "0.1", - Origin: " 0.1\n", + Origin: " 0.1", }, }, }, { - YAML: `v: 0.99 - `, + YAML: `v: 0.99`, Tokens: token.Tokens{ { Type: token.StringType, @@ -438,13 +452,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "0.99", - Origin: " 0.99\n", + Origin: " 0.99", }, }, }, { - YAML: `v: -0.1 - `, + YAML: `v: -0.1`, Tokens: token.Tokens{ { Type: token.StringType, @@ -465,13 +478,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "-0.1", - Origin: " -0.1\n", + Origin: " -0.1", }, }, }, { - YAML: `v: .inf - `, + YAML: `v: .inf`, Tokens: token.Tokens{ { Type: token.StringType, @@ -492,13 +504,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: ".inf", - Origin: " .inf\n", + Origin: " .inf", }, }, }, { - YAML: `v: -.inf - `, + YAML: `v: -.inf`, Tokens: token.Tokens{ { Type: token.StringType, @@ -519,13 +530,12 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "-.inf", - Origin: " -.inf\n", + Origin: " -.inf", }, }, }, { - YAML: `v: .nan - `, + YAML: `v: .nan`, Tokens: token.Tokens{ { Type: token.StringType, @@ -546,7 +556,7 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: ".nan", - Origin: " .nan\n", + Origin: " .nan", }, }, }, @@ -586,8 +596,7 @@ a: }, }, { - YAML: `v: null - `, + YAML: `v: null`, Tokens: token.Tokens{ { Type: token.StringType, @@ -608,13 +617,12 @@ a: CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "null", - Origin: " null\n", + Origin: " null", }, }, }, { - YAML: `v: "" - `, + YAML: `v: ""`, Tokens: token.Tokens{ { Type: token.StringType, @@ -897,8 +905,7 @@ a: }, }, { - YAML: `a: '-' - `, + YAML: `a: '-'`, Tokens: token.Tokens{ { Type: token.StringType, @@ -924,15 +931,14 @@ a: }, }, { - YAML: `123 - `, + YAML: `123`, Tokens: token.Tokens{ { Type: token.IntegerType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "123", - Origin: "123\n\t\t", + Origin: "123", }, }, }, @@ -964,8 +970,7 @@ a: }, }, { - YAML: `a: null - `, + YAML: `a: null`, Tokens: token.Tokens{ { Type: token.StringType, @@ -986,13 +991,12 @@ a: CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "null", - Origin: " null\n", + Origin: " null", }, }, }, { - YAML: `a: {x: 1} - `, + YAML: `a: {x: 1}`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1046,8 +1050,7 @@ a: }, }, { - YAML: `a: [1, 2] - `, + YAML: `a: [1, 2]`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1101,16 +1104,17 @@ a: }, }, { - YAML: `t2: 2018-01-09T10:40:47Z - t4: 2098-01-09T10:40:47Z - `, + YAML: ` +t2: 2018-01-09T10:40:47Z +t4: 2098-01-09T10:40:47Z +`, Tokens: token.Tokens{ { Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "t2", - Origin: "t2", + Origin: "\nt2", }, { Type: token.MappingValueType, @@ -1130,8 +1134,8 @@ a: Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, - Value: "\t\tt4", - Origin: "\t\tt4", + Value: "t4", + Origin: "t4", }, { Type: token.MappingValueType, @@ -1145,13 +1149,12 @@ a: CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "2098-01-09T10:40:47Z", - Origin: " 2098-01-09T10:40:47Z\n", + Origin: " 2098-01-09T10:40:47Z", }, }, }, { - YAML: `a: {b: c, d: e} - `, + YAML: `a: {b: c, d: e}`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1233,8 +1236,7 @@ a: }, }, { - YAML: `a: 3s - `, + YAML: `a: 3s`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1255,13 +1257,12 @@ a: CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "3s", - Origin: " 3s\n", + Origin: " 3s", }, }, }, { - YAML: `a: - `, + YAML: `a: `, Tokens: token.Tokens{ { Type: token.StringType, @@ -1282,13 +1283,12 @@ a: CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "", - Origin: " \n", + Origin: " ", }, }, }, { - YAML: `a: "1:1" - `, + YAML: `a: "1:1"`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1314,8 +1314,7 @@ a: }, }, { - YAML: `a: "\0" - `, + YAML: `a: "\0"`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1341,8 +1340,7 @@ a: }, }, { - YAML: `a: !!binary gIGC - `, + YAML: `a: !!binary gIGC`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1370,7 +1368,7 @@ a: CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "gIGC", - Origin: "gIGC\n", + Origin: "gIGC", }, }, }, @@ -1419,20 +1417,21 @@ a: !!binary | }, }, { - YAML: `b: 2 - a: 1 - d: 4 - c: 3 - sub: - e: 5 - `, + YAML: ` +b: 2 +a: 1 +d: 4 +c: 3 +sub: + e: 5 +`, Tokens: token.Tokens{ { Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "b", - Origin: "b", + Origin: "\nb", }, { Type: token.MappingValueType, @@ -1452,8 +1451,8 @@ a: !!binary | Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, - Value: "\t\ta", - Origin: "\t\ta", + Value: "a", + Origin: "a", }, { Type: token.MappingValueType, @@ -1473,8 +1472,8 @@ a: !!binary | Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, - Value: "\t\td", - Origin: "\t\td", + Value: "d", + Origin: "d", }, { Type: token.MappingValueType, @@ -1494,8 +1493,8 @@ a: !!binary | Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, - Value: "\t\tc", - Origin: "\t\tc", + Value: "c", + Origin: "c", }, { Type: token.MappingValueType, @@ -1515,8 +1514,8 @@ a: !!binary | Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, - Value: "\t\tsub", - Origin: "\t\tsub", + Value: "sub", + Origin: "sub", }, { Type: token.MappingValueType, @@ -1529,8 +1528,8 @@ a: !!binary | Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, - Value: "\t\t e", - Origin: "\n\t\t e", + Value: "e", + Origin: "\n e", }, { Type: token.MappingValueType, @@ -1544,13 +1543,12 @@ a: !!binary | CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "5", - Origin: " 5\n", + Origin: " 5", }, }, }, { - YAML: `a: 1.2.3.4 - `, + YAML: `a: 1.2.3.4`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1571,13 +1569,12 @@ a: !!binary | CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "1.2.3.4", - Origin: " 1.2.3.4\n", + Origin: " 1.2.3.4", }, }, }, { - YAML: `a: "2015-02-24T18:19:39Z" - `, + YAML: `a: "2015-02-24T18:19:39Z"`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1603,8 +1600,7 @@ a: !!binary | }, }, { - YAML: `a: 'b: c' - `, + YAML: `a: 'b: c'`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1630,8 +1626,7 @@ a: !!binary | }, }, { - YAML: `a: 'Hello #comment' - `, + YAML: `a: 'Hello #comment'`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1657,8 +1652,7 @@ a: !!binary | }, }, { - YAML: `a: 100.5 - `, + YAML: `a: 100.5`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1679,13 +1673,12 @@ a: !!binary | CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "100.5", - Origin: " 100.5\n", + Origin: " 100.5", }, }, }, { - YAML: `a: bogus - `, + YAML: `a: bogus`, Tokens: token.Tokens{ { Type: token.StringType, @@ -1706,7 +1699,7 @@ a: !!binary | CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "bogus", - Origin: " bogus\n", + Origin: " bogus", }, }, }, @@ -1763,15 +1756,16 @@ a: !!binary | }, }, { - YAML: `a: "double quoted" - b: "value map"`, + YAML: ` +a: "double quoted" +b: "value map"`, Tokens: token.Tokens{ { Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "a", - Origin: "a", + Origin: "\na", }, { Type: token.MappingValueType, @@ -1791,8 +1785,8 @@ a: !!binary | Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, - Value: "\t\tb", - Origin: "\n\t\tb", + Value: "b", + Origin: "\nb", }, { Type: token.MappingValueType, @@ -1811,15 +1805,16 @@ a: !!binary | }, }, { - YAML: `a: 'single quoted' - b: 'value map'`, + YAML: ` +a: 'single quoted' +b: 'value map'`, Tokens: token.Tokens{ { Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "a", - Origin: "a", + Origin: "\na", }, { Type: token.MappingValueType, @@ -1839,8 +1834,8 @@ a: !!binary | Type: token.StringType, CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, - Value: "\t\tb", - Origin: "\n\t\tb", + Value: "b", + Origin: "\nb", }, { Type: token.MappingValueType, @@ -3176,6 +3171,18 @@ a: |invalid`, name: "use reserved character `", src: "key: [`val]", }, + { + name: "use tab character as indent", + src: " a: b", + }, + { + name: "use tab character as indent in literal", + src: ` +a: | + b + c +`, + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { diff --git a/parser/parser_test.go b/parser/parser_test.go index 6bb85683..189b9701 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -883,7 +883,7 @@ a: - f: g h: i # comment 4 - `, +`, ` - b: c d: e diff --git a/path_test.go b/path_test.go index 8a4d6b50..ddca8bc7 100644 --- a/path_test.go +++ b/path_test.go @@ -644,7 +644,7 @@ doc: - value1 - value2 other: value3 - ` +` path, err := yaml.PathString("$.doc.map[0]") if err != nil { log.Fatal(err) @@ -662,7 +662,6 @@ doc: // ^ // 8 | - value2 // 9 | other: value3 - // 10 | } func ExamplePath_PathString() { diff --git a/scanner/context.go b/scanner/context.go index 7c5517d8..015a66c2 100644 --- a/scanner/context.go +++ b/scanner/context.go @@ -182,7 +182,7 @@ func (c *Context) addToken(tk *token.Token) { } func (c *Context) addBuf(r rune) { - if len(c.buf) == 0 && r == ' ' { + if len(c.buf) == 0 && (r == ' ' || r == '\t') { return } c.buf = append(c.buf, r) diff --git a/scanner/scanner.go b/scanner/scanner.go index f1d27743..50ae83b3 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -183,6 +183,11 @@ func (s *Scanner) updateIndent(ctx *Context, c rune) { s.indentNum++ return } + if s.isFirstCharAtLine && c == '\t' { + // found tab indent. + // In this case, scanTab returns error. + return + } if !s.isFirstCharAtLine { s.indentState = IndentStateKeep return @@ -593,6 +598,13 @@ func (s *Scanner) scanDocument(ctx *Context, c rune) error { } else if s.isFirstCharAtLine && c == ' ' { ctx.addDocumentIndent(s.column) s.progressColumn(ctx, 1) + } else if s.isFirstCharAtLine && c == '\t' { + err := ErrInvalidToken( + "found a tab character where an indentation space is expected", + token.Invalid(string(ctx.obuf), s.pos()), + ) + s.progressColumn(ctx, 1) + return err } else { ctx.updateDocumentLineIndentColumn(s.column) if ctx.docFirstLineIndentColumn > 0 { @@ -735,7 +747,7 @@ func (s *Scanner) scanFlowEntry(ctx *Context, c rune) bool { func (s *Scanner) scanMapDelim(ctx *Context) bool { nc := ctx.nextChar() - if s.startedFlowMapNum <= 0 && nc != ' ' && !s.isNewLineChar(nc) && !ctx.isNextEOS() { + if s.startedFlowMapNum <= 0 && nc != ' ' && nc != '\t' && !s.isNewLineChar(nc) && !ctx.isNextEOS() { return false } @@ -1009,10 +1021,22 @@ func (s *Scanner) scanReservedChar(ctx *Context, c rune) error { return err } +func (s *Scanner) scanTab(ctx *Context, c rune) error { + if !s.isFirstCharAtLine { + return nil + } + + ctx.addBuf(c) + ctx.addOriginBuf(c) + err := ErrInvalidToken("found character '\t' that cannot start any token", token.Invalid(string(ctx.obuf), s.pos())) + s.progressColumn(ctx, 1) + ctx.clear() + return err +} + func (s *Scanner) scan(ctx *Context) error { for ctx.next() { c := ctx.currentChar() - // First, change the IndentState. // If the target character is the first character in a line, IndentState is Up/Down/Equal state. // The second and subsequent letters are Keep. @@ -1140,6 +1164,10 @@ func (s *Scanner) scan(ctx *Context) error { if err := s.scanReservedChar(ctx, c); err != nil { return err } + case '\t': + if err := s.scanTab(ctx, c); err != nil { + return err + } } ctx.addBuf(c) ctx.addOriginBuf(c) From 58941edd36f43df776b22771309ccbea56554d9a Mon Sep 17 00:00:00 2001 From: Masaaki Goshima Date: Wed, 13 Nov 2024 14:59:18 +0900 Subject: [PATCH 2/2] ignore lint error --- lexer/lexer_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go index c7be8aa1..7b678c4b 100644 --- a/lexer/lexer_test.go +++ b/lexer/lexer_test.go @@ -166,6 +166,7 @@ func TestTokenize(t *testing.T) { CharacterType: token.CharacterTypeMiscellaneous, Indicator: token.NotIndicator, Value: "a", + //nolint: gci Origin: " a", }, }, @@ -3173,6 +3174,7 @@ a: |invalid`, }, { name: "use tab character as indent", + //nolint: gci src: " a: b", }, {