Skip to content

Commit

Permalink
support utf-16 surrogate pair (#564)
Browse files Browse the repository at this point in the history
  • Loading branch information
goccy authored Dec 2, 2024
1 parent bf03d4d commit 2ab584e
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 14 deletions.
12 changes: 12 additions & 0 deletions decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1117,6 +1117,18 @@ c:
`,
[]string{"Fun with \\", "\" \u0007 \b \u001b \f", "\n \r \t \u000b \u0000", "\u0020 \u00a0 \u0085 \u2028 \u2029 A A A"},
},
{
`"\ud83e\udd23"`,
"🤣",
},
{
`"\uD83D\uDE00\uD83D\uDE01"`,
"😀😁",
},
{
`"\uD83D\uDE00a\uD83D\uDE01"`,
"😀a😁",
},
}
for _, test := range tests {
t.Run(test.source, func(t *testing.T) {
Expand Down
20 changes: 20 additions & 0 deletions lexer/lexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3233,6 +3233,26 @@ a: |
c
`,
},
{
name: "invalid UTF-16 character",
src: `"\u00"`,
},
{
name: "invalid UTF-16 surrogate pair length",
src: `"\ud800"`,
},
{
name: "invalid UTF-16 low surrogate prefix",
src: `"\ud800\v"`,
},
{
name: "invalid UTF-16 low surrogate",
src: `"\ud800\u0000"`,
},
{
name: "invalid UTF-32 character",
src: `"\U0000"`,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
Expand Down
70 changes: 56 additions & 14 deletions scanner/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -439,25 +439,67 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (*token.Token, error) {
value = append(value, rune(codeNum))
}
case 'u':
// a \uXXXX escape needs 5 more characters after the backslash at idx ('u' plus 4 hex digits).
if idx+5 >= size {
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, nextChar)
} else {
progress = 5
codeNum := hexRunesToInt(src[idx+2 : idx+progress+1])
value = append(value, rune(codeNum))
return nil, ErrInvalidToken(
token.Invalid(
"not enough length for escaped UTF-16 character",
string(ctx.obuf), s.pos(),
),
)
}
progress = 5
codeNum := hexRunesToInt(src[idx+2 : idx+6])

// handle surrogate pairs.
if codeNum >= 0xD800 && codeNum <= 0xDBFF {
high := codeNum

// a \uXXXX\uXXXX surrogate pair needs 11 more characters after the backslash at idx.
if idx+11 >= size {
return nil, ErrInvalidToken(
token.Invalid(
"not enough length for escaped UTF-16 surrogate pair",
string(ctx.obuf), s.pos(),
),
)
}

if src[idx+6] != '\\' || src[idx+7] != 'u' {
return nil, ErrInvalidToken(
token.Invalid(
"found unexpected character after high surrogate for UTF-16 surrogate pair",
string(ctx.obuf), s.pos(),
),
)
}

low := hexRunesToInt(src[idx+8 : idx+12])
if low < 0xDC00 || low > 0xDFFF {
return nil, ErrInvalidToken(
token.Invalid(
"found unexpected low surrogate after high surrogate",
string(ctx.obuf), s.pos(),
),
)
}
codeNum = ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
progress += 6
}
value = append(value, rune(codeNum))
case 'U':
// a \UXXXXXXXX escape needs 9 more characters after the backslash at idx ('U' plus 8 hex digits).
if idx+9 >= size {
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, nextChar)
} else {
progress = 9
codeNum := hexRunesToInt(src[idx+2 : idx+progress+1])
value = append(value, rune(codeNum))
return nil, ErrInvalidToken(
token.Invalid(
"not enough length for escaped UTF-32 character",
string(ctx.obuf), s.pos(),
),
)
}
progress = 9
codeNum := hexRunesToInt(src[idx+2 : idx+10])
value = append(value, rune(codeNum))
case '\n':
isFirstLineChar = true
isNewLine = true
Expand Down

0 comments on commit 2ab584e

Please sign in to comment.