From 80fe0ae75835556fc41daef16ad0dffd4a0563fc Mon Sep 17 00:00:00 2001 From: Steve Hoeksema Date: Tue, 9 Apr 2024 12:18:50 +1200 Subject: [PATCH] Add support for escaped UTF-16 surrogate pairs I have followed the convention above that error handling is TODO, but I can start implementing it if you like. References: * https://russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm * https://mathiasbynens.be/notes/javascript-unicode * https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF_(surrogates) --- decode_test.go | 12 ++++++++++++ scanner/scanner.go | 28 ++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/decode_test.go b/decode_test.go index cabfd33c..23229ef5 100644 --- a/decode_test.go +++ b/decode_test.go @@ -437,6 +437,18 @@ func TestDecoder(t *testing.T) { `"1": "a\x2Fb\u002Fc\U0000002Fd"`, map[interface{}]interface{}{"1": `a/b/c/d`}, }, + { + `"\ud83e\udd23"`, + "🤣", + }, + { + `"\uD83D\uDE00\uD83D\uDE01"`, + "😀😁", + }, + { + `"\uD83D\uDE00a\uD83D\uDE01"`, + "😀a😁", + }, { "'1': \"2\\n3\"", map[interface{}]interface{}{"1": "2\n3"}, diff --git a/scanner/scanner.go b/scanner/scanner.go index b0eac48d..865cfb84 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -391,6 +391,34 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) { return } codeNum := hexRunesToInt(src[idx+2 : idx+6]) + + // Handle surrogate pairs + if codeNum >= 0xD800 && codeNum <= 0xDBFF { + high := codeNum + + if idx+11 >= size { + // TODO: need to return error + //err = xerrors.New("not enough characters for surrogate pair") + return + } + + if src[idx+6] != '\\' || src[idx+7] != 'u' { + // TODO: need to return error + //err = xerrors.New("expected escape code after high surrogate") + return + } + + low := hexRunesToInt(src[idx+8 : idx+12]) + if low < 0xDC00 || low > 0xDFFF { + // TODO: need to return error + //err = xerrors.New("expected low surrogate after high surrogate") + return + } + + codeNum = ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000 + idx += 6 + } + value = append(value, rune(codeNum)) idx += 5 continue