Skip to content

Commit

Permalink
Merge pull request #28 from orlandos-nl/bugfix/jo/unicode-decoding
Browse files Browse the repository at this point in the history
Fix decoding escaped unicode sequences
  • Loading branch information
Joannis authored May 25, 2023
2 parents 07e5de4 + 9ddac85 commit 92004af
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 31 deletions.
100 changes: 72 additions & 28 deletions Sources/IkigaJSON/Core/Bounds.swift
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ internal struct Bounds {
///
/// - see: `makeStringFromData` for more information
func makeString(from pointer: UnsafePointer<UInt8>, escaping: Bool, unicode: Bool) -> String? {
if let data = makeStringData(from: pointer, escaping: escaping, unicode: unicode) {
if let data = try? makeStringData(from: pointer, escaping: escaping, unicode: unicode) {
return String(data: data, encoding: .utf8)
}

Expand All @@ -36,25 +36,34 @@ internal struct Bounds {
///
/// If `escaping` is false, the string is assumed unescaped and no additional effort will be put
/// towards unescaping.
func makeStringData(from pointer: UnsafePointer<UInt8>, escaping: Bool, unicode: Bool) -> Data? {
func makeStringData(from pointer: UnsafePointer<UInt8>, escaping: Bool, unicode: Bool) throws -> Data? {
var data = Data(bytes: pointer + Int(offset), count: Int(length))

// If we can't take a shortcut by decoding immediately thanks to an escaping character
if escaping || unicode {
var length = Int(self.length)
var i = 0
var unicodes = [UInt16]()

/// Converts the UTF-16 code units collected in `unicodes` to UTF-8, splices those bytes into
/// `data` at the cursor `i`, and empties `unicodes`. Does nothing when `unicodes` is empty.
///
/// Code units are flushed together rather than one by one so that consecutive `\u` escapes
/// (for example the two halves of a surrogate pair) are converted as one string.
func flushUnicodes() {
    guard !unicodes.isEmpty else {
        return
    }

    let character = String(utf16CodeUnits: unicodes, count: unicodes.count)
    let encoded = Array(character.utf8)
    data.insert(contentsOf: encoded, at: i)
    unicodes.removeAll(keepingCapacity: true)

    // The buffer just grew by `encoded.count` bytes. Grow the scan window by the same amount,
    // otherwise the trailing bytes of the string are never inspected for escapes, and step the
    // cursor over the inserted bytes so already-decoded output is not scanned again as input.
    i += encoded.count
    length += encoded.count
}

next: while i < length {
defer {
i = i &+ 1
}

let byte = data[i]

unescape: if escaping {
// If this character is not a backslash or this was the last character
// We don't need to unescape the next character
if byte != .backslash || i &+ 1 >= length {
// Flush unprocessed unicodes and move past this character
flushUnicodes()
i = i &+ 1
break unescape
}

Expand All @@ -64,32 +73,59 @@ internal struct Bounds {

switch data[i] {
case .backslash, .solidus, .quote:
continue next // just removal needed
// just removal needed
flushUnicodes()

// Move past this character
i = i &+ 1

continue next
case .u:
// `\u` indicates a unicode character
data.remove(at: i)
length = length &- 1
decodeUnicode(from: &data, offset: i, length: &length)
let unicode = try decodeUnicode(from: &data, offset: &i, length: &length)
unicodes.append(unicode)

// Continue explicitly, so that we do not trigger the unicode 'flush' flow
continue next
case .t:
data[i] = .tab
// Move past this character
i = i &+ 1
case .r:
data[i] = .carriageReturn
// Move past this character
i = i &+ 1
case .n:
data[i] = .newLine
// Move past this character
i = i &+ 1
case .f: // form feed, will just be passed on
return nil
case .b: // backspace, will just be passed on
return nil
default:
// Try unicode decoding
break unescape
throw JSONParserError.unexpectedEscapingToken
}


// 'flush' the accumulated `unicodes` to the buffer
flushUnicodes()

continue next
} else {
// End of unicodes, flush them
flushUnicodes()

// Move past this character
i = i &+ 1
}
}

// End of string, flush unicode
flushUnicodes()
}

return data
}

Expand Down Expand Up @@ -122,21 +158,29 @@ internal struct Bounds {
}
}

// FIXME: Test, probably broken still
fileprivate func decodeUnicode(from data: inout Data, offset: Int, length: inout Int) {
var offset = offset

return data.withUnsafeMutableBytes { buffer in
let bytes = buffer.bindMemory(to: UInt8.self).baseAddress!.advanced(by: offset)

while offset < length {
guard let base = bytes[offset].decodeHex(), let secondHex = bytes[offset &+ 1].decodeHex() else {
return
}

bytes.pointee = (base << 4) &+ secondHex
length = length &- 1
offset = offset &+ 2
}
/// Error thrown by `decodeUnicode` when a `\u` escape is not followed by four hexadecimal digits.
struct UTF8ParsingError: Error {}

/// Consumes the four hexadecimal digits of a `\uXXXX` escape and returns the UTF-16 code unit
/// they spell out.
///
/// - Parameters:
///   - data: Buffer being unescaped. On success the four digits starting at `offset` are
///     removed from it.
///   - offset: Index of the first hex digit. It is not modified here; after the removal it
///     refers to whatever byte followed the digits.
///   - length: Number of bytes the caller treats as string content. Reduced by four on success.
/// - Returns: The decoded 16-bit code unit. This can be one half of a surrogate pair, so the
///   caller is responsible for combining consecutive units.
/// - Throws: `UTF8ParsingError` when fewer than four bytes remain or one of them is not a hex
///   digit. Nothing is removed from `data` and `length` is unchanged in that case.
fileprivate func decodeUnicode(from data: inout Data, offset: inout Int, length: inout Int) throws -> UInt16 {
    let hexCharacters = 4
    guard length - offset >= hexCharacters else {
        throw UTF8ParsingError()
    }

    let digits = offset ..< offset + hexCharacters

    // Validate every digit before touching the buffer, so a malformed escape cannot leave
    // `data` shortened while `length` still describes the old size.
    var unicode: UInt16 = 0
    for index in digits {
        guard let nibble = data[index].decodeHex() else {
            throw UTF8ParsingError()
        }

        // Most significant digit first. Assumes `decodeHex()` yields a value in 0...15.
        unicode = (unicode << 4) | UInt16(nibble)
    }

    // One removal shifts the tail of the buffer once instead of four times.
    data.removeSubrange(digits)
    length -= hexCharacters

    return unicode
}
2 changes: 1 addition & 1 deletion Sources/IkigaJSON/Core/JSONDescription.swift
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,7 @@ extension JSONDescription {
let bounds = dataBounds(atIndexOffset: offset)
let escaping = self.type(atOffset: offset) == .stringWithEscaping

if var stringData = bounds.makeStringData(from: buffer, escaping: escaping, unicode: unicode) {
if var stringData = try? bounds.makeStringData(from: buffer, escaping: escaping, unicode: unicode) {
if convertingSnakeCasing {
convertSnakeCasing(for: &stringData)
}
Expand Down
7 changes: 5 additions & 2 deletions Sources/IkigaJSON/Errors.swift
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,14 @@ public enum JSONParserError: Error, CustomStringConvertible {
return "Missing token '\(Character(.init(uInt8)))': \(reason)"
case .unexpectedToken(_, _, let uInt8, let reason):
return "Unexpected token '\(Character(.init(uInt8)))': \(reason)"
case .unexpectedEscapingToken:
return "Unexpected escaping token"
}
}

public var column: Int? {
switch self {
case .expectedObject, .expectedArray:
case .expectedObject, .expectedArray, .unexpectedEscapingToken:
return nil
case .internalStateError(_, column: let column):
return column
Expand All @@ -86,7 +88,7 @@ public enum JSONParserError: Error, CustomStringConvertible {

public var line: Int? {
switch self {
case .expectedObject, .expectedArray:
case .expectedObject, .expectedArray, .unexpectedEscapingToken:
return nil
case .internalStateError(line: let line, _):
return line
Expand Down Expand Up @@ -129,6 +131,7 @@ public enum JSONParserError: Error, CustomStringConvertible {
case invalidObjectIdLiteral(line: Int, column: Int)
case missingToken(line: Int, column: Int, token: UInt8, reason: Reason)
case unexpectedToken(line: Int, column: Int, token: UInt8, reason: Reason)
case unexpectedEscapingToken
}

internal struct TypeConversionError<F: FixedWidthInteger>: Error {
Expand Down
16 changes: 16 additions & 0 deletions Tests/IkigaJSONTests/JSONTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,22 @@ final class IkigaJSONTests: XCTestCase {

XCTAssertNoThrow(try IkigaJSONDecoder().decode([UInt64].self, from: json))
}

/// Checks that `\uXXXX` escapes inside JSON strings decode to the characters they denote,
/// both when the escapes make up the whole value and when plain text follows them.
func testEscapedUnicode() throws {
    // Values consisting solely of escapes; "complex" joins two surrogate pairs with U+200D.
    let escapesOnly = Data(#"{"simple":"\u00DF", "complex": "\ud83d\udc69\u200d\ud83d\udc69"}"#.utf8)
    let escapesOnlyExpected = ["simple": "\u{00DF}", "complex": "\u{1F469}\u{200D}\u{1F469}"]
    let escapesOnlyDecoded = try IkigaJSONDecoder().decode([String: String].self, from: escapesOnly)
    XCTAssertEqual(escapesOnlyDecoded, escapesOnlyExpected)

    // Escapes followed by unescaped text in the same string.
    let escapesThenText = Data(#"{"simple":"\u00DFhello world", "complex": "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC67hello world"}"#.utf8)
    let escapesThenTextExpected = ["simple": "ßhello world", "complex": "👩‍👩‍👧‍👧hello world"]
    let escapesThenTextDecoded = try IkigaJSONDecoder().decode([String: String].self, from: escapesThenText)
    XCTAssertEqual(escapesThenTextDecoded, escapesThenTextExpected)
}

func testPropertyWrapper() throws {
@propertyWrapper struct FluentPropertyTest<Value: Codable & Equatable>: Codable, Equatable {
Expand Down

0 comments on commit 92004af

Please sign in to comment.