Skip to content

Commit 037992f

Browse files
committed
Prioritize BOM check in encoding detection
1 parent 30443c7 commit 037992f

File tree

3 files changed

+44
-72
lines changed

3 files changed

+44
-72
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
### Improvements
1313

14-
- Improve the algorithm to scan encoding declarations.
14+
- Improve the algorithm of text encoding detection.
1515
- Update the C++ syntax to include the .cu and .cuh filename extensions.
1616

1717

Packages/EditorCore/Sources/FileEncoding/String+Encoding.swift

Lines changed: 18 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ public extension String {
3838

3939
struct DetectionOptions: Sendable {
4040

41-
/// The list of encodings to test the encoding.
41+
/// The list of encodings to test decoding.
4242
public var candidates: [String.Encoding]
4343

4444
/// The text encoding read from the file's extended attributes.
@@ -115,14 +115,25 @@ public extension String {
115115

116116
extension String {
117117

118-
/// Reads string from data by detecting the text encoding automatically.
118+
/// Reads string from data by detecting the text encoding based on the detection options.
119119
///
120120
/// - Parameters:
121121
/// - data: The data to decode.
122122
/// - options: The options for encoding detection.
123123
/// - Returns: The decoded string and used encoding.
124+
/// - Throws: `CocoaError(.fileReadUnknownStringEncoding)`
124125
static func string(data: Data, options: String.DetectionOptions) throws(CocoaError) -> (String, String.Encoding) {
125126

127+
// check BOMs
128+
for bom in Unicode.BOM.allCases {
129+
if options.candidates.contains(bom.encoding),
130+
data.starts(with: bom.sequence),
131+
let string = String(bomCapableData: data, encoding: bom.encoding)
132+
{
133+
return (string, bom.encoding)
134+
}
135+
}
136+
126137
// try interpreting with xattr encoding
127138
if let xattrEncoding = options.xattrEncoding {
128139
// just trust xattr encoding if the content is empty
@@ -140,45 +151,11 @@ extension String {
140151
return (string, encoding)
141152
}
142153

143-
// detect encoding from data
144-
var usedEncoding: String.Encoding?
145-
let string = try String(data: data, suggestedEncodings: options.candidates, usedEncoding: &usedEncoding)
146-
if let encoding = usedEncoding {
147-
return (string, encoding)
148-
}
149-
150-
throw CocoaError(.fileReadUnknownStringEncoding)
151-
}
152-
153-
154-
/// Returns a `String` initialized by converting given `data` into Unicode characters using an intelligent encoding detection.
155-
///
156-
/// - Parameters:
157-
/// - data: The data object containing the string data.
158-
/// - suggestedEncodings: The prioritized list of encoding candidates.
159-
/// - usedEncoding: The encoding used to interpret the data.
160-
/// - Throws: `CocoaError(.fileReadUnknownStringEncoding)`
161-
init(data: Data, suggestedEncodings: [String.Encoding], usedEncoding: inout String.Encoding?) throws(CocoaError) {
162-
163-
// check BOMs
164-
for bom in Unicode.BOM.allCases {
165-
guard
166-
data.starts(with: bom.sequence),
167-
let string = String(bomCapableData: data, encoding: bom.encoding)
168-
else { continue }
169-
170-
usedEncoding = bom.encoding
171-
self = string
172-
return
173-
}
174-
175-
// try encodings in order from the top of the encoding list
176-
for encoding in suggestedEncodings {
177-
guard let string = String(data: data, encoding: encoding) else { continue }
178-
179-
usedEncoding = encoding
180-
self = string
181-
return
154+
// try applying encodings in order from the top of the candidates
155+
for encoding in options.candidates {
156+
if let string = String(data: data, encoding: encoding) {
157+
return (string, encoding)
158+
}
182159
}
183160

184161
throw CocoaError(.fileReadUnknownStringEncoding)

Packages/EditorCore/Tests/FileEncodingTests/EncodingDetectionTests.swift

Lines changed: 25 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,7 @@ struct EncodingDetectionTests {
4646
}
4747
#expect(String(bomCapableData: data, encoding: .utf8) == "0")
4848

49-
var encoding: String.Encoding?
50-
let string = try self.encodedStringForFileName("UTF-8 BOM", usedEncoding: &encoding)
49+
let (string, encoding) = try self.encodedStringForFileName("UTF-8 BOM")
5150

5251
#expect(string == "0")
5352
#expect(encoding == .utf8)
@@ -89,8 +88,7 @@ struct EncodingDetectionTests {
8988

9089
@Test func utf16() throws {
9190

92-
var encoding: String.Encoding?
93-
let string = try self.encodedStringForFileName("UTF-16", usedEncoding: &encoding)
91+
let (string, encoding) = try self.encodedStringForFileName("UTF-16")
9492

9593
#expect(string == "0")
9694
#expect(encoding == .utf16)
@@ -99,8 +97,7 @@ struct EncodingDetectionTests {
9997

10098
@Test func utf32() throws {
10199

102-
var encoding: String.Encoding?
103-
let string = try self.encodedStringForFileName("UTF-32", usedEncoding: &encoding)
100+
let (string, encoding) = try self.encodedStringForFileName("UTF-32")
104101

105102
#expect(string == "0")
106103
#expect(encoding == .utf32)
@@ -112,52 +109,42 @@ struct EncodingDetectionTests {
112109
let data = try self.dataForFileName("ISO 2022-JP")
113110
let encodings: [String.Encoding] = [.iso2022JP, .utf16]
114111

115-
var encoding: String.Encoding?
116-
let string = try String(data: data, suggestedEncodings: encodings, usedEncoding: &encoding)
112+
let (string, encoding) = try String.string(data: data, options: .init(candidates: encodings))
117113

118114
#expect(string == "dog犬")
119115
#expect(encoding == .iso2022JP)
120116
}
121117

122118

123-
@Test func utf8() throws {
119+
@Test func emptySuggestion() throws {
124120

125121
let data = try self.dataForFileName("UTF-8")
126122

127-
var encoding: String.Encoding?
128123
#expect(throws: CocoaError(.fileReadUnknownStringEncoding)) {
129-
try String(data: data, suggestedEncodings: [], usedEncoding: &encoding)
124+
try String.string(data: data, options: .init(candidates: []))
130125
}
131-
#expect(encoding == nil)
132126
}
133127

134128

135-
@Test func suggestedEncoding() throws {
129+
@Test func utf8() throws {
136130

137131
let data = try self.dataForFileName("UTF-8")
138132

139-
var encoding: String.Encoding?
140133
let invalidEncoding = String.Encoding(cfEncoding: kCFStringEncodingInvalidId)
141-
let string = try String(data: data, suggestedEncodings: [invalidEncoding, .utf8], usedEncoding: &encoding)
134+
let (string, encoding) = try String.string(data: data, options: .init(candidates: [invalidEncoding, .utf8, .utf16]))
142135

143136
#expect(string == "0")
144137
#expect(encoding == .utf8)
145138
}
146139

147140

148-
@Test func emptyData() {
141+
@Test func emptyData() throws {
149142

150143
let data = Data()
144+
let (string, encoding) = try String.string(data: data, options: .init(candidates: [.shiftJIS]))
151145

152-
var encoding: String.Encoding?
153-
var string: String?
154-
155-
#expect(throws: CocoaError(.fileReadUnknownStringEncoding)) {
156-
string = try String(data: data, suggestedEncodings: [], usedEncoding: &encoding)
157-
}
158-
159-
#expect(string == nil)
160-
#expect(encoding == nil)
146+
#expect(string.isEmpty)
147+
#expect(encoding == .shiftJIS)
161148
#expect(!data.starts(with: Unicode.BOM.utf8.sequence))
162149
}
163150

@@ -291,11 +278,19 @@ private extension String.Encoding {
291278

292279
private extension EncodingDetectionTests {
293280

294-
func encodedStringForFileName(_ fileName: String, usedEncoding: inout String.Encoding?) throws -> String {
295-
296-
let data = try self.dataForFileName(fileName)
297-
298-
return try String(data: data, suggestedEncodings: [], usedEncoding: &usedEncoding)
281+
func encodedStringForFileName(_ fileName: String) throws -> (String, String.Encoding) {
282+
283+
try String.string(
284+
data: try self.dataForFileName(fileName),
285+
options: .init(candidates: [
286+
.utf8,
287+
.utf16,
288+
.utf16BigEndian,
289+
.utf16LittleEndian,
290+
.utf32,
291+
.utf32BigEndian,
292+
.utf32LittleEndian,
293+
]))
299294
}
300295

301296

0 commit comments

Comments
 (0)