Skip to content

Commit 4f5cf65

Browse files
committed
stream: fix UTF-8 character corruption in fast-utf8-stream
Fix releaseWritingBuf() to correctly handle partial writes that split multi-byte UTF-8 characters. The previous implementation incorrectly converted byte counts to character counts, causing: - 3-byte characters (CJK) to be silently dropped - 4-byte characters (emoji) to leave lone surrogates in the buffer The fix backs up from the byte position to find a valid UTF-8 character boundary by checking for continuation bytes (pattern 10xxxxxx), then decodes the properly-aligned bytes to get the correct character count. Also fixes a typo where this._asyncDrainScheduled was used instead of the private field this.#asyncDrainScheduled. Fixes: #61744
1 parent f77a709 commit 4f5cf65

File tree

2 files changed

+340
-6
lines changed

2 files changed

+340
-6
lines changed

lib/internal/streams/fast-utf8-stream.js

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ class Utf8Stream extends EventEmitter {
237237

238238
this.on('newListener', (name) => {
239239
if (name === 'drain') {
240-
this._asyncDrainScheduled = false;
240+
this.#asyncDrainScheduled = false;
241241
}
242242
});
243243

@@ -894,11 +894,23 @@ class Utf8Stream extends EventEmitter {
894894
* @returns {{writingBuf: string | Buffer, len: number}} released writingBuf and length
895895
*/
896896
function releaseWritingBuf(writingBuf, len, n) {
897-
// if Buffer.byteLength is equal to n, that means writingBuf contains no multi-byte character
898-
if (typeof writingBuf === 'string' && Buffer.byteLength(writingBuf) !== n) {
899-
// Since the fs.write callback parameter `n` means how many bytes the passed of string
900-
// We calculate the original string length for avoiding the multi-byte character issue
901-
n = Buffer.from(writingBuf).subarray(0, n).toString().length;
897+
if (typeof writingBuf === 'string') {
898+
const byteLength = Buffer.byteLength(writingBuf);
899+
if (byteLength !== n) {
900+
// Since fs.write returns the number of bytes written, we need to find
901+
// how many complete characters fit within those n bytes.
902+
// If a partial write splits a multi-byte UTF-8 character, we must back up
903+
// to the start of that character to avoid data corruption.
904+
const buf = Buffer.from(writingBuf);
905+
// Back up from position n to find a valid UTF-8 character boundary.
906+
// UTF-8 continuation bytes have the pattern 10xxxxxx (0x80-0xBF).
907+
// We need to find the start of the character that was split.
908+
while (n > 0 && (buf[n] & 0xC0) === 0x80) {
909+
n--;
910+
}
911+
// Decode the properly-aligned bytes to get the character count.
912+
n = buf.subarray(0, n).toString().length;
913+
}
902914
}
903915
len = MathMax(len - n, 0);
904916
writingBuf = writingBuf.slice(n);
Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
'use strict';
2+
3+
// Tests for UTF-8 character preservation when partial writes split multi-byte characters.
4+
// See: https://github.com/nodejs/node/issues/61744
5+
6+
const common = require('../common');
7+
const tmpdir = require('../common/tmpdir');
8+
const assert = require('node:assert');
9+
const {
10+
openSync,
11+
write,
12+
writeSync,
13+
} = require('node:fs');
14+
const { Utf8Stream } = require('node:fs');
15+
const { join } = require('node:path');
16+
const { isMainThread } = require('node:worker_threads');
17+
18+
tmpdir.refresh();
19+
if (isMainThread) {
20+
process.umask(0o000);
21+
}
22+
23+
let fileCounter = 0;
24+
25+
function getTempFile() {
26+
return join(tmpdir.path, `fastutf8stream-partial-${process.pid}-${Date.now()}-${fileCounter++}.log`);
27+
}
28+
29+
runTests(false);
30+
runTests(true);
31+
32+
function runTests(sync) {
33+
// Test 1: Partial write splitting a 3-byte UTF-8 character (CJK)
34+
// "abc中def" where "中" is 3 bytes (E4 B8 AD)
35+
// Simulate partial write of 4 bytes: "abc" (3 bytes) + first byte of "中"
36+
// The remaining buffer should be "中def" (not "def")
37+
{
38+
const dest = getTempFile();
39+
const fd = openSync(dest, 'w');
40+
41+
let firstWrite = true;
42+
const writtenChunks = [];
43+
const fsOverride = {};
44+
45+
if (sync) {
46+
fsOverride.writeSync = common.mustCall((...args) => {
47+
const data = args[1];
48+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
49+
if (firstWrite) {
50+
firstWrite = false;
51+
// Simulate partial write: only 4 bytes written out of 9
52+
// This splits the 3-byte "中" character
53+
return 4;
54+
}
55+
return writeSync(...args);
56+
}, 2);
57+
} else {
58+
fsOverride.write = common.mustCall((...args) => {
59+
const data = args[1];
60+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
61+
const callback = args[args.length - 1];
62+
if (firstWrite) {
63+
firstWrite = false;
64+
// Simulate partial write: only 4 bytes written out of 9
65+
process.nextTick(callback, null, 4);
66+
return;
67+
}
68+
return write(...args);
69+
}, 2);
70+
}
71+
72+
const stream = new Utf8Stream({
73+
fd,
74+
sync,
75+
minLength: 0,
76+
fs: fsOverride,
77+
});
78+
79+
stream.on('ready', common.mustCall(() => {
80+
stream.write('abc中def');
81+
stream.end();
82+
83+
stream.on('finish', common.mustCall(() => {
84+
// Verify the second chunk contains the preserved CJK character
85+
assert.strictEqual(writtenChunks.length, 2);
86+
assert.strictEqual(writtenChunks[0], 'abc中def'); // First attempt
87+
assert.strictEqual(writtenChunks[1], '中def'); // Retry with preserved char
88+
}));
89+
}));
90+
}
91+
92+
// Test 2: Partial write splitting a 4-byte UTF-8 character (emoji)
93+
// "hello🌍world" where "🌍" is 4 bytes (F0 9F 8C 8D)
94+
// Simulate partial write of 7 bytes: "hello" (5 bytes) + first 2 bytes of "🌍"
95+
// The remaining buffer should be "🌍world" (not a lone surrogate + "world")
96+
{
97+
const dest = getTempFile();
98+
const fd = openSync(dest, 'w');
99+
100+
let firstWrite = true;
101+
const writtenChunks = [];
102+
const fsOverride = {};
103+
104+
if (sync) {
105+
fsOverride.writeSync = common.mustCall((...args) => {
106+
const data = args[1];
107+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
108+
if (firstWrite) {
109+
firstWrite = false;
110+
// Simulate partial write: only 7 bytes written
111+
return 7;
112+
}
113+
return writeSync(...args);
114+
}, 2);
115+
} else {
116+
fsOverride.write = common.mustCall((...args) => {
117+
const data = args[1];
118+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
119+
const callback = args[args.length - 1];
120+
if (firstWrite) {
121+
firstWrite = false;
122+
process.nextTick(callback, null, 7);
123+
return;
124+
}
125+
return write(...args);
126+
}, 2);
127+
}
128+
129+
const stream = new Utf8Stream({
130+
fd,
131+
sync,
132+
minLength: 0,
133+
fs: fsOverride,
134+
});
135+
136+
stream.on('ready', common.mustCall(() => {
137+
stream.write('hello🌍world');
138+
stream.end();
139+
140+
stream.on('finish', common.mustCall(() => {
141+
assert.strictEqual(writtenChunks.length, 2);
142+
assert.strictEqual(writtenChunks[0], 'hello🌍world'); // First attempt
143+
assert.strictEqual(writtenChunks[1], '🌍world'); // Retry with preserved emoji
144+
145+
// Verify no lone surrogates in the retry chunk
146+
const retryChunk = writtenChunks[1];
147+
for (let i = 0; i < retryChunk.length; i++) {
148+
const code = retryChunk.charCodeAt(i);
149+
if (code >= 0xD800 && code <= 0xDBFF) {
150+
// High surrogate - next must be low surrogate
151+
const next = retryChunk.charCodeAt(i + 1);
152+
assert.ok(next >= 0xDC00 && next <= 0xDFFF,
153+
`Found lone high surrogate at position ${i}`);
154+
i++; // Skip the low surrogate we just verified
155+
} else if (code >= 0xDC00 && code <= 0xDFFF) {
156+
// Low surrogate without preceding high surrogate
157+
assert.fail(`Found lone low surrogate at position ${i}: 0x${code.toString(16)}`);
158+
}
159+
}
160+
}));
161+
}));
162+
}
163+
164+
// Test 3: Partial write at exactly 0 bytes (edge case)
165+
{
166+
const dest = getTempFile();
167+
const fd = openSync(dest, 'w');
168+
169+
let firstWrite = true;
170+
const writtenChunks = [];
171+
const fsOverride = {};
172+
173+
if (sync) {
174+
fsOverride.writeSync = common.mustCall((...args) => {
175+
const data = args[1];
176+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
177+
if (firstWrite) {
178+
firstWrite = false;
179+
return 0; // No bytes written
180+
}
181+
return writeSync(...args);
182+
}, 2);
183+
} else {
184+
fsOverride.write = common.mustCall((...args) => {
185+
const data = args[1];
186+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
187+
const callback = args[args.length - 1];
188+
if (firstWrite) {
189+
firstWrite = false;
190+
process.nextTick(callback, null, 0);
191+
return;
192+
}
193+
return write(...args);
194+
}, 2);
195+
}
196+
197+
const stream = new Utf8Stream({
198+
fd,
199+
sync,
200+
minLength: 0,
201+
fs: fsOverride,
202+
});
203+
204+
stream.on('ready', common.mustCall(() => {
205+
stream.write('中文');
206+
stream.end();
207+
208+
stream.on('finish', common.mustCall(() => {
209+
assert.strictEqual(writtenChunks.length, 2);
210+
assert.strictEqual(writtenChunks[0], '中文');
211+
assert.strictEqual(writtenChunks[1], '中文'); // Entire string retried
212+
}));
213+
}));
214+
}
215+
216+
// Test 4: Partial write splitting between characters (not mid-character)
217+
// This should work the same as before - no character preservation needed
218+
{
219+
const dest = getTempFile();
220+
const fd = openSync(dest, 'w');
221+
222+
let firstWrite = true;
223+
const writtenChunks = [];
224+
const fsOverride = {};
225+
226+
if (sync) {
227+
fsOverride.writeSync = common.mustCall((...args) => {
228+
const data = args[1];
229+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
230+
if (firstWrite) {
231+
firstWrite = false;
232+
// Write exactly 3 bytes ("abc"), which is a clean character boundary
233+
return 3;
234+
}
235+
return writeSync(...args);
236+
}, 2);
237+
} else {
238+
fsOverride.write = common.mustCall((...args) => {
239+
const data = args[1];
240+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
241+
const callback = args[args.length - 1];
242+
if (firstWrite) {
243+
firstWrite = false;
244+
process.nextTick(callback, null, 3);
245+
return;
246+
}
247+
return write(...args);
248+
}, 2);
249+
}
250+
251+
const stream = new Utf8Stream({
252+
fd,
253+
sync,
254+
minLength: 0,
255+
fs: fsOverride,
256+
});
257+
258+
stream.on('ready', common.mustCall(() => {
259+
stream.write('abc中def');
260+
stream.end();
261+
262+
stream.on('finish', common.mustCall(() => {
263+
assert.strictEqual(writtenChunks.length, 2);
264+
assert.strictEqual(writtenChunks[0], 'abc中def');
265+
assert.strictEqual(writtenChunks[1], '中def'); // Remaining after 3 bytes
266+
}));
267+
}));
268+
}
269+
270+
// Test 5: Single multi-byte character with partial write of 1 byte
271+
{
272+
const dest = getTempFile();
273+
const fd = openSync(dest, 'w');
274+
275+
let firstWrite = true;
276+
const writtenChunks = [];
277+
const fsOverride = {};
278+
279+
if (sync) {
280+
fsOverride.writeSync = common.mustCall((...args) => {
281+
const data = args[1];
282+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
283+
if (firstWrite) {
284+
firstWrite = false;
285+
// Write only 1 byte of a 3-byte character
286+
return 1;
287+
}
288+
return writeSync(...args);
289+
}, 2);
290+
} else {
291+
fsOverride.write = common.mustCall((...args) => {
292+
const data = args[1];
293+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
294+
const callback = args[args.length - 1];
295+
if (firstWrite) {
296+
firstWrite = false;
297+
process.nextTick(callback, null, 1);
298+
return;
299+
}
300+
return write(...args);
301+
}, 2);
302+
}
303+
304+
const stream = new Utf8Stream({
305+
fd,
306+
sync,
307+
minLength: 0,
308+
fs: fsOverride,
309+
});
310+
311+
stream.on('ready', common.mustCall(() => {
312+
stream.write('中');
313+
stream.end();
314+
315+
stream.on('finish', common.mustCall(() => {
316+
assert.strictEqual(writtenChunks.length, 2);
317+
assert.strictEqual(writtenChunks[0], '中');
318+
assert.strictEqual(writtenChunks[1], '中'); // Full character retried
319+
}));
320+
}));
321+
}
322+
}

0 commit comments

Comments
 (0)