stream: fix UTF-8 character corruption in fast-utf8-stream

mcollina · mcollina · commit 4f5cf6522421 · 2026-02-08T21:37:26.000+01:00
Fix releaseWritingBuf() to correctly handle partial writes that split multi-byte UTF-8 characters. The previous implementation incorrectly converted byte counts to character counts, which caused 3-byte characters (e.g. CJK) to be silently dropped and 4-byte characters (e.g. emoji) to leave lone surrogates in the buffer. The fix backs up from the reported byte position to a valid UTF-8 character boundary by skipping continuation bytes (pattern 10xxxxxx), then decodes the properly aligned bytes to get the correct character count. Also fixes a typo where this._asyncDrainScheduled was used instead of the private field this.#asyncDrainScheduled. Fixes: #61744
diff --git a/lib/internal/streams/fast-utf8-stream.js b/lib/internal/streams/fast-utf8-stream.js
@@ -237,7 +237,7 @@ class Utf8Stream extends EventEmitter {
 
     this.on('newListener', (name) => {
       if (name === 'drain') {
-        this._asyncDrainScheduled = false;
+        this.#asyncDrainScheduled = false;
       }
     });
 
@@ -894,11 +894,23 @@ class Utf8Stream extends EventEmitter {
  * @returns {{writingBuf: string | Buffer, len: number}} released writingBuf and length
  */
 function releaseWritingBuf(writingBuf, len, n) {
-  // if Buffer.byteLength is equal to n, that means writingBuf contains no multi-byte character
-  if (typeof writingBuf === 'string' && Buffer.byteLength(writingBuf) !== n) {
-    // Since the fs.write callback parameter `n` means how many bytes the passed of string
-    // We calculate the original string length for avoiding the multi-byte character issue
-    n = Buffer.from(writingBuf).subarray(0, n).toString().length;
+  if (typeof writingBuf === 'string') {
+    const byteLength = Buffer.byteLength(writingBuf);
+    if (byteLength !== n) {
+      // Since fs.write returns the number of bytes written, we need to find
+      // how many complete characters fit within those n bytes.
+      // If a partial write splits a multi-byte UTF-8 character, we must back up
+      // to the start of that character to avoid data corruption.
+      const buf = Buffer.from(writingBuf);
+      // Back up from position n to find a valid UTF-8 character boundary.
+      // UTF-8 continuation bytes have the pattern 10xxxxxx (0x80-0xBF).
+      // We need to find the start of the character that was split.
+      while (n > 0 && (buf[n] & 0xC0) === 0x80) {
+        n--;
+      }
+      // Decode the boundary-aligned prefix; its string length (UTF-16 code units) is what slice() needs.
+      n = buf.subarray(0, n).toString().length;
+    }
   }
   len = MathMax(len - n, 0);
   writingBuf = writingBuf.slice(n);
diff --git a/test/parallel/test-fastutf8stream-partial-write-utf8.js b/test/parallel/test-fastutf8stream-partial-write-utf8.js
@@ -0,0 +1,322 @@
+'use strict';
+
+// Tests for UTF-8 character preservation when partial writes split multi-byte characters.
+// See: https://github.com/nodejs/node/issues/61744
+
+const common = require('../common');
+const tmpdir = require('../common/tmpdir');
+const assert = require('node:assert');
+const {
+  openSync,
+  write,
+  writeSync,
+} = require('node:fs');
+const { Utf8Stream } = require('node:fs');
+const { join } = require('node:path');
+const { isMainThread } = require('node:worker_threads');
+
+tmpdir.refresh();
+if (isMainThread) {
+  process.umask(0o000);
+}
+
+let fileCounter = 0;
+
+function getTempFile() {
+  return join(tmpdir.path, `fastutf8stream-partial-${process.pid}-${Date.now()}-${fileCounter++}.log`);
+}
+
+runTests(false);
+runTests(true);
+
+function runTests(sync) {
+  // Test 1: Partial write splitting a 3-byte UTF-8 character (CJK)
+  // "abc中def" where "中" is 3 bytes (E4 B8 AD)
+  // Simulate partial write of 4 bytes: "abc" (3 bytes) + first byte of "中"
+  // The remaining buffer should be "中def" (not "def")
+  {
+    const dest = getTempFile();
+    const fd = openSync(dest, 'w');
+
+    let firstWrite = true;
+    const writtenChunks = [];
+    const fsOverride = {};
+
+    if (sync) {
+      fsOverride.writeSync = common.mustCall((...args) => {
+        const data = args[1];
+        writtenChunks.push(typeof data === 'string' ? data : data.toString());
+        if (firstWrite) {
+          firstWrite = false;
+          // Simulate partial write: only 4 bytes written out of 9
+          // This splits the 3-byte "中" character
+          return 4;
+        }
+        return writeSync(...args);
+      }, 2);
+    } else {
+      fsOverride.write = common.mustCall((...args) => {
+        const data = args[1];
+        writtenChunks.push(typeof data === 'string' ? data : data.toString());
+        const callback = args[args.length - 1];
+        if (firstWrite) {
+          firstWrite = false;
+          // Simulate partial write: only 4 bytes written out of 9
+          process.nextTick(callback, null, 4);
+          return;
+        }
+        return write(...args);
+      }, 2);
+    }
+
+    const stream = new Utf8Stream({
+      fd,
+      sync,
+      minLength: 0,
+      fs: fsOverride,
+    });
+
+    stream.on('ready', common.mustCall(() => {
+      stream.write('abc中def');
+      stream.end();
+
+      stream.on('finish', common.mustCall(() => {
+        // Verify the second chunk contains the preserved CJK character
+        assert.strictEqual(writtenChunks.length, 2);
+        assert.strictEqual(writtenChunks[0], 'abc中def'); // First attempt
+        assert.strictEqual(writtenChunks[1], '中def'); // Retry with preserved char
+      }));
+    }));
+  }
+
+  // Test 2: Partial write splitting a 4-byte UTF-8 character (emoji)
+  // "hello🌍world" where "🌍" is 4 bytes (F0 9F 8C 8D)
+  // Simulate partial write of 7 bytes: "hello" (5 bytes) + first 2 bytes of "🌍"
+  // The remaining buffer should be "🌍world" (not a lone surrogate + "world")
+  {
+    const dest = getTempFile();
+    const fd = openSync(dest, 'w');
+
+    let firstWrite = true;
+    const writtenChunks = [];
+    const fsOverride = {};
+
+    if (sync) {
+      fsOverride.writeSync = common.mustCall((...args) => {
+        const data = args[1];
+        writtenChunks.push(typeof data === 'string' ? data : data.toString());
+        if (firstWrite) {
+          firstWrite = false;
+          // Simulate partial write: only 7 bytes written
+          return 7;
+        }
+        return writeSync(...args);
+      }, 2);
+    } else {
+      fsOverride.write = common.mustCall((...args) => {
+        const data = args[1];
+        writtenChunks.push(typeof data === 'string' ? data : data.toString());
+        const callback = args[args.length - 1];
+        if (firstWrite) {
+          firstWrite = false;
+          process.nextTick(callback, null, 7);
+          return;
+        }
+        return write(...args);
+      }, 2);
+    }
+
+    const stream = new Utf8Stream({
+      fd,
+      sync,
+      minLength: 0,
+      fs: fsOverride,
+    });
+
+    stream.on('ready', common.mustCall(() => {
+      stream.write('hello🌍world');
+      stream.end();
+
+      stream.on('finish', common.mustCall(() => {
+        assert.strictEqual(writtenChunks.length, 2);
+        assert.strictEqual(writtenChunks[0], 'hello🌍world'); // First attempt
+        assert.strictEqual(writtenChunks[1], '🌍world'); // Retry with preserved emoji
+
+        // Verify no lone surrogates in the retry chunk
+        const retryChunk = writtenChunks[1];
+        for (let i = 0; i < retryChunk.length; i++) {
+          const code = retryChunk.charCodeAt(i);
+          if (code >= 0xD800 && code <= 0xDBFF) {
+            // High surrogate - next must be low surrogate
+            const next = retryChunk.charCodeAt(i + 1);
+            assert.ok(next >= 0xDC00 && next <= 0xDFFF,
+                      `Found lone high surrogate at position ${i}`);
+            i++; // Skip the low surrogate we just verified
+          } else if (code >= 0xDC00 && code <= 0xDFFF) {
+            // Low surrogate without preceding high surrogate
+            assert.fail(`Found lone low surrogate at position ${i}: 0x${code.toString(16)}`);
+          }
+        }
+      }));
+    }));
+  }
+
+  // Test 3: Partial write at exactly 0 bytes (edge case)
+  {
+    const dest = getTempFile();
+    const fd = openSync(dest, 'w');
+
+    let firstWrite = true;
+    const writtenChunks = [];
+    const fsOverride = {};
+
+    if (sync) {
+      fsOverride.writeSync = common.mustCall((...args) => {
+        const data = args[1];
+        writtenChunks.push(typeof data === 'string' ? data : data.toString());
+        if (firstWrite) {
+          firstWrite = false;
+          return 0; // No bytes written
+        }
+        return writeSync(...args);
+      }, 2);
+    } else {
+      fsOverride.write = common.mustCall((...args) => {
+        const data = args[1];
+        writtenChunks.push(typeof data === 'string' ? data : data.toString());
+        const callback = args[args.length - 1];
+        if (firstWrite) {
+          firstWrite = false;
+          process.nextTick(callback, null, 0);
+          return;
+        }
+        return write(...args);
+      }, 2);
+    }
+
+    const stream = new Utf8Stream({
+      fd,
+      sync,
+      minLength: 0,
+      fs: fsOverride,
+    });
+
+    stream.on('ready', common.mustCall(() => {
+      stream.write('中文');
+      stream.end();
+
+      stream.on('finish', common.mustCall(() => {
+        assert.strictEqual(writtenChunks.length, 2);
+        assert.strictEqual(writtenChunks[0], '中文');
+        assert.strictEqual(writtenChunks[1], '中文'); // Entire string retried
+      }));
+    }));
+  }
+
+  // Test 4: Partial write splitting between characters (not mid-character)
+  // This should work the same as before - no character preservation needed
+  {
+    const dest = getTempFile();
+    const fd = openSync(dest, 'w');
+
+    let firstWrite = true;
+    const writtenChunks = [];
+    const fsOverride = {};
+
+    if (sync) {
+      fsOverride.writeSync = common.mustCall((...args) => {
+        const data = args[1];
+        writtenChunks.push(typeof data === 'string' ? data : data.toString());
+        if (firstWrite) {
+          firstWrite = false;
+          // Write exactly 3 bytes ("abc"), which is a clean character boundary
+          return 3;
+        }
+        return writeSync(...args);
+      }, 2);
+    } else {
+      fsOverride.write = common.mustCall((...args) => {
+        const data = args[1];
+        writtenChunks.push(typeof data === 'string' ? data : data.toString());
+        const callback = args[args.length - 1];
+        if (firstWrite) {
+          firstWrite = false;
+          process.nextTick(callback, null, 3);
+          return;
+        }
+        return write(...args);
+      }, 2);
+    }
+
+    const stream = new Utf8Stream({
+      fd,
+      sync,
+      minLength: 0,
+      fs: fsOverride,
+    });
+
+    stream.on('ready', common.mustCall(() => {
+      stream.write('abc中def');
+      stream.end();
+
+      stream.on('finish', common.mustCall(() => {
+        assert.strictEqual(writtenChunks.length, 2);
+        assert.strictEqual(writtenChunks[0], 'abc中def');
+        assert.strictEqual(writtenChunks[1], '中def'); // Remaining after 3 bytes
+      }));
+    }));
+  }
+
+  // Test 5: Single multi-byte character with partial write of 1 byte
+  {
+    const dest = getTempFile();
+    const fd = openSync(dest, 'w');
+
+    let firstWrite = true;
+    const writtenChunks = [];
+    const fsOverride = {};
+
+    if (sync) {
+      fsOverride.writeSync = common.mustCall((...args) => {
+        const data = args[1];
+        writtenChunks.push(typeof data === 'string' ? data : data.toString());
+        if (firstWrite) {
+          firstWrite = false;
+          // Write only 1 byte of a 3-byte character
+          return 1;
+        }
+        return writeSync(...args);
+      }, 2);
+    } else {
+      fsOverride.write = common.mustCall((...args) => {
+        const data = args[1];
+        writtenChunks.push(typeof data === 'string' ? data : data.toString());
+        const callback = args[args.length - 1];
+        if (firstWrite) {
+          firstWrite = false;
+          process.nextTick(callback, null, 1);
+          return;
+        }
+        return write(...args);
+      }, 2);
+    }
+
+    const stream = new Utf8Stream({
+      fd,
+      sync,
+      minLength: 0,
+      fs: fsOverride,
+    });
+
+    stream.on('ready', common.mustCall(() => {
+      stream.write('中');
+      stream.end();
+
+      stream.on('finish', common.mustCall(() => {
+        assert.strictEqual(writtenChunks.length, 2);
+        assert.strictEqual(writtenChunks[0], '中');
+        assert.strictEqual(writtenChunks[1], '中'); // Full character retried
+      }));
+    }));
+  }
+}