Skip to content

Commit 1d01fe6

Browse files
authored
fix: improve encoding handling for specific files (#249)
Certain files in xmltest.zip are UTF-16LE encoded. Those files are now decoded correctly.
1 parent f29b25e commit 1d01fe6

File tree

1 file changed

+105
-80
lines changed

1 file changed

+105
-80
lines changed

xmltest.js

100644100755
+105-80
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
const getStream = require('get-stream')
2-
const path = require('path')
3-
const {promisify} = require('util')
4-
const yauzl = require('yauzl')
1+
const getStream = require("get-stream");
2+
const path = require("path");
3+
const { promisify } = require("util");
4+
const yauzl = require("yauzl");
55

6-
const {cache} = require('./cache')
6+
const { cache } = require("./cache");
77
// for type definitions
8-
const {Entry} = require('yauzl')
8+
const { Entry } = require("yauzl");
99

1010
/**
1111
* @typedef Entries {Record<string, string | undefined>}
@@ -17,6 +17,15 @@ const {Entry} = require('yauzl')
1717
* @typedef Loader {function (resolve: PromiseResolver, PromiseReject): LoaderInstance}
1818
*/
1919

20+
const encodingMap = {
21+
"xmltest/valid/sa/049.xml": "utf16le",
22+
"xmltest/valid/sa/050.xml": "utf16le",
23+
"xmltest/valid/sa/051.xml": "utf16le",
24+
"xmltest/valid/sa/out/049.xml": "utf16le",
25+
"xmltest/valid/sa/out/050.xml": "utf16le",
26+
"xmltest/valid/sa/out/051.xml": "utf16le",
27+
};
28+
2029
/**
2130
* Loads all file content from the zip file.
2231
*
@@ -30,16 +39,23 @@ const contentLoader = (resolve, reject, encoding) => {
3039
const data = {};
3140

3241
const end = () => {
33-
resolve(data)
34-
}
42+
resolve(data);
43+
};
3544

3645
const entry = async (entry, readFile) => {
37-
if (!entry.fileName.endsWith('/')) {
38-
data[entry.fileName] = await getStream(await readFile(entry), {encoding})
46+
if (!entry.fileName.endsWith("/")) {
47+
const enc = encoding
48+
? encoding
49+
: encodingMap[entry.fileName]
50+
? encodingMap[entry.fileName]
51+
: "utf8";
52+
data[entry.fileName] = await getStream(await readFile(entry), {
53+
encoding: enc,
54+
});
3955
}
40-
}
41-
return {end, entry}
42-
}
56+
};
57+
return { end, entry };
58+
};
4359
/**
4460
* The module level cache for the zip file content.
4561
*
@@ -62,17 +78,17 @@ contentLoader.CACHE = cache();
6278
*/
6379
const entriesLoader = (resolve, reject) => {
6480
/** @type {Entries} */
65-
const data = {}
81+
const data = {};
6682
const end = () => {
67-
resolve(data)
68-
}
83+
resolve(data);
84+
};
6985
const entry = (entry) => {
70-
data[entry.fileName] = entry.fileName.endsWith('/')
71-
? ''
72-
: path.basename(entry.fileName)
73-
}
74-
return {end, entry}
75-
}
86+
data[entry.fileName] = entry.fileName.endsWith("/")
87+
? ""
88+
: path.basename(entry.fileName);
89+
};
90+
return { end, entry };
91+
};
7692
entriesLoader.CACHE = cache();
7793

7894
/**
@@ -92,31 +108,35 @@ entriesLoader.CACHE = cache();
92108
* @param location {string} absolute path to zip file (default: xmltest.zip)
93109
* @returns {Promise<Entries>}
94110
*/
95-
const load = async (loader = contentLoader, location = path.join(__dirname, 'xmltest.zip')) => {
111+
const load = async (
112+
loader = contentLoader,
113+
location = path.join(__dirname, "xmltest.zip"),
114+
) => {
96115
if (loader.CACHE && loader.CACHE.has(location)) {
97-
return {...loader.CACHE.get(location)}
116+
return { ...loader.CACHE.get(location) };
98117
}
99118

100-
const zipfile = await promisify(yauzl.open)(
101-
location, {decodeStrings: true, lazyEntries: true}
102-
)
103-
const readFile = promisify(zipfile.openReadStream.bind(zipfile))
119+
const zipfile = await promisify(yauzl.open)(location, {
120+
decodeStrings: true,
121+
lazyEntries: true,
122+
});
123+
const readFile = promisify(zipfile.openReadStream.bind(zipfile));
104124
return new Promise((resolve, reject) => {
105125
const resolver = loader.CACHE
106126
? (data) => {
107-
loader.CACHE.set(location, data);
108-
resolve(data);
109-
}
127+
loader.CACHE.set(location, data);
128+
resolve(data);
129+
}
110130
: resolve;
111131
const handler = loader(resolver, reject);
112-
zipfile.on('end', handler.end);
113-
zipfile.on('entry', async (entry) => {
132+
zipfile.on("end", handler.end);
133+
zipfile.on("entry", async (entry) => {
114134
await handler.entry(entry, readFile);
115135
zipfile.readEntry();
116136
});
117137
zipfile.readEntry();
118-
})
119-
}
138+
});
139+
};
120140

121141
/**
122142
* A function that can be passed to functions like `Array.prototype.filter`
@@ -135,24 +155,24 @@ const load = async (loader = contentLoader, location = path.join(__dirname, 'xml
135155
* @returns {Predicate}
136156
*/
137157
const combineFilters = (...tests) => {
138-
const checks = tests.map(test => {
139-
if (typeof test === 'function') {
140-
return test
158+
const checks = tests.map((test) => {
159+
if (typeof test === "function") {
160+
return test;
141161
}
142162
let result;
143-
if (typeof test.test === 'function') {
144-
result = s => test.test(s)
145-
result.toString = () => `${test.toString}.test(str)`
163+
if (typeof test.test === "function") {
164+
result = (s) => test.test(s);
165+
result.toString = () => `${test.toString}.test(str)`;
146166
} else {
147-
result = s => s.includes(test)
148-
result.toString = () => `str.includes('${test}')`
167+
result = (s) => s.includes(test);
168+
result.toString = () => `str.includes('${test}')`;
149169
}
150170
return result;
151-
})
152-
const result = s => checks.every(check => check(s));
153-
result.toString = () => `[combineFilters:(str) => ${checks.join(' && ')}]`
171+
});
172+
const result = (s) => checks.every((check) => check(s));
173+
result.toString = () => `[combineFilters:(str) => ${checks.join(" && ")}]`;
154174
return result;
155-
}
175+
};
156176

157177
/**
158178
* Helpful filters based on the directory structure and content of `xmltest.zip`.
@@ -165,7 +185,7 @@ const combineFilters = (...tests) => {
165185
* @see ent
166186
*/
167187
const FILTERS = {
168-
INVALID: combineFilters('xmltest/invalid'),
188+
INVALID: combineFilters("xmltest/invalid"),
169189
NOT_WF: {
170190
EXT_SA: {
171191
files: combineFilters(/xmltest\/not-wf\/ext-sa\/[^/]+$/),
@@ -175,33 +195,33 @@ const FILTERS = {
175195
},
176196
SA: {
177197
files: combineFilters(/xmltest\/not-wf\/sa\/[^/]+$/),
178-
}
198+
},
179199
},
180200
VALID: {
181201
EXT_SA: {
182202
files: combineFilters(/xmltest\/valid\/ext-sa\/[^/]+$/),
183-
OUT: combineFilters('xmltest/valid/ext-sa/out')
203+
OUT: combineFilters("xmltest/valid/ext-sa/out"),
184204
},
185205
NOT_SA: {
186206
files: combineFilters(/xmltest\/valid\/not-sa\/[^/]+$/),
187-
OUT: combineFilters('xmltest/valid/not-sa/out')
207+
OUT: combineFilters("xmltest/valid/not-sa/out"),
188208
},
189209
SA: {
190210
files: combineFilters(/xmltest\/valid\/sa\/[^/]+$/),
191-
OUT: combineFilters('xmltest/valid/sa/out')
192-
}
211+
OUT: combineFilters("xmltest/valid/sa/out"),
212+
},
193213
},
194214
/**
195215
* @param s {string}
196216
* @returns {boolean}
197217
*/
198-
ent: s => s.endsWith('.ent'),
218+
ent: (s) => s.endsWith(".ent"),
199219
/**
200220
* @param s {string}
201221
* @returns {boolean}
202222
*/
203-
xml: s => s.endsWith('.xml')
204-
}
223+
xml: (s) => s.endsWith(".xml"),
224+
};
205225

206226
/**
207227
* Converts path in zipfile (keys of entries or content)
@@ -214,16 +234,17 @@ const RELATED = {
214234
* @param pathInZip {string}
215235
* @returns {string}
216236
*/
217-
ent: pathInZip => pathInZip.replace(/\.xml$/, '.ent'),
237+
ent: (pathInZip) => pathInZip.replace(/\.xml$/, ".ent"),
218238
/**
219239
* Returns the name of the related `./out/filename.xml` file with the same name as the given `.xml` file.
220240
* Be aware that only the `valid` folders have such files.
221241
*
222242
* @param pathInZip {string}
223243
* @returns {string}
224244
*/
225-
out: pathInZip => [path.dirname(pathInZip), 'out', path.basename(pathInZip)].join('/')
226-
}
245+
out: (pathInZip) =>
246+
[path.dirname(pathInZip), "out", path.basename(pathInZip)].join("/"),
247+
};
227248

228249
/**
229250
* Filters `data` by applying `filters` to its keys
@@ -236,22 +257,23 @@ const RELATED = {
236257
* otherwise an object with all keys that match the filter.
237258
*/
238259
const getFiltered = (data, filters) => {
239-
if (filters.length === 0) return {...data}
240-
const key = filters[0]
241-
const isSingleExistingKey = filters.length === 1 && typeof key === 'string' && key in data
260+
if (filters.length === 0) return { ...data };
261+
const key = filters[0];
262+
const isSingleExistingKey =
263+
filters.length === 1 && typeof key === "string" && key in data;
242264
const keys = isSingleExistingKey
243265
? [key]
244-
: Object.keys(data).filter(combineFilters.apply(null, filters))
266+
: Object.keys(data).filter(combineFilters.apply(null, filters));
245267
return keys.length === 1 && filters.length === 1
246268
? data[keys[0]]
247269
: keys.reduce(
248-
(acc, key) => {
249-
acc[key] = data[key]
250-
return acc
251-
},
252-
/** @type {Entries} */{}
253-
)
254-
}
270+
(acc, key) => {
271+
acc[key] = data[key];
272+
return acc;
273+
},
274+
/** @type {Entries} */ {},
275+
);
276+
};
255277

256278
/**
257279
* Filters zip file content by applying `filters` to its keys.
@@ -266,7 +288,7 @@ const getFiltered = (data, filters) => {
266288
* if the only filter results in just a single entry,
267289
* otherwise an object with all keys that match the filter.
268290
*/
269-
const getContent = async (...filters) => getFiltered(await load(), filters)
291+
const getContent = async (...filters) => getFiltered(await load(), filters);
270292

271293
/**
272294
* Filters content of `xmltest.json` by applying `filters` to its keys.
@@ -277,8 +299,8 @@ const getContent = async (...filters) => getFiltered(await load(), filters)
277299
* if the only filter results in just a single entry,
278300
* otherwise an object with all keys that match the filter.
279301
*/
280-
const getEntries = (...filters) => getFiltered(require('./xmltest.json')
281-
, filters)
302+
const getEntries = (...filters) =>
303+
getFiltered(require("./xmltest.json"), filters);
282304

283305
/**
284306
* Makes module executable using `runex`.
@@ -309,11 +331,12 @@ const run = async (...filters) => {
309331

310332
return getFiltered(
311333
await load(filters.length === 0 ? entriesLoader : contentLoader, file),
312-
filters
334+
filters,
313335
);
314336
};
315337

316-
const replaceWithWrappedCodePointAt = char => `{!${char.codePointAt(0).toString(16)}!}`
338+
const replaceWithWrappedCodePointAt = (char) =>
339+
`{!${char.codePointAt(0).toString(16)}!}`;
317340

318341
/**
319342
* Some xml documents (purposely) contain characters that are not visible
@@ -328,9 +351,11 @@ const replaceWithWrappedCodePointAt = char => `{!${char.codePointAt(0).toString(
328351
* @param wrapper {function (string): string}
329352
*/
330353
const replaceNonTextChars = (value, wrapper = replaceWithWrappedCodePointAt) =>
331-
value === undefined || value === ''
354+
value === undefined || value === ""
332355
? value
333-
: value.toString().replace(/[\u0000\u001B\u001F\uDC00\uD800\uFFFE\uFFFF]/gu, wrapper)
356+
: value
357+
.toString()
358+
.replace(/[\u0000\u001B\u001F\uDC00\uD800\uFFFE\uFFFF]/gu, wrapper);
334359

335360
module.exports = {
336361
combineFilters,
@@ -344,11 +369,11 @@ module.exports = {
344369
entriesLoader,
345370
replaceNonTextChars,
346371
replaceWithWrappedCodePointAt,
347-
run
348-
}
372+
run,
373+
};
349374

350375
if (require.main === module) {
351376
// if you don't want to use `runex` just "launch" this module/package:
352377
// node xmltest ...
353-
module.exports.run(...process.argv.slice(2)).then(console.log)
378+
module.exports.run(...process.argv.slice(2)).then(console.log);
354379
}

0 commit comments

Comments
 (0)