Merge pull request emscripten-forge#22 from AnastasiaSliusar/separation-files

martinRenou · web-flow · commit b898025450b6 · 2024-11-07T15:58:48.000+01:00
Separation of files
diff --git a/README.md b/README.md
@@ -7,16 +7,22 @@ Fetching and unpacking archives. This package uses compiled `libarchive` into wa
 ## Using
 
 This package has 2 methods:
-- extract(url) - downloads an archive throught the url and returns extracted data in Uint8Array.
-- exctractData(data) - accepts Uint8Array archive data and returns exracted data.
+- extract(url) - downloads an archive through the url and returns the extracted data as an array of objects, each of which has the following structure:
+```
+{
+      "data":  new Uint8Array([5, 6, 7, 8]),
+      "filename": "info/paths.json"
+}
+```
+- extractData(data) - accepts Uint8Array archive data and returns the extracted data in the same format as the `extract` method.
 
 The example of using:
 ```sh
 import untarjs from "@emscripten-forge/untarjs";
 
 const condaPackageUrl = 'https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2';
-untarjs.extract(condaPackageUrl).then((data)=>{
-    console.log(data);
+untarjs.extract(condaPackageUrl).then((files)=>{
+    console.log(files);
 });
 ```
 > Note: If this package is used in the project where Webpack is used then webpack.config should include next:
diff --git a/build_wasm.sh b/build_wasm.sh
@@ -86,7 +86,7 @@ emcc unpack.c -o $WASM_LIB/unpack.js \
     ${PREFIX}/lib/libz.a ${PREFIX}/lib/libbz2.a ${PREFIX}/lib/libzstd.a ${PREFIX}/lib/libiconv.a\
     -s MODULARIZE=1 -s WASM=1 -O3 -s ALLOW_MEMORY_GROWTH=1 \
     -s ENVIRONMENT=web \
-    -s EXPORTED_RUNTIME_METHODS='["ccall", "cwrap", "getValue"]' \
+    -s EXPORTED_RUNTIME_METHODS='["ccall", "cwrap", "getValue", "UTF8ToString"]' \
     -s EXPORTED_FUNCTIONS="['_extract_archive', '_malloc', '_free']"
 
 echo "Build completed successfully!"
diff --git a/src/index.ts b/src/index.ts
@@ -1,4 +1,5 @@
 import initializeWasm from './helper';
+import { IFileData } from './types';
 import { IWasmModule } from './unpack';
 
 const fetchByteArray = async (url: string): Promise<Uint8Array> => {
@@ -20,55 +21,66 @@ const init = async (): Promise<IWasmModule | null> => {
   }
 };
 
-const extractData = async (data: Uint8Array): Promise<Uint8Array | null> => {
+const extractData = async (data: Uint8Array): Promise<IFileData[]> => {
   const wasmModule = await init();
-
   if (!wasmModule) {
     console.error('WASM module not initialized.');
-    return null;
+    return [];
   }
+  const inputPtr = wasmModule._malloc(data.length);
+  wasmModule.HEAPU8.set(data, inputPtr);
+  const fileCountPtr = wasmModule._malloc(4);
+  const outputSizePtr = wasmModule._malloc(4);
 
   try {
-    const inputPtr = wasmModule._malloc(data.length);
-    wasmModule.HEAPU8.set(data, inputPtr);
-
-    const outputSizePtr = wasmModule._malloc(data.length);
-    const extractedDataPtr = wasmModule._extract_archive(
+    const extractedFilesPtr = wasmModule._extract_archive(
       inputPtr,
       data.length,
-      outputSizePtr
+      outputSizePtr,
+      fileCountPtr
     );
-    const extractedSize = wasmModule.getValue(outputSizePtr, 'i32');
-    if (extractedDataPtr === 0) {
-      throw new Error('Archive extraction failed.');
+
+    const fileCount = wasmModule.getValue(fileCountPtr, 'i32');
+    const files: IFileData[] = [];
+
+    for (let i = 0; i < fileCount; i++) {
+      const fileDataPtr = extractedFilesPtr + i * (3 * 4);
+      const filenamePtr = wasmModule.getValue(fileDataPtr, 'i32');
+      const dataSize = wasmModule.getValue(fileDataPtr + 8, 'i32');
+      const dataPtr = wasmModule.getValue(fileDataPtr + 4, 'i32');
+      const filename = wasmModule.UTF8ToString(filenamePtr);
+      const fileData = new Uint8Array(
+        wasmModule.HEAPU8.buffer,
+        dataPtr,
+        dataSize
+      );
+
+      files.push({
+        filename: filename,
+        data: fileData
+      });
     }
-    const extractedData = new Uint8Array(
-      wasmModule.HEAPU8.subarray(
-        extractedDataPtr,
-        extractedDataPtr + extractedSize
-      )
-    );
 
-    wasmModule._free(inputPtr);
+    wasmModule._free(fileCountPtr);
     wasmModule._free(outputSizePtr);
-    wasmModule._free(extractedDataPtr);
+    wasmModule._free(inputPtr);
+    wasmModule._free(extractedFilesPtr);
 
-    console.log('Extracted size:', extractedSize);
-    return extractedData;
+    return files;
   } catch (error) {
     console.error('Error during extracting:', error);
-    return null;
+    return [];
   }
 };
 
-const extract = async (url: string): Promise<Uint8Array | null> => {
+const extract = async (url: string): Promise<IFileData[]> => {
   try {
     const data = await fetchByteArray(url);
     console.log('Data downloaded:', data);
     return await extractData(data);
   } catch (error) {
     console.error('Error during extracting:', error);
-    return null;
+    return [];
   }
 };
 
diff --git a/src/types.ts b/src/types.ts
@@ -0,0 +1,4 @@
+export interface IFileData {
+  filename: string;
+  data: Uint8Array;
+}
diff --git a/src/unpack.d.ts b/src/unpack.d.ts
@@ -1,11 +1,13 @@
 export interface IWasmModule {
+  UTF8ToString(filenamePtr: number): string;
   HEAPU8: Uint8Array;
   _malloc(size: number): number;
   _free(ptr: number): void;
   _extract_archive(
     inputPtr: number,
     inputSize: number,
-    outputSizePtr: number
+    outputSizePtr: number,
+    fileCountPtr: number
   ): number;
   getValue(ptr: number, type: string): number;
 }
diff --git a/tests/index.spec.ts b/tests/index.spec.ts
@@ -15,8 +15,10 @@ jest.mock('../src/index', () => {
 describe('extract', () => {
   it('should download and extract data successfully', async () => {
     const mockData = new Uint8Array([1, 2, 3, 4]);
-    const extractedData = new Uint8Array([5, 6, 7, 8]);
-
+    const extractedData = {
+      data: new Uint8Array([5, 6, 7, 8]),
+      filename: "info/paths.json"
+    }
     const mockFetchByteArray = jest.fn().mockResolvedValue(mockData);
     const mockExtractData = jest.fn().mockResolvedValue(extractedData);
 
diff --git a/unpack.c b/unpack.c
@@ -5,52 +5,79 @@
 #include <archive_entry.h>
 #include <emscripten.h>
 
+typedef struct {
+    char* filename;
+    uint8_t* data;
+    size_t data_size;
+} FileData;
+
 EMSCRIPTEN_KEEPALIVE
-char* extract_archive(uint8_t* inputData, size_t inputSize, size_t* outputSize) {
+FileData* extract_archive(uint8_t* inputData, size_t inputSize, size_t* outputSize, size_t* fileCount) {
     struct archive* archive;
     struct archive_entry* entry;
-    char* outputBuffer = NULL;
-    size_t bufferCapacity = 0;
-    size_t totalBytes = 0;
+    FileData* files = NULL;
+    size_t files_count = 0;
 
     archive = archive_read_new();
-    archive_read_support_filter_all(archive); 
-    archive_read_support_format_all(archive); 
-   
+    archive_read_support_filter_all(archive);
+    archive_read_support_format_all(archive);
+
     if (archive_read_open_memory(archive, inputData, inputSize) != ARCHIVE_OK) {
         fprintf(stderr, "Error opening archive: %s\n", archive_error_string(archive));
         archive_read_free(archive);
         return NULL;
     }
+    printf("Archive opened successfully.\n");
 
     while (archive_read_next_header(archive, &entry) == ARCHIVE_OK) {
         const char* filename = archive_entry_pathname(entry);
         size_t entrySize = archive_entry_size(entry);
+        printf("Extracting file: %s, size: %zu\n", filename, entrySize);
+
+        files = realloc(files, sizeof(FileData) * (files_count + 1));
+        if (!files) {
+            fprintf(stderr, "Memory allocation error for FileData array.\n");
+            archive_read_free(archive);
+            return NULL;
+        }
 
-        if (totalBytes + entrySize > bufferCapacity) {
-            bufferCapacity = totalBytes + entrySize + 1024;
-            outputBuffer = realloc(outputBuffer, bufferCapacity);
+        files[files_count].filename = strdup(filename);
+        files[files_count].data = malloc(entrySize);
+        printf("Setting data_size for file: %s, size: %zu\n", filename, entrySize);
+        files[files_count].data_size = entrySize;
+
+        if (!files[files_count].data) {
+            fprintf(stderr, "Memory allocation error for file data.\n");
+            free(files[files_count].filename);
+            archive_read_free(archive);
+            return NULL;
         }
 
         size_t bytesRead = 0;
         while (bytesRead < entrySize) {
-            ssize_t ret = archive_read_data(archive, outputBuffer + totalBytes, entrySize - bytesRead);
+            ssize_t ret = archive_read_data(archive, files[files_count].data + bytesRead, entrySize - bytesRead);
             if (ret < 0) {
-                fprintf(stderr, "Error reading data: %s\n", archive_error_string(archive));
-                free(outputBuffer);
+                fprintf(stderr, "Error reading data for %s: %s\n", filename, archive_error_string(archive));
+                for (size_t i = 0; i <= files_count; i++) {
+                    free(files[i].filename);
+                    free(files[i].data);
+                }
+                free(files);
                 archive_read_free(archive);
                 return NULL;
             }
             bytesRead += ret;
-            totalBytes += ret;
+            printf("Read %zd bytes for file: %s\n", ret, filename);
         }
-
-        printf("Extracted file: %s, Size: %zu bytes\n", filename, entrySize);
+        files_count++;
     }
 
     archive_read_free(archive);
+    *outputSize = files_count;
+    *fileCount = files_count;
 
-    *outputSize = totalBytes;
-    return outputBuffer;
+    return files;
 }
 
+
+