diff --git a/.make.versions b/.make.versions index 3ea2ae2d1..92822959e 100644 --- a/.make.versions +++ b/.make.versions @@ -82,8 +82,8 @@ DOC_QUALITY_RAY_VERSION=$(DPK_VERSION) CODE_QUALITY_RAY_VERSION=$(DPK_VERSION) CODE_QUALITY_PYTHON_VERSION=$(DPK_VERSION) -CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION) -CODE2PARQUET_RAY_VERSION=$(DPK_VERSION) +ZIP2PARQUET_PYTHON_VERSION=$(DPK_VERSION) +ZIP2PARQUET_RAY_VERSION=$(DPK_VERSION) INGEST_TO_PARQUET_VERSION=$(DPK_VERSION) REPO_LVL_ORDER_RAY_VERSION=$(DPK_VERSION) diff --git a/kfp/README.md b/kfp/README.md index 2468e2429..cadec4e92 100644 --- a/kfp/README.md +++ b/kfp/README.md @@ -6,7 +6,7 @@ |-------------------------------------|:----------------------------------------------------------------------------------:| | language/lang_id | [lang_id_wf.py](../transforms/language/lang_id/kfp_ray/lang_id_wf.py) | | code/malware | [malware_wf.py](../transforms/code/malware/kfp_ray/malware_wf.py) | -| code/code2parquet | [code2parquet_wf.py](../transforms/code/code2parquet/kfp_ray/code2parquet_wf.py) | +| universal/zip2parquet | [zip2parquet_wf.py](../transforms/universal/zip2parquet/kfp_ray/zip2parquet_wf.py) | | code/code_quality | [code_quality_wf.py](../transforms/code/code_quality/kfp_ray/code_quality_wf.py) | | code/proglang_select | [proglang_select_wf.py](../transforms/code/proglang_select/kfp_ray/proglang_select_wf.py) | | universal/doc_id | [doc_id_wf.py](../transforms/universal/doc_id/kfp_ray/doc_id_wf.py) | diff --git a/tools/ingest2parquet/README.md b/tools/ingest2parquet/README.md index dce3d042f..7fbe2ff02 100644 --- a/tools/ingest2parquet/README.md +++ b/tools/ingest2parquet/README.md @@ -2,7 +2,7 @@ **Please note: This tool is deprecated and will be removed soon. 
It is superseded by the transform-based implementation, -[code2parquet](../../transforms/code/code2parquet), providing identical capability, +[code2parquet](../../transforms/universal/zip2parquet), providing identical capability, but with support for ray-based scalability.** ## Summary diff --git a/transforms/code/code2parquet/python/test-data/expected/metadata.json b/transforms/code/code2parquet/python/test-data/expected/metadata.json deleted file mode 100644 index 5c2c6d0a0..000000000 --- a/transforms/code/code2parquet/python/test-data/expected/metadata.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "code2parquet", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-07-25 15:38:20", - "end_time": "2024-07-25 15:38:21", - "status": "success" - }, - "code": null, - "job_input_params": { - "supported_langs_file": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json", - "detect_programming_lang": true, - "snapshot": null, - "domain": null, - "s3_cred": null, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [".zip"] - }, - "job_output_stats": { - "source_files": 3, - "source_size": 33885652, - "result_files": 3, - "result_size": 70167, - "processing_time": 1.5678541660308838, - "number of rows": 74 - }, - "source": { - "name": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/tmp/code2parquetbl3prm61", - "type": "path" - } -} diff --git a/transforms/code/code2parquet/ray/test-data/expected/metadata.json b/transforms/code/code2parquet/ray/test-data/expected/metadata.json deleted file mode 100644 index 5c2c6d0a0..000000000 --- a/transforms/code/code2parquet/ray/test-data/expected/metadata.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job 
category": "preprocessing", - "job name": "code2parquet", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-07-25 15:38:20", - "end_time": "2024-07-25 15:38:21", - "status": "success" - }, - "code": null, - "job_input_params": { - "supported_langs_file": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json", - "detect_programming_lang": true, - "snapshot": null, - "domain": null, - "s3_cred": null, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [".zip"] - }, - "job_output_stats": { - "source_files": 3, - "source_size": 33885652, - "result_files": 3, - "result_size": 70167, - "processing_time": 1.5678541660308838, - "number of rows": 74 - }, - "source": { - "name": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/tmp/code2parquetbl3prm61", - "type": "path" - } -} diff --git a/transforms/code/code2parquet/Makefile b/transforms/universal/zip2parquet/Makefile similarity index 100% rename from transforms/code/code2parquet/Makefile rename to transforms/universal/zip2parquet/Makefile diff --git a/transforms/code/code2parquet/README.md b/transforms/universal/zip2parquet/README.md similarity index 100% rename from transforms/code/code2parquet/README.md rename to transforms/universal/zip2parquet/README.md diff --git a/transforms/code/code2parquet/kfp_ray/Makefile b/transforms/universal/zip2parquet/kfp_ray/Makefile similarity index 100% rename from transforms/code/code2parquet/kfp_ray/Makefile rename to transforms/universal/zip2parquet/kfp_ray/Makefile diff --git a/transforms/code/code2parquet/kfp_ray/README.md b/transforms/universal/zip2parquet/kfp_ray/README.md similarity index 93% rename from transforms/code/code2parquet/kfp_ray/README.md rename to transforms/universal/zip2parquet/kfp_ray/README.md index d2c34e449..e500ebcf3 100644 --- 
a/transforms/code/code2parquet/kfp_ray/README.md +++ b/transforms/universal/zip2parquet/kfp_ray/README.md @@ -2,7 +2,7 @@ ## Summary -This project allows execution of the [noop Ray transform](../ray) as a +This project allows execution of the [zip2parquet Ray transform](../ray) as a [KubeFlow Pipeline](https://www.kubeflow.org/docs/components/pipelines/overview/) The detail pipeline is presented in the [Simplest Transform pipeline tutorial](../../../../kfp/doc/simple_transform_pipeline.md) diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/universal/zip2parquet/kfp_ray/zip2parquet_wf.py similarity index 78% rename from transforms/code/code2parquet/kfp_ray/code2parquet_wf.py rename to transforms/universal/zip2parquet/kfp_ray/zip2parquet_wf.py index 7cc12fd60..c2accbb98 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/universal/zip2parquet/kfp_ray/zip2parquet_wf.py @@ -19,9 +19,9 @@ # the name of the job script -EXEC_SCRIPT_NAME: str = "code2parquet_transform_ray.py" +EXEC_SCRIPT_NAME: str = "zip2parquet_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/code2parquet-ray:latest" +task_image = "quay.io/dataprep1/data-prep-kit/zip2parquet-ray:latest" # components @@ -42,10 +42,12 @@ def compute_exec_params_func( runtime_pipeline_id: str, runtime_job_id: str, runtime_code_location: dict, - code2parquet_supported_langs_file: str, - code2parquet_domain: str, - code2parquet_snapshot: str, - code2parquet_detect_programming_lang: bool, + zip2parquet_code_data: bool, + zip2parquet_programming_language_column: str, + zip2parquet_supported_langs_file: str, + zip2parquet_domain: str, + zip2parquet_snapshot: str, + zip2parquet_detect_programming_lang: bool, ) -> dict: from runtime_utils import KFPUtils @@ -59,10 +61,12 @@ def compute_exec_params_func( "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), - 
"code2parquet_supported_langs_file": code2parquet_supported_langs_file, - "code2parquet_domain": code2parquet_domain, - "code2parquet_snapshot": code2parquet_snapshot, - "code2parquet_detect_programming_lang": code2parquet_detect_programming_lang, + "zip2parquet_code_data": zip2parquet_code_data, + "zip2parquet_programming_language_column": zip2parquet_programming_language_column, + "zip2parquet_supported_langs_file": zip2parquet_supported_langs_file, + "zip2parquet_domain": zip2parquet_domain, + "zip2parquet_snapshot": zip2parquet_snapshot, + "zip2parquet_detect_programming_lang": zip2parquet_detect_programming_lang, } @@ -97,22 +101,22 @@ def compute_exec_params_func( # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "code2parquet" -PREFIX: str = "code2parquet" +TASK_NAME: str = "zip2parquet" +PREFIX: str = "zip2parquet" @dsl.pipeline( name=TASK_NAME + "-ray-pipeline", description="Pipeline for converting zip files to parquet", ) -def code2parquet( - ray_name: str = "code2parquet-kfp-ray", # name of Ray cluster +def zip2parquet( + ray_name: str = "zip2parquet-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access - data_s3_config: str = "{'input_folder': 'test/code2parquet/input', 'output_folder': 'test/code2parquet/output/'}", + data_s3_config: str = "{'input_folder': 'test/zip2parquet/input', 'output_folder': 'test/zip2parquet/output/'}", data_s3_access_secret: str = "s3-secret", data_max_files: int = -1, data_num_samples: int = -1, @@ -121,12 +125,14 
@@ def code2parquet( runtime_actor_options: dict = {'num_cpus': 0.8}, runtime_pipeline_id: str = "pipeline_id", runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, - # code to parquet - code2parquet_supported_langs_file: str = "test/code2parquet/languages/lang_extensions.json", - code2parquet_detect_programming_lang: bool = True, - code2parquet_domain: str = "code", - code2parquet_snapshot: str = "github", - code2parquet_s3_access_secret: str = "s3-secret", + # zip to parquet + zip2parquet_code_data: bool = True, + zip2parquet_programming_language_column: str = "programming_language", + zip2parquet_supported_langs_file: str = "test/zip2parquet/languages/lang_extensions.json", + zip2parquet_detect_programming_lang: bool = True, + zip2parquet_domain: str = "code", + zip2parquet_snapshot: str = "github", + zip2parquet_s3_access_secret: str = "s3-secret", # additional parameters additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', ) -> None: @@ -162,11 +168,13 @@ def code2parquet( :param runtime_actor_options - actor options :param runtime_pipeline_id - pipeline id :param runtime_code_location - code location - :param code2parquet_supported_langs_file - file to store allowed languages - :param code2parquet_detect_programming_lang - detect programming language flag - :param code2parquet_domain: domain - :param code2parquet_snapshot: snapshot - :param code2parquet_s3_access_secret - ingest to parquet s3 access secret + :param zip2parquet_code_data - flag that data is code + :param zip2parquet_programming_language_column - name for programming language column + :param zip2parquet_supported_langs_file - file to store allowed languages + :param zip2parquet_detect_programming_lang - detect programming language flag + :param zip2parquet_domain: domain + :param zip2parquet_snapshot: snapshot + :param 
zip2parquet_s3_access_secret - ingest to parquet s3 access secret (here we are assuming that select language info is in S3, but potentially in the different bucket) :return: None """ @@ -186,10 +194,12 @@ def code2parquet( runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, - code2parquet_supported_langs_file=code2parquet_supported_langs_file, - code2parquet_domain=code2parquet_domain, - code2parquet_snapshot=code2parquet_snapshot, - code2parquet_detect_programming_lang=code2parquet_detect_programming_lang, + zip2parquet_code_data=zip2parquet_code_data, + zip2parquet_programming_language_column=zip2parquet_programming_language_column, + zip2parquet_supported_langs_file=zip2parquet_supported_langs_file, + zip2parquet_domain=zip2parquet_domain, + zip2parquet_snapshot=zip2parquet_snapshot, + zip2parquet_detect_programming_lang=zip2parquet_detect_programming_lang, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster @@ -216,10 +226,10 @@ def code2parquet( ) ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - ComponentUtils.set_s3_env_vars_to_component(execute_job, code2parquet_s3_access_secret, prefix=PREFIX) + ComponentUtils.set_s3_env_vars_to_component(execute_job, zip2parquet_s3_access_secret, prefix=PREFIX) execute_job.after(ray_cluster) if __name__ == "__main__": # Compiling the pipeline - compiler.Compiler().compile(code2parquet, __file__.replace(".py", ".yaml")) + compiler.Compiler().compile(zip2parquet, __file__.replace(".py", ".yaml")) diff --git a/transforms/code/code2parquet/python/.dockerignore b/transforms/universal/zip2parquet/python/.dockerignore similarity index 100% rename from transforms/code/code2parquet/python/.dockerignore rename to transforms/universal/zip2parquet/python/.dockerignore diff --git a/transforms/code/code2parquet/ray/.gitignore 
b/transforms/universal/zip2parquet/python/.gitignore similarity index 95% rename from transforms/code/code2parquet/ray/.gitignore rename to transforms/universal/zip2parquet/python/.gitignore index 17cee1df3..de14528f0 100644 --- a/transforms/code/code2parquet/ray/.gitignore +++ b/transforms/universal/zip2parquet/python/.gitignore @@ -1,5 +1,5 @@ test-data/output -output/* +output/metadata.json /output/ data-processing-lib/ diff --git a/transforms/code/code2parquet/python/Dockerfile b/transforms/universal/zip2parquet/python/Dockerfile similarity index 85% rename from transforms/code/code2parquet/python/Dockerfile rename to transforms/universal/zip2parquet/python/Dockerfile index b36b6a6c4..918de9aff 100644 --- a/transforms/code/code2parquet/python/Dockerfile +++ b/transforms/universal/zip2parquet/python/Dockerfile @@ -17,19 +17,19 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . # END OF STEPS destined for a data-prep-kit base image -COPY --chown=dpk:root src/ src/ +COPY --chown=dpk:root src src/ COPY --chown=dpk:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/code2parquet_transform_python.py . +COPY src/zip2parquet_transform_python.py . # copy some of the samples in -COPY ./src/code2parquet_local.py local/ +COPY src/zip2parquet_local.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY test test/ +COPY test-data test-data/ # Set environment ENV PYTHONPATH /home/dpk diff --git a/transforms/code/code2parquet/python/Makefile b/transforms/universal/zip2parquet/python/Makefile similarity index 82% rename from transforms/code/code2parquet/python/Makefile rename to transforms/universal/zip2parquet/python/Makefile index d0403e601..7937c4c25 100644 --- a/transforms/code/code2parquet/python/Makefile +++ b/transforms/universal/zip2parquet/python/Makefile @@ -7,7 +7,7 @@ REPOROOT=../../../.. 
# $(REPOROOT)/.make.versions file contains the versions -TRANSFORM_NAME=code2parquet +TRANSFORM_NAME=zip2parquet include $(REPOROOT)/transforms/.make.transforms @@ -33,7 +33,7 @@ setup:: .transforms.setup # distribution versions is the same as image version. set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE2PARQUET_PYTHON_VERSION) TOML_VERSION=$(CODE2PARQUET_PYTHON_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(ZIP2PARQUET_PYTHON_VERSION) TOML_VERSION=$(ZIP2PARQUET_PYTHON_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist @@ -46,8 +46,8 @@ run-cli-sample: RUN_ARGS=" \ --data_local_config \" { 'input_folder' : '../test-data/input', 'output_folder' : '../output' } \" \ --data_files_to_use \"['.zip']\" \ - --code2parquet_supported_langs_file ../test-data/languages/lang_extensions.json \ - --code2parquet_detect_programming_lang True " \ + --zip2parquet_supported_langs_file ../test-data/languages/lang_extensions.json \ + --zip2parquet_detect_programming_lang True " \ .transforms.run-src-file run-local-sample: .transforms.run-local-sample diff --git a/transforms/code/code2parquet/python/README.md b/transforms/universal/zip2parquet/python/README.md similarity index 61% rename from transforms/code/code2parquet/python/README.md rename to transforms/universal/zip2parquet/python/README.md index b93ff3717..8e843a443 100644 --- a/transforms/code/code2parquet/python/README.md +++ b/transforms/universal/zip2parquet/python/README.md @@ -1,10 +1,10 @@ -# Code2Parquet +# Zip2Parquet ## Summary -This code2parquet transform is designed to convert raw particularly ZIP files contain programming files (.py, .c, .java, etc) , -into Parquet format. -As a transform It is built to handle concurrent processing of Ray-based -multiple files using multiprocessing for efficient execution. +This zip2parquet transform is designed to convert ZIP files containing a set of files +into arrow table. 
If these files contain code data (`code_data` flag), we additionally +determine the programming language (.py, .c, .java, etc.). + Each file contained within the ZIP is transformed into a distinct row within the Parquet dataset, adhering to the below schema. **title:** (string) @@ -57,25 +57,28 @@ Each file contained within the ZIP is transformed into a distinct row within the - **Description:** Name indicating which dataset it belong to. - **Example:** `"snapshot": "github"` -**programming_language:** (string)(optional) - -- **Description:** Programming language detected using the file extension. -- **Example:** `"programming_language": "Java"` - **domain:** (string)(optional) - **Description:** Name indicating which domain it belong to, whether code, natural language etc.. - **Example:** `"domain": "code"` +**programming_language:** (string)(optional) - present only if `code_data` is set to True + +- **Description:** Programming language detected using the file extension. +- **Example:** `"programming_language": "Java"` ## Configuration -The set of dictionary keys holding [code2parquet](src/code2parquet_transform.py) +The set of dictionary keys holding [zip2parquet](src/zip2parquet_transform.py) configuration for values are as follows: The transform can be configured with the following key/value pairs from the configuration dictionary. +* `code_data` - a flag defining whether to treat the data as code or as plain (non-code) content. Default + is `True` (code). +* `programming_language_column` - name of the column where programming language information +is stored - default `programming_language`. Only used if `code_data` is True. * `supported_languages` - a dictionary mapping file extensions to language names. * `supported_langs_file` - used if `supported_languages` key is not provided, and specifies the path to a JSON file containing the mapping of languages @@ -93,21 +96,42 @@ the file specified in `supported_langs_file`. 
## Running ### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). +The following command line arguments are available in addition to +the options provided by +the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). + +``` + --zip2parquet_code_data ZIP2PARQUET_CODE_DATA + flag to process files as code + --zip2parquet_programming_language_column ZIP2PARQUET_PROGRAMMING_LANGUAGE_COLUMN + Name of the column in which the programming language is stored + --zip2parquet_supported_langs_file ZIP2PARQUET_SUPPORTED_LANGS_FILE + Path to file containing the list of supported languages + --zip2parquet_detect_programming_lang ZIP2PARQUET_DETECT_PROGRAMMING_LANG + Infer the programming lang from the file extension using the file of supported languages + --zip2parquet_snapshot ZIP2PARQUET_SNAPSHOT + Snapshot value assigned to all imported documents. + --zip2parquet_domain ZIP2PARQUET_DOMAIN + Domain value assigned to all imported documents. + --zip2parquet_s3_cred ZIP2PARQUET_S3_CRED + AST string of options for s3 credentials. Only required for S3 data access. + access_key: access key help text + secret_key: secret key help text + url: optional s3 url + region: optional s3 region + Example: { 'access_key': 'access', 'secret_key': 'secret', + 'url': 'https://s3.us-east.cloud-object-storage.appdomain.cloud', + 'region': 'us-east-1' } +``` -* `--code2parquet_supported_langs_file` - set the `supported_langs_file` configuration key. -* `--code2parquet_detect_programming_lang` - set the `detect_programming_lang` configuration key. -* `--code2parquet_domain` - set the `domain` configuration key. -* `--code2parquet_snapshot` - set the `snapshot` configuration key. +These correspond to the configuration keys described above. 
### Running the samples To run the samples, use the following `make` targets -* `run-cli-sample` - runs src/code2parquet_transform_ray.py using command line args -* `run-local-sample` - runs src/code2parquet.py -* `run-s3-sample` - runs src/code2parquet.py +* `run-cli-sample` - runs src/zip2parquet_transform_ray.py using command line args +* `run-local-sample` - runs src/zip2parquet.py +* `run-s3-sample` - runs src/zip2parquet.py * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) and [here](https://min.io/docs/minio/linux/index.html) and invocation of `make minio-start` to load data into local minio for S3 access. diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/universal/zip2parquet/python/pyproject.toml similarity index 87% rename from transforms/code/code2parquet/python/pyproject.toml rename to transforms/universal/zip2parquet/python/pyproject.toml index b8c97541d..23451c0d7 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/universal/zip2parquet/python/pyproject.toml @@ -1,13 +1,13 @@ [project] -name = "dpk_code2parquet_transform_python" +name = "dpk_zip2parquet_transform_python" version = "0.2.1.dev0" requires-python = ">=3.10" -description = "code2parquet Python Transform" +description = "zip2parquet Python Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ "data-prep-toolkit==0.2.1.dev0", diff --git a/transforms/code/code2parquet/python/src/code2parquet_local.py b/transforms/universal/zip2parquet/python/src/zip2parquet_local.py similarity index 91% rename from transforms/code/code2parquet/python/src/code2parquet_local.py rename to 
transforms/universal/zip2parquet/python/src/zip2parquet_local.py index 8ebd4370b..15d0f7ae1 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_local.py +++ b/transforms/universal/zip2parquet/python/src/zip2parquet_local.py @@ -13,11 +13,14 @@ import ast import os -from code2parquet_transform import ( # domain_key,; snapshot_key, +from zip2parquet_transform import ( # domain_key,; snapshot_key, CodeToParquetTransform, data_factory_key, detect_programming_lang_key, supported_langs_file_key, + domain_key, + snapshot_key, + code_data_key ) from data_processing.data_access import DataAccessFactory, DataAccessLocal @@ -30,8 +33,9 @@ params = { supported_langs_file_key: supported_languages_file, detect_programming_lang_key: True, - # snapshot_key: "github", - # domain_key: "code", + snapshot_key: "github", + domain_key: "code", + #code_data_key: False, "data_files_to_use": ast.literal_eval("['.zip']"), data_factory_key: DataAccessFactory(), # Expect to create DataAccessLocal } diff --git a/transforms/code/code2parquet/python/src/code2parquet_local_python.py b/transforms/universal/zip2parquet/python/src/zip2parquet_local_python.py similarity index 93% rename from transforms/code/code2parquet/python/src/code2parquet_local_python.py rename to transforms/universal/zip2parquet/python/src/zip2parquet_local_python.py index 66713a02f..94afa5ee7 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_local_python.py +++ b/transforms/universal/zip2parquet/python/src/zip2parquet_local_python.py @@ -14,11 +14,11 @@ import os import sys -from code2parquet_transform import ( # domain_key,; snapshot_key, +from zip2parquet_transform import ( # domain_key,; snapshot_key, detect_programming_lang_cli_key, supported_langs_file_cli_key, ) -from code2parquet_transform_python import CodeToParquetPythonConfiguration +from zip2parquet_transform_python import CodeToParquetPythonConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from 
data_processing.utils import ParamsUtils diff --git a/transforms/code/code2parquet/python/src/code2parquet_s3_python.py b/transforms/universal/zip2parquet/python/src/zip2parquet_s3_python.py similarity index 93% rename from transforms/code/code2parquet/python/src/code2parquet_s3_python.py rename to transforms/universal/zip2parquet/python/src/zip2parquet_s3_python.py index ca26b19cd..2dad7a854 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_s3_python.py +++ b/transforms/universal/zip2parquet/python/src/zip2parquet_s3_python.py @@ -13,11 +13,11 @@ import ast import sys -from code2parquet_transform import ( # domain_key,; snapshot_key, +from zip2parquet_transform import ( # domain_key,; snapshot_key, detect_programming_lang_cli_key, supported_langs_file_cli_key, ) -from code2parquet_transform_python import CodeToParquetPythonConfiguration +from zip2parquet_transform_python import CodeToParquetPythonConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import GB, ParamsUtils diff --git a/transforms/code/code2parquet/python/src/code2parquet_transform.py b/transforms/universal/zip2parquet/python/src/zip2parquet_transform.py similarity index 87% rename from transforms/code/code2parquet/python/src/code2parquet_transform.py rename to transforms/universal/zip2parquet/python/src/zip2parquet_transform.py index 7caf4c6eb..a1a7c841c 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_transform.py +++ b/transforms/universal/zip2parquet/python/src/zip2parquet_transform.py @@ -30,7 +30,7 @@ from data_processing.utils import CLIArgumentProvider, TransformUtils, str2bool -shortname = "code2parquet" +shortname = "zip2parquet" cli_prefix = f"{shortname}_" supported_langs_file_key = "supported_langs_file" @@ -43,6 +43,14 @@ detect_programming_lang_cli_key = f"{cli_prefix}{detect_programming_lang_key}" detect_programming_lang_default = True +code_data_key = "code_data" +code_data_cli_key = 
f"{cli_prefix}{code_data_key}" +code_data_default = True + +programming_language_column_key = "programming_language_column" +programming_language_column_cli_key = f"{cli_prefix}{programming_language_column_key}" +programming_language_column_default = "programming_language" + data_factory_key = "data_factory" domain_key = "domain" @@ -91,6 +99,9 @@ def __init__(self, config: dict): raise RuntimeError( "Programming language detection requested without providing a mapping of extensions to languages" ) + self.programming_language_column = config.get(programming_language_column_key, + programming_language_column_default) + self.treat_as_code = config.get(code_data_key, code_data_default) domain = config.get(domain_key, None) snapshot = config.get(domain_key, None) self.shared_columns = {} @@ -138,9 +149,11 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl "date_acquired": datetime.now().isoformat(), "repo_name": os.path.splitext(os.path.basename(file_name))[0], } | self.shared_columns - if self.detect_programming_lang: + # extra processing for code + if self.treat_as_code and self.detect_programming_lang: lang = self._get_lang_from_ext(ext) - row_data["programming_language"] = lang # TODO column name should be configurable + if lang is not None: + row_data[self.programming_language_column] = lang data.append(row_data) number_of_rows += 1 else: @@ -178,13 +191,25 @@ def add_input_params(self, parser: ArgumentParser) -> None: (e.g, noop_, pii_, etc.) 
""" parser.add_argument( - f"--{cli_prefix}{supported_langs_file_key}", + f"--{code_data_cli_key}", + type=lambda x: bool(str2bool(x)), + default=code_data_default, + help="flag to process files as code" + ) + parser.add_argument( + f"--{programming_language_column_cli_key}", + type=str, + default=programming_language_column_default, + help="Path to file containing the list of supported languages", + ) + parser.add_argument( + f"--{supported_langs_file_cli_key}", type=str, default=None, help="Path to file containing the list of supported languages", ) parser.add_argument( - f"--{cli_prefix}{detect_programming_lang_key}", + f"--{detect_programming_lang_cli_key}", type=lambda x: bool(str2bool(x)), default=detect_programming_lang_default, help="Infer the programming lang from the file extension using the file of supported languages", diff --git a/transforms/code/code2parquet/python/src/code2parquet_transform_python.py b/transforms/universal/zip2parquet/python/src/zip2parquet_transform_python.py similarity index 89% rename from transforms/code/code2parquet/python/src/code2parquet_transform_python.py rename to transforms/universal/zip2parquet/python/src/zip2parquet_transform_python.py index ea09a1808..b35b9b6e7 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_transform_python.py +++ b/transforms/universal/zip2parquet/python/src/zip2parquet_transform_python.py @@ -10,12 +10,9 @@ # limitations under the License. 
################################################################################ -from code2parquet_transform import ( +from zip2parquet_transform import ( CodeToParquetTransform, CodeToParquetTransformConfiguration, - data_factory_key, - get_supported_languages, - supported_langs_file_key, ) from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.runtime.pure_python.runtime_configuration import ( @@ -35,5 +32,5 @@ def __init__(self): if __name__ == "__main__": # launcher = NOOPRayLauncher() launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration()) - logger.info("Launching noop transform") + logger.info("Launching zip2parquet transform") launcher.launch() diff --git a/transforms/code/code2parquet/python/test-data/expected/application-java.parquet b/transforms/universal/zip2parquet/python/test-data/expected/application-java.parquet similarity index 60% rename from transforms/code/code2parquet/python/test-data/expected/application-java.parquet rename to transforms/universal/zip2parquet/python/test-data/expected/application-java.parquet index 68be60a25..aea898a1f 100644 Binary files a/transforms/code/code2parquet/python/test-data/expected/application-java.parquet and b/transforms/universal/zip2parquet/python/test-data/expected/application-java.parquet differ diff --git a/transforms/code/code2parquet/python/test-data/expected/data-processing-lib.parquet b/transforms/universal/zip2parquet/python/test-data/expected/data-processing-lib.parquet similarity index 88% rename from transforms/code/code2parquet/python/test-data/expected/data-processing-lib.parquet rename to transforms/universal/zip2parquet/python/test-data/expected/data-processing-lib.parquet index f9c39bb6f..5c252f7db 100644 Binary files a/transforms/code/code2parquet/python/test-data/expected/data-processing-lib.parquet and b/transforms/universal/zip2parquet/python/test-data/expected/data-processing-lib.parquet differ diff --git 
a/transforms/code/code2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet b/transforms/universal/zip2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet similarity index 90% rename from transforms/code/code2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet rename to transforms/universal/zip2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet index bee0b0abc..1a3df408a 100644 Binary files a/transforms/code/code2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet and b/transforms/universal/zip2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet differ diff --git a/transforms/universal/zip2parquet/python/test-data/expected/metadata.json b/transforms/universal/zip2parquet/python/test-data/expected/metadata.json new file mode 100644 index 000000000..f1e9bd50d --- /dev/null +++ b/transforms/universal/zip2parquet/python/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "code2parquet", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-08-21 08:31:37", + "end_time": "2024-08-21 08:31:37", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "code_data": true, + "programming_language_column": "programming_language", + "supported_langs_file": "/Users/borisl/Projects/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json", + "detect_programming_lang": true, + "snapshot": null, + "domain": null, + "s3_cred": null, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [ + 
".zip" + ], + "num_processors": 0 + }, + "job_output_stats": { + "source_files": 3, + "source_size": 33885652, + "result_files": 3, + "result_size": 70169, + "processing_time": 0.107, + "number of rows": 74 + }, + "source": { + "name": "/Users/borisl/Projects/data-prep-kit/transforms/code/code2parquet/python/test-data/input", + "type": "path" + }, + "target": { + "name": "/Users/borisl/Projects/data-prep-kit/transforms/code/code2parquet/python/output", + "type": "path" + } +} \ No newline at end of file diff --git a/transforms/code/code2parquet/python/test-data/input/application-java.zip b/transforms/universal/zip2parquet/python/test-data/input/application-java.zip similarity index 100% rename from transforms/code/code2parquet/python/test-data/input/application-java.zip rename to transforms/universal/zip2parquet/python/test-data/input/application-java.zip diff --git a/transforms/code/code2parquet/python/test-data/input/data-processing-lib.zip b/transforms/universal/zip2parquet/python/test-data/input/data-processing-lib.zip similarity index 100% rename from transforms/code/code2parquet/python/test-data/input/data-processing-lib.zip rename to transforms/universal/zip2parquet/python/test-data/input/data-processing-lib.zip diff --git a/transforms/code/code2parquet/python/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip b/transforms/universal/zip2parquet/python/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip similarity index 100% rename from transforms/code/code2parquet/python/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip rename to transforms/universal/zip2parquet/python/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip diff --git a/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json b/transforms/universal/zip2parquet/python/test-data/languages/lang_extensions.json similarity index 100% rename 
from transforms/code/code2parquet/python/test-data/languages/lang_extensions.json rename to transforms/universal/zip2parquet/python/test-data/languages/lang_extensions.json diff --git a/transforms/code/code2parquet/python/test/test_code2parquet.py b/transforms/universal/zip2parquet/python/test/test_zip2parquet.py similarity index 97% rename from transforms/code/code2parquet/python/test/test_code2parquet.py rename to transforms/universal/zip2parquet/python/test/test_zip2parquet.py index 22524264b..aa7090050 100644 --- a/transforms/code/code2parquet/python/test/test_code2parquet.py +++ b/transforms/universal/zip2parquet/python/test/test_zip2parquet.py @@ -12,7 +12,7 @@ import os -from code2parquet_transform import ( # domain_key,; snapshot_key, +from zip2parquet_transform import ( # domain_key,; snapshot_key, CodeToParquetTransform, data_factory_key, detect_programming_lang_key, diff --git a/transforms/code/code2parquet/python/test/test_code2parquet_python.py b/transforms/universal/zip2parquet/python/test/test_zip2parquet_python.py similarity index 93% rename from transforms/code/code2parquet/python/test/test_code2parquet_python.py rename to transforms/universal/zip2parquet/python/test/test_zip2parquet_python.py index cee24e09f..912cecc2c 100644 --- a/transforms/code/code2parquet/python/test/test_code2parquet_python.py +++ b/transforms/universal/zip2parquet/python/test/test_zip2parquet_python.py @@ -13,13 +13,13 @@ import ast import os -from code2parquet_transform import ( # domain_key,; snapshot_key, +from zip2parquet_transform import ( # domain_key,; snapshot_key, detect_programming_lang_cli_key, domain_cli_key, snapshot_cli_key, supported_langs_file_cli_key, ) -from code2parquet_transform_python import CodeToParquetPythonConfiguration +from zip2parquet_transform_python import CodeToParquetPythonConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.test_support.launch.transform_test import ( 
AbstractTransformLauncherTest, diff --git a/transforms/code/code2parquet/ray/.dockerignore b/transforms/universal/zip2parquet/ray/.dockerignore similarity index 100% rename from transforms/code/code2parquet/ray/.dockerignore rename to transforms/universal/zip2parquet/ray/.dockerignore diff --git a/transforms/code/code2parquet/python/.gitignore b/transforms/universal/zip2parquet/ray/.gitignore similarity index 100% rename from transforms/code/code2parquet/python/.gitignore rename to transforms/universal/zip2parquet/ray/.gitignore diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/universal/zip2parquet/ray/Dockerfile similarity index 88% rename from transforms/code/code2parquet/ray/Dockerfile rename to transforms/universal/zip2parquet/ray/Dockerfile index 495acbb54..27fcc2e2c 100644 --- a/transforms/code/code2parquet/ray/Dockerfile +++ b/transforms/universal/zip2parquet/ray/Dockerfile @@ -19,19 +19,19 @@ COPY --chown=ray:users python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . # Install ray project source -COPY --chown=ray:users src/ src/ +COPY --chown=ray:users src src/ COPY --chown=ray:users pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY src/code2parquet_transform_ray.py . +COPY src/zip2parquet_transform_ray.py . 
# copy some of the samples in -COPY src/code2parquet_local_ray.py local/ +COPY src/zip2parquet_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY test test/ +COPY test-data test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/code2parquet/ray/Makefile b/transforms/universal/zip2parquet/ray/Makefile similarity index 82% rename from transforms/code/code2parquet/ray/Makefile rename to transforms/universal/zip2parquet/ray/Makefile index bc1580987..945285687 100644 --- a/transforms/code/code2parquet/ray/Makefile +++ b/transforms/universal/zip2parquet/ray/Makefile @@ -6,7 +6,7 @@ REPOROOT=../../../.. # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=code2parquet +TRANSFORM_NAME=zip2parquet BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv @@ -33,7 +33,7 @@ setup:: .transforms.setup # set the version of python transform that this depends on. set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE2PARQUET_PYTHON_VERSION) TOML_VERSION=$(CODE2PARQUET_RAY_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(ZIP2PARQUET_PYTHON_VERSION) TOML_VERSION=$(ZIP2PARQUET_RAY_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist @@ -44,8 +44,8 @@ run-cli-sample: RUN_ARGS="--run_locally True \ --data_local_config \" { 'input_folder' : '../test-data/input', 'output_folder' : '../output' } \" \ --data_files_to_use \"['.zip']\" \ - --code2parquet_supported_langs_file ../test-data/languages/lang_extensions.json \ - --code2parquet_detect_programming_lang True " \ + --zip2parquet_supported_langs_file ../test-data/languages/lang_extensions.json \ + --zip2parquet_detect_programming_lang True " \ .transforms.run-src-file run-local-sample: .transforms.run-local-ray-sample diff --git a/transforms/code/code2parquet/ray/README.md b/transforms/universal/zip2parquet/ray/README.md similarity index 77% rename from 
transforms/code/code2parquet/ray/README.md rename to transforms/universal/zip2parquet/ray/README.md index 893f45375..1f49f625c 100644 --- a/transforms/code/code2parquet/ray/README.md +++ b/transforms/universal/zip2parquet/ray/README.md @@ -1,15 +1,15 @@ -# NOOP Ray Transform +# ZIP2Parquet Ray Transform Please see the set of [transform project conventions](../../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. ## Summary -This project wraps the [code2parquet transform](../python) with a Ray runtime. +This project wraps the [zip2parquet transform](../python) with a Ray runtime. ## Configuration and command line Options -code2parquet transform configuration and command line options are the same as for the base python transform. +The zip2parquet transform configuration and command line options are the same as for the base python transform. ## Running @@ -21,9 +21,9 @@ the set of ### Running the samples To run the samples, use the following `make` targets -* `run-cli-sample` - runs src/code2parquet_transform.py using command line args -* `run-local-sample` - runs src/code2parquet_local_ray.py -* `run-s3-sample` - runs src/code2parquet_s3_ray.py +* `run-cli-sample` - runs src/zip2parquet_transform_ray.py using command line args +* `run-local-sample` - runs src/zip2parquet_local_ray.py +* `run-s3-sample` - runs src/zip2parquet_s3_ray.py * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) and [here](https://min.io/docs/minio/linux/index.html) and invocation of `make minio-start` to load data into local minio for S3 access. 
diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/universal/zip2parquet/ray/pyproject.toml similarity index 84% rename from transforms/code/code2parquet/ray/pyproject.toml rename to transforms/universal/zip2parquet/ray/pyproject.toml index f610754d0..ac24763fa 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/universal/zip2parquet/ray/pyproject.toml @@ -1,17 +1,17 @@ [project] -name = "dpk_code2parquet_transform_ray" +name = "dpk_zip2parquet_transform_ray" version = "0.2.1.dev0" requires-python = ">=3.10" -description = "code2parquet Ray Transform" +description = "zip2parquet Ray Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ "data-prep-toolkit-ray==0.2.1.dev0", - "dpk-code2parquet-transform-python==0.2.1.dev0", + "dpk-zip2parquet-transform-python==0.2.1.dev0", "parameterized", "pandas", ] diff --git a/transforms/code/code2parquet/ray/src/code2parquet_local_ray.py b/transforms/universal/zip2parquet/ray/src/zip2parquet_local_ray.py similarity index 95% rename from transforms/code/code2parquet/ray/src/code2parquet_local_ray.py rename to transforms/universal/zip2parquet/ray/src/zip2parquet_local_ray.py index 1f2e4a008..b8dc3dd99 100644 --- a/transforms/code/code2parquet/ray/src/code2parquet_local_ray.py +++ b/transforms/universal/zip2parquet/ray/src/zip2parquet_local_ray.py @@ -14,11 +14,12 @@ import os import sys -from code2parquet_transform import ( +from zip2parquet_transform import ( detect_programming_lang_cli_key, supported_langs_file_cli_key, + ) -from code2parquet_transform_ray import CodeToParquetRayConfiguration +from zip2parquet_transform_ray import CodeToParquetRayConfiguration from data_processing.utils import GB, ParamsUtils from 
data_processing_ray.runtime.ray import RayTransformLauncher diff --git a/transforms/code/code2parquet/ray/src/code2parquet_s3_ray.py b/transforms/universal/zip2parquet/ray/src/zip2parquet_s3_ray.py similarity index 95% rename from transforms/code/code2parquet/ray/src/code2parquet_s3_ray.py rename to transforms/universal/zip2parquet/ray/src/zip2parquet_s3_ray.py index 783edd60c..bd156a2e2 100644 --- a/transforms/code/code2parquet/ray/src/code2parquet_s3_ray.py +++ b/transforms/universal/zip2parquet/ray/src/zip2parquet_s3_ray.py @@ -13,11 +13,11 @@ import ast import sys -from code2parquet_transform import ( +from zip2parquet_transform import ( detect_programming_lang_cli_key, supported_langs_file_cli_key, ) -from code2parquet_transform_ray import CodeToParquetRayConfiguration +from zip2parquet_transform_ray import CodeToParquetRayConfiguration from data_processing.utils import GB, ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher diff --git a/transforms/code/code2parquet/ray/src/code2parquet_transform_ray.py b/transforms/universal/zip2parquet/ray/src/zip2parquet_transform_ray.py similarity index 99% rename from transforms/code/code2parquet/ray/src/code2parquet_transform_ray.py rename to transforms/universal/zip2parquet/ray/src/zip2parquet_transform_ray.py index 5c81ca910..4a88b9cf0 100644 --- a/transforms/code/code2parquet/ray/src/code2parquet_transform_ray.py +++ b/transforms/universal/zip2parquet/ray/src/zip2parquet_transform_ray.py @@ -13,7 +13,7 @@ from typing import Any import ray -from code2parquet_transform import ( +from zip2parquet_transform import ( CodeToParquetTransform, CodeToParquetTransformConfiguration, data_factory_key, diff --git a/transforms/code/code2parquet/ray/test-data/expected/application-java.parquet b/transforms/universal/zip2parquet/ray/test-data/expected/application-java.parquet similarity index 60% rename from transforms/code/code2parquet/ray/test-data/expected/application-java.parquet rename to 
transforms/universal/zip2parquet/ray/test-data/expected/application-java.parquet index 68be60a25..aea898a1f 100644 Binary files a/transforms/code/code2parquet/ray/test-data/expected/application-java.parquet and b/transforms/universal/zip2parquet/ray/test-data/expected/application-java.parquet differ diff --git a/transforms/code/code2parquet/ray/test-data/expected/data-processing-lib.parquet b/transforms/universal/zip2parquet/ray/test-data/expected/data-processing-lib.parquet similarity index 88% rename from transforms/code/code2parquet/ray/test-data/expected/data-processing-lib.parquet rename to transforms/universal/zip2parquet/ray/test-data/expected/data-processing-lib.parquet index f9c39bb6f..5c252f7db 100644 Binary files a/transforms/code/code2parquet/ray/test-data/expected/data-processing-lib.parquet and b/transforms/universal/zip2parquet/ray/test-data/expected/data-processing-lib.parquet differ diff --git a/transforms/code/code2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet b/transforms/universal/zip2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet similarity index 90% rename from transforms/code/code2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet rename to transforms/universal/zip2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet index bee0b0abc..1a3df408a 100644 Binary files a/transforms/code/code2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet and b/transforms/universal/zip2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet differ diff --git a/transforms/universal/zip2parquet/ray/test-data/expected/metadata.json b/transforms/universal/zip2parquet/ray/test-data/expected/metadata.json new file mode 100644 index 
000000000..f1e9bd50d --- /dev/null +++ b/transforms/universal/zip2parquet/ray/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "code2parquet", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-08-21 08:31:37", + "end_time": "2024-08-21 08:31:37", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "code_data": true, + "programming_language_column": "programming_language", + "supported_langs_file": "/Users/borisl/Projects/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json", + "detect_programming_lang": true, + "snapshot": null, + "domain": null, + "s3_cred": null, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [ + ".zip" + ], + "num_processors": 0 + }, + "job_output_stats": { + "source_files": 3, + "source_size": 33885652, + "result_files": 3, + "result_size": 70169, + "processing_time": 0.107, + "number of rows": 74 + }, + "source": { + "name": "/Users/borisl/Projects/data-prep-kit/transforms/code/code2parquet/python/test-data/input", + "type": "path" + }, + "target": { + "name": "/Users/borisl/Projects/data-prep-kit/transforms/code/code2parquet/python/output", + "type": "path" + } +} \ No newline at end of file diff --git a/transforms/code/code2parquet/ray/test-data/input/application-java.zip b/transforms/universal/zip2parquet/ray/test-data/input/application-java.zip similarity index 100% rename from transforms/code/code2parquet/ray/test-data/input/application-java.zip rename to transforms/universal/zip2parquet/ray/test-data/input/application-java.zip diff --git a/transforms/code/code2parquet/ray/test-data/input/data-processing-lib.zip b/transforms/universal/zip2parquet/ray/test-data/input/data-processing-lib.zip similarity index 100% rename from 
transforms/code/code2parquet/ray/test-data/input/data-processing-lib.zip rename to transforms/universal/zip2parquet/ray/test-data/input/data-processing-lib.zip diff --git a/transforms/code/code2parquet/ray/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip b/transforms/universal/zip2parquet/ray/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip similarity index 100% rename from transforms/code/code2parquet/ray/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip rename to transforms/universal/zip2parquet/ray/test-data/input/https___github.com_00000o1_environments_archive_refs_heads_master.zip diff --git a/transforms/code/code2parquet/ray/test-data/languages/lang_extensions.json b/transforms/universal/zip2parquet/ray/test-data/languages/lang_extensions.json similarity index 100% rename from transforms/code/code2parquet/ray/test-data/languages/lang_extensions.json rename to transforms/universal/zip2parquet/ray/test-data/languages/lang_extensions.json diff --git a/transforms/code/code2parquet/ray/test/test_code2parquet_ray.py b/transforms/universal/zip2parquet/ray/test/test_zip2parquet_ray.py similarity index 95% rename from transforms/code/code2parquet/ray/test/test_code2parquet_ray.py rename to transforms/universal/zip2parquet/ray/test/test_zip2parquet_ray.py index e05cba502..7d3074622 100644 --- a/transforms/code/code2parquet/ray/test/test_code2parquet_ray.py +++ b/transforms/universal/zip2parquet/ray/test/test_zip2parquet_ray.py @@ -13,7 +13,7 @@ import ast import os -from code2parquet_transform import ( +from zip2parquet_transform import ( detect_programming_lang_cli_key, detect_programming_lang_key, domain_cli_key, @@ -21,7 +21,7 @@ supported_langs_file_cli_key, supported_langs_file_key, ) -from code2parquet_transform_ray import CodeToParquetRayConfiguration +from zip2parquet_transform_ray import CodeToParquetRayConfiguration from 
data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, )