From 8acb197e17e8b928e1d978f7d75230a9a6a289a2 Mon Sep 17 00:00:00 2001 From: Jacob Wujciak-Jens Date: Mon, 1 Apr 2024 13:50:16 -0700 Subject: [PATCH] Move signature check and bias fuzzer to gha (#9191) Summary: Move the signature check and bias fuzzer from CCI to GHA. Pull Request resolved: https://github.com/facebookincubator/velox/pull/9191 Reviewed By: Yuhta Differential Revision: D55599384 Pulled By: kgpai fbshipit-source-id: b998a064319f451ebe2d3508f3dbca3d165a669a --- .circleci/README.md | 89 ----- .circleci/config.yml | 25 +- .circleci/dist_compile.yml | 532 ------------------------- .github/workflows/experimental.yml | 4 +- .github/workflows/linux-build.yml | 39 +- .github/workflows/scheduled.yml | 372 +++++++++++------ CMakeLists.txt | 11 - Makefile | 4 +- pyvelox/CMakeLists.txt | 19 +- scripts/adapters.dockerfile | 2 +- {.circleci => scripts}/hdfs-client.xml | 0 scripts/signature.py | 137 ++++++- setup.py | 15 +- 13 files changed, 423 insertions(+), 826 deletions(-) delete mode 100644 .circleci/README.md delete mode 100644 .circleci/dist_compile.yml rename {.circleci => scripts}/hdfs-client.xml (100%) diff --git a/.circleci/README.md b/.circleci/README.md deleted file mode 100644 index 31e80ebd3450..000000000000 --- a/.circleci/README.md +++ /dev/null @@ -1,89 +0,0 @@ -CircleCi integration is controlled by the `./circleci/config.yml` file. Our -config currently contains two workflows. One is triggered on every pull request update. -The other workflow runs nightly to verify our compatibility with prestodb internal protocol. - -The PR workflow is named `dist-compile` and has 4 jobs, 2 to build and run unit tests on linux and macos -and 2 to check code formatting and license headers: -* linux-build -* macos-build -* format-check -* header-check - -## Running locally - -The linux container based jobs can be run locally using the `circleci` cli: - -``` - circleci local execute --job JOB_NAME -``` - -For example to run unit tests use: - -``` - circleci local execute --job linux-build -``` - -A Nightly build with prestodb/master sync checks that the presto_protocol library -remains in sync with Presto Java. - -Run the nightly sync job locally: -``` - circleci local execute --job presto-sync -``` - -## Install CircleCi cli -``` - curl -fLSs https://circle.ci/cli | bash -``` - -To use containers Docker must be installed. Here are instructions to [Install -Docker on macos](https://docs.docker.com/docker-for-mac/install/). Docker deamon -must be running before issuing the `circleci` commands. - -### Macos testing - -Macos testing is done by using the CircleCi macos executor and installing -dependencies each time the job is run. This executor cannot be run locally. -The script `scripts/setup-macos.sh` contains commands that are run as part of -this job to install these dependencies. - -### Linux testing - -Linux testing uses a Docker container. The container build depends on the Velox CircleCi container. Check -velox/.circleci/config.yml to see that the base container in circleci-container.dockfile is using the latest. -The container build uses Docker and should be run on your macos or linux laptop with Docker installed and -running. - -#### Build the base container: - -* In an up-to-date clone of velox (maybe you have one?) - -``` -git clone git@github.com:facebookincubator/velox.git -cd velox -make base-container -``` -* Wait - This step takes rather a long time. It is building clang-format v8 to be compatible with fbcode -* When the base container is finished the new container name will be printed on the console. -* Push the container to DockerHub -``` -docker push prestocpp/base-container:$USER-YYYYMMDD -``` -* After the push, update `scripts/velox-container.dockfile` with the newly build base container name - -#### Build the dependencies container - -* If you have a new base-container update scripts/velox-container.dockfile to refer to it -* Build the velox container -``` -make velox-container.dockfile -``` -* Wait - This takes a few minutes, but not nearly as long as the base container. -* When the velox container is finished the new container name will be printed on the console. -* Push the container to DockerHub -``` -docker push prestocpp/velox-container:$USER-YYYYMMDD -``` -* Update `.circleci/config.yml` with the newly built circleci container name. - There are two places in the config.yml file that refer to the container, update - both. diff --git a/.circleci/config.yml b/.circleci/config.yml index 303f1356521e..b121fa7dfd36 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -17,28 +17,17 @@ version: 2.1 # This allows us to use CircleCI's dynamic configuration feature setup: true - -# Path-filtering orb is required to continue a pipeline based on -# the path of an updated fileset -orbs: - path-filtering: circleci/path-filtering@0.1.1 +jobs: + noop-build: + docker: + - image: cimg/base:2024.02 + steps: + - run: circleci-agent step halt workflows: version: 2 path-filtering-workflow: jobs: + - noop-build - - path-filtering/filter: - name: check-sensitive-paths - - # Format is: - # Regex below will filter out paths with test in them. - mapping: | - velox/expression/((?!.*test).*).* run-longer-expression-fuzzer true - velox/exec/((?!.*test).*).* run-longer-expression-fuzzer true - velox/common/((?!.*test).*).* run-longer-expression-fuzzer true - velox/core/((?!.*test).*).* run-longer-expression-fuzzer true - velox/vector/((?!.*test).*).* run-longer-expression-fuzzer true - - config-path: .circleci/dist_compile.yml diff --git a/.circleci/dist_compile.yml b/.circleci/dist_compile.yml deleted file mode 100644 index 7d607d100a97..000000000000 --- a/.circleci/dist_compile.yml +++ /dev/null @@ -1,532 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -version: 2.1 - - -# Default pipeline parameters, which will be updated according to -# the results of the path-filtering orb -parameters: - run-longer-expression-fuzzer: - type: boolean - default: false - -commands: - update-submodules: - steps: - - run: - name: "Update Submodules" - command: | - git submodule sync --recursive - git submodule update --init --recursive - - setup-environment: - steps: - - run: - name: "Setup Environment" - command: | - # Calculate ccache key. - git show -s --format=%cd --date="format:%Y%m%d" $(git merge-base origin/main HEAD) | tee merge-base-date - - # Set up xml gtest output. - mkdir -p /tmp/test_xml_output/ - echo "export XML_OUTPUT_FILE=\"/tmp/test_xml_output/\"" >> $BASH_ENV - - # Set up ccache configs. - mkdir -p .ccache - echo "export CCACHE_DIR=$(realpath .ccache)" >> $BASH_ENV - ccache -sz -M 5Gi - if [ -e /opt/rh/gcc-toolset-9/enable ]; then - source /opt/rh/gcc-toolset-9/enable - fi - - restore_cache: - name: "Restore CCache Cache" - keys: - - velox-ccache-debug-{{ arch }}-{{ checksum "merge-base-date" }} - - pre-steps: - steps: - - checkout - - update-submodules - - setup-environment - - post-steps: - steps: - - save_cache: - name: "Save CCache Cache" - key: velox-ccache-debug-{{ arch }}-{{ checksum "merge-base-date" }} - paths: - - .ccache/ - - store_artifacts: - path: '_build/debug/.ninja_log' - - store_test_results: - path: '/tmp/test_xml_output/' - - build-benchmarks: - parameters: - binary_output: - type: string - benchmark_class: - type: string - steps: - - run: - name: "Build Benchmarks - << parameters.benchmark_class >>" - command: | - make benchmarks-basic-build NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4 - ccache -s - mkdir -p << parameters.binary_output >> - cp -r --verbose _build/release/velox/benchmarks/basic/* << parameters.binary_output >> - - fuzzer-run: - parameters: - fuzzer_repro: - type: string - fuzzer_output: - type: string - fuzzer_name: - type: string - fuzzer_exe: - type: string - fuzzer_args: - type: string - steps: - - pre-steps - - run: - name: Build - command: | - make debug NUM_THREADS=8 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=4 - ccache -s - no_output_timeout: 1h - - run: - name: "Run << parameters.fuzzer_name >> Fuzzer" - command: | - eval ' << parameters.fuzzer_exe >> << parameters.fuzzer_args >> ' \ - 2>&1 | tee "<< parameters.fuzzer_output >>" || ( \ - tail -n 1000 "<< parameters.fuzzer_output >>" ; \ - echo "FAIL: << parameters.fuzzer_name >> run failed"; \ - exit 1; \ - ) - echo -e "\n << parameters.fuzzer_name >> run finished successfully." - no_output_timeout: 120m - - store_artifacts: - path: << parameters.fuzzer_output >> - - store_artifacts: - path: << parameters.fuzzer_repro >> - - post-steps - -executors: - build: - docker: - - image : ghcr.io/facebookincubator/velox-dev:circleci-avx - resource_class: 2xlarge - environment: - CC: /opt/rh/gcc-toolset-9/root/bin/gcc - CXX: /opt/rh/gcc-toolset-9/root/bin/g++ - VELOX_DEPENDENCY_SOURCE: BUNDLED - simdjson_SOURCE: BUNDLED - check: - docker: - - image : ghcr.io/facebookincubator/velox-dev:check-avx - -jobs: - linux-build: - executor: build - environment: - DuckDB_SOURCE: SYSTEM - steps: - - pre-steps - - run: - name: "Build" - command: | - make debug NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4 EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON" - ccache -s - no_output_timeout: 1h - - run: - name: "Run Unit Tests" - command: | - cd _build/debug && ctest -j 16 -VV --output-on-failure --no-tests=error - no_output_timeout: 1h - - store_test_results: - path: /tmp/test_xml_output/ - - run: - name: "Run Fuzzer Tests" - # Run fuzzer using the built executable - we do this instead of make - # since currently make fuzzertest tends to rebuild the project. - command: | - mkdir -p /tmp/fuzzer_repro/ - chmod -R 777 /tmp/fuzzer_repro - _build/debug/velox/expression/tests/velox_expression_fuzzer_test \ - --seed ${RANDOM} \ - --enable_variadic_signatures \ - --velox_fuzzer_enable_complex_types \ - --lazy_vector_generation_ratio 0.2 \ - --velox_fuzzer_enable_column_reuse \ - --velox_fuzzer_enable_expression_reuse \ - --max_expression_trees_per_step 2 \ - --retry_with_try \ - --enable_dereference \ - --duration_sec 60 \ - --logtostderr=1 \ - --minloglevel=0 \ - --repro_persist_path=/tmp/fuzzer_repro \ - && echo -e "\n\nFuzzer run finished successfully." - no_output_timeout: 5m - - store_artifacts: - path: '/tmp/fuzzer_repro' - - run: - name: "Run Spark Fuzzer Tests" - command: | - mkdir -p /tmp/spark_fuzzer_repro/ - chmod -R 777 /tmp/spark_fuzzer_repro - _build/debug/velox/expression/tests/spark_expression_fuzzer_test \ - --seed ${RANDOM} \ - --duration_sec 60 \ - --enable_variadic_signatures \ - --lazy_vector_generation_ratio 0.2 \ - --velox_fuzzer_enable_column_reuse \ - --velox_fuzzer_enable_expression_reuse \ - --max_expression_trees_per_step 2 \ - --retry_with_try \ - --enable_dereference \ - --logtostderr=1 \ - --minloglevel=0 \ - --repro_persist_path=/tmp/spark_fuzzer_repro \ - && echo -e "\n\nSpark Fuzzer run finished successfully." - no_output_timeout: 5m - - store_artifacts: - path: '/tmp/spark_fuzzer_repro' - - run: - name: "Run Spark Aggregate Fuzzer Tests" - command: | - mkdir -p /tmp/spark_aggregate_fuzzer_repro/ - chmod -R 777 /tmp/spark_aggregate_fuzzer_repro - _build/debug/velox/functions/sparksql/fuzzer/spark_aggregation_fuzzer_test \ - --seed ${RANDOM} \ - --duration_sec 60 \ - --logtostderr=1 \ - --minloglevel=0 \ - --repro_persist_path=/tmp/spark_aggregate_fuzzer_repro \ - && echo -e "\n\nSpark Aggregation Fuzzer run finished successfully." - no_output_timeout: 5m - - store_artifacts: - path: '/tmp/spark_aggregate_fuzzer_repro' - - run: - name: "Run Aggregate Fuzzer Tests" - # Run aggregation fuzzer using the built executable. - command: | - mkdir -p /tmp/aggregate_fuzzer_repro/ - rm -rfv /tmp/aggregate_fuzzer_repro/* - chmod -R 777 /tmp/aggregate_fuzzer_repro - _build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test \ - --seed ${RANDOM} \ - --duration_sec 1800 \ - --logtostderr=1 \ - --minloglevel=0 \ - --repro_persist_path=/tmp/aggregate_fuzzer_repro \ - && echo -e "\n\nAggregation fuzzer run finished successfully." - no_output_timeout: 5m - - store_artifacts: - path: '/tmp/aggregate_fuzzer_repro' - - run: - name: "Run Join Fuzzer Tests" - command: | - _build/debug/velox/exec/tests/velox_join_fuzzer_test \ - --seed ${RANDOM} \ - --duration_sec 1800 \ - --logtostderr=1 \ - --minloglevel=0 \ - && echo -e "\n\nJoin fuzzer run finished successfully." - no_output_timeout: 5m - - run: - name: "Run Example Binaries" - command: | - find _build/debug/velox/examples/ -maxdepth 1 -type f -executable -exec "{}" \; - - post-steps - - linux-build-release: - executor: build - steps: - - pre-steps - - run: - name: Build - command: | - make release NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=8 - ccache -s - no_output_timeout: 1h - - run: - name: "Run Unit Tests" - command: | - cd _build/release && ctest -j 16 -VV --output-on-failure --no-tests=error - no_output_timeout: 1h - - post-steps - - # Build with different options - linux-build-options: - executor: build - steps: - - pre-steps - - run: - name: "Build Velox Minimal" - command: | - make min_debug NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=16 - ccache -s - no_output_timeout: 1h - - run: - name: "Build Velox With Benchmarks and Without Testing" - command: | - make benchmarks-build NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4 - no_output_timeout: 1h - - post-steps - - linux-adapters: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - ICU_SOURCE: BUNDLED - simdjson_SOURCE: BUNDLED - xsimd_SOURCE: BUNDLED - DuckDB_SOURCE: SYSTEM - steps: - - pre-steps - - run: - name: "Install Java for Hadoop" - command: | - set -xu - yum -y install java-1.8.0-openjdk - - run: - name: Build including all Benchmarks - command: | - EXTRA_CMAKE_FLAGS=( - "-DVELOX_ENABLE_BENCHMARKS=ON" - "-DVELOX_ENABLE_ARROW=ON" - "-DVELOX_ENABLE_PARQUET=ON" - "-DVELOX_ENABLE_HDFS=ON" - "-DVELOX_ENABLE_S3=ON" - "-DVELOX_ENABLE_GCS=ON" - "-DVELOX_ENABLE_ABFS=ON" - "-DVELOX_ENABLE_SUBSTRAIT=ON" - "-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON" - ) - make release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS[*]}" NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=8 - ccache -s - no_output_timeout: 1h - - run: - name: "Run Unit Tests" - command: | - conda init bash - source ~/.bashrc - conda create -y --name testbench python=3.7 - conda activate testbench - pip install https://github.com/googleapis/storage-testbench/archive/refs/tags/v0.36.0.tar.gz - export LC_ALL=C - export JAVA_HOME=/usr/lib/jvm/jre-1.8.0-openjdk - export HADOOP_ROOT_LOGGER="WARN,DRFA" - export LIBHDFS3_CONF=$(pwd)/.circleci/hdfs-client.xml - export HADOOP_HOME='/usr/local/hadoop' - export PATH=/usr/local/hadoop/bin:${PATH} - # The following is used to install Azurite in the CI for running Abfs Hive Connector unit tests. - # Azurite is an emulator for local Azure Storage development, and it is a required component for running Abfs Hive Connector unit tests. - # It can be installed using npm. The following is used to install Node.js and npm for Azurite installation. - curl -sL https://rpm.nodesource.com/setup_10.x | bash - - yum install -y nodejs - npm install -g azurite - cd _build/release && ctest -j 16 -VV --output-on-failure --no-tests=error - no_output_timeout: 1h - - post-steps - - linux-presto-fuzzer-run: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - simdjson_SOURCE: BUNDLED - xsimd_SOURCE: BUNDLED - DuckDB_SOURCE: BUNDLED - steps: - - fuzzer-run: - fuzzer_output: "/tmp/fuzzer.log" - fuzzer_repro: "/tmp/fuzzer_repro" - fuzzer_name: "Expression" - fuzzer_exe: "_build/debug/velox/expression/tests/velox_expression_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --lazy_vector_generation_ratio 0.2 \ - --duration_sec 1800 --enable_variadic_signatures \ - --velox_fuzzer_enable_complex_types \ - --velox_fuzzer_enable_column_reuse \ - --velox_fuzzer_enable_expression_reuse \ - --max_expression_trees_per_step 2 \ - --retry_with_try \ - --enable_dereference \ - --logtostderr=1 --minloglevel=0 \ - --repro_persist_path=/tmp/fuzzer_repro" - - linux-spark-fuzzer-run: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - simdjson_SOURCE: BUNDLED - steps: - - fuzzer-run: - fuzzer_output: "/tmp/spark_fuzzer.log" - fuzzer_repro: "/tmp/spark_fuzzer_repro" - fuzzer_name: "Spark" - fuzzer_exe: "_build/debug/velox/expression/tests/spark_expression_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --duration_sec 600 --logtostderr=1 --minloglevel=0 \ - --repro_persist_path=/tmp/spark_fuzzer_repro" - - linux-spark-aggregate-fuzzer-run: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - simdjson_SOURCE: BUNDLED - steps: - - fuzzer-run: - fuzzer_output: "/tmp/spark_aggregate_fuzzer.log" - fuzzer_repro: "/tmp/spark_aggregate_fuzzer_repro" - fuzzer_name: "SparkAggregate" - fuzzer_exe: "_build/debug/velox/functions/sparksql/fuzzer/spark_aggregation_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --duration_sec 600 --logtostderr=1 --minloglevel=0 \ - --repro_persist_path=/tmp/spark_aggregate_fuzzer_repro" - - - linux-aggregate-fuzzer-run: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - simdjson_SOURCE: BUNDLED - steps: - - fuzzer-run: - fuzzer_output: "/tmp/aggregate_fuzzer.log" - fuzzer_repro: "/tmp/aggregate_fuzzer_repro" - fuzzer_name: "Aggregate" - fuzzer_exe: "_build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --duration_sec 3600 --logtostderr=1 --minloglevel=0 \ - --repro_persist_path=/tmp/aggregate_fuzzer_repro" - - linux-join-fuzzer-run: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - simdjson_SOURCE: BUNDLED - steps: - - fuzzer-run: - fuzzer_output: "/tmp/join_fuzzer.log" - fuzzer_repro: "/tmp/join_fuzzer_repro" - fuzzer_name: "Join" - fuzzer_exe: "_build/debug/velox/exec/tests/velox_join_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --duration_sec 3600 --logtostderr=1 --minloglevel=0" - - linux-pr-fuzzer-run: - executor: build - steps: - - pre-steps - - run: - name: "Get merge base function signatures" - command: | - source ~/.bashrc - conda create -y --name pyveloxenv python=3.7 - conda activate pyveloxenv - cp ./scripts/signature.py /tmp/signature.py - pip install deepdiff - git remote add upstream https://github.com/facebookincubator/velox - git fetch upstream - merge_base=$(git merge-base 'upstream/main' `git rev-parse HEAD`) || \ - { echo "::error::Failed to find merge_base"; exit 1; } - echo "Merge Base: $merge_base" - git checkout $merge_base - git submodule update --init --recursive - LD_LIBRARY_PATH=/usr/local/lib make python-clean - LD_LIBRARY_PATH=/usr/local/lib make python-build - python /tmp/signature.py export --spark spark_merge_base_signatures.json - python /tmp/signature.py export --presto presto_merge_base_signatures.json - - checkout - - run: - name: "Build" - command: | - make debug NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4 EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON" - ccache -s - no_output_timeout: 1h - - run: - name: "Build and test PyVelox" - command: | - conda init bash - source ~/.bashrc - conda activate pyveloxenv - LD_LIBRARY_PATH=/usr/local/lib make python-test - - run: - name: "Check and create bias function signatures" - command: | - source ~/.bashrc - conda activate pyveloxenv - pip install deepdiff - python ./scripts/signature.py export --presto presto_pr_signatures.json - python ./scripts/signature.py export --spark spark_pr_signatures.json - if python ./scripts/signature.py bias presto_merge_base_signatures.json presto_pr_signatures.json /tmp/presto_bias_functions 2>&1 > /tmp/presto-err-message; \ - then echo "Presto signature check success" ; else echo "Presto signature check failed" > /tmp/presto-signature-error-code ; fi - if python ./scripts/signature.py bias spark_merge_base_signatures.json spark_pr_signatures.json /tmp/spark_bias_functions ; \ - then echo "Spark signature check success"; else echo "Spark signature check failed" > /tmp/spark-signature-error-code ; fi - - - store_artifacts: - path: 'presto_merge_base_signatures.json' - - store_artifacts: - path: 'presto_pr_signatures.json' - - store_artifacts: - path: 'spark_merge_base_signatures.json' - - store_artifacts: - path: 'spark_pr_signatures.json' - - fuzzer-run: - fuzzer_output: "/tmp/fuzzer.log" - fuzzer_repro: "/tmp/fuzzer_repro" - fuzzer_name: "Expression Bias Run" - fuzzer_exe: "if [ -f /tmp/presto_bias_functions ]; then _build/debug/velox/expression/tests/velox_expression_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --lazy_vector_generation_ratio 0.2 \ - --assign_function_tickets $(cat /tmp/presto_bias_functions) \ - --duration_sec 3600 --enable_variadic_signatures \ - --velox_fuzzer_enable_complex_types \ - --velox_fuzzer_enable_column_reuse \ - --velox_fuzzer_enable_expression_reuse \ - --max_expression_trees_per_step 2 \ - --retry_with_try \ - --enable_dereference \ - --logtostderr=1 --minloglevel=0 \ - --repro_persist_path=/tmp/fuzzer_repro ; fi" - - - fuzzer-run: - fuzzer_output: "/tmp/spark_fuzzer.log" - fuzzer_repro: "/tmp/spark_fuzzer_repro" - fuzzer_name: "Spark Bias Run" - fuzzer_exe: "if [ -f /tmp/spark_bias_functions ]; then _build/debug/velox/expression/tests/spark_expression_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --duration_sec 3600 --logtostderr=1 --minloglevel=0 \ - --assign_function_tickets $(cat /tmp/spark_bias_functions) \ - --repro_persist_path=/tmp/spark_fuzzer_repro ; fi" - - - run: - name: "Surface only Presto function signature errors if any" - command: | - if [ -f /tmp/presto-signature-error-code ]; then \ - echo "Incompatible changes have been made to function signatures:\n"; \ - cat /tmp/presto-err-message ; \ - exit 1 ; \ - fi - - -workflows: - - longer-fuzzer: - when: << pipeline.parameters.run-longer-expression-fuzzer >> - jobs: - - linux-pr-fuzzer-run - - shorter-fuzzer: - unless: << pipeline.parameters.run-longer-expression-fuzzer >> - jobs: - - linux-pr-fuzzer-run diff --git a/.github/workflows/experimental.yml b/.github/workflows/experimental.yml index ac9a658f86f2..7394f0836e1a 100644 --- a/.github/workflows/experimental.yml +++ b/.github/workflows/experimental.yml @@ -51,7 +51,6 @@ jobs: timeout-minutes: 120 env: CCACHE_DIR: "${{ github.workspace }}/.ccache/" - CCACHE_BASEDIR: "${{ github.workspace }}" LINUX_DISTRO: "ubuntu" steps: @@ -110,8 +109,7 @@ jobs: container: ghcr.io/facebookincubator/velox-dev:presto-java timeout-minutes: 120 env: - CCACHE_DIR: "${{ github.workspace }}/.ccache/" - CCACHE_BASEDIR: "${{ github.workspace }}" + CCACHE_DIR: "/__w/velox/velox/.ccache/" LINUX_DISTRO: "centos" steps: diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index 73b37ee4c72c..0c0db76f8214 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -53,8 +53,7 @@ jobs: run: shell: bash env: - CCACHE_DIR: "${{ github.workspace }}/.ccache" - CCACHE_BASEDIR: "${{ github.workspace }}" + CCACHE_DIR: "/__w/velox/velox/.ccache" VELOX_DEPENDENCY_SOURCE: SYSTEM simdjson_SOURCE: BUNDLED xsimd_SOURCE: BUNDLED @@ -71,6 +70,10 @@ jobs: path: '${{ env.CCACHE_DIR }}' key: ccache-linux-adapters + - name: "Zero Ccache Statistics" + run: | + ccache -sz + - name: Make Release Build env: MAKEFLAGS: 'NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4' @@ -87,6 +90,9 @@ jobs: "-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON" ) make release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS[*]}" + + - name: Ccache after + run: ccache -s - uses: assignUser/stash/save@v1 with: @@ -97,7 +103,7 @@ jobs: # Some of the adapters dependencies are in the 'adapters' conda env shell: mamba run --no-capture-output -n adapters /usr/bin/bash -e {0} env: - LIBHDFS3_CONF: "${{ github.workspace }}/.circleci/hdfs-client.xml" + LIBHDFS3_CONF: "/__w/velox/velox/scripts/hdfs-client.xml" working-directory: _build/release run: | ctest -j 8 --output-on-failure --no-tests=error @@ -107,24 +113,30 @@ jobs: name: "Ubuntu debug with resolve_dependency" env: CCACHE_DIR: "${{ github.workspace }}/.ccache" - CCACHE_BASEDIR: "${{ github.workspace }}" defaults: run: shell: bash + working-directory: velox steps: - - uses: actions/checkout@v4 - - - name: Install Dependencies - run: | - source scripts/setup-ubuntu.sh - - uses: assignUser/stash/restore@v1 + - name: Get Ccache Stash + uses: assignUser/stash/restore@v1 with: path: '${{ env.CCACHE_DIR }}' key: ccache-ubuntu-debug-default - - run: | - mkdir -p .ccache + - name: Ensure Stash Dirs Exists + working-directory: ${{ github.workspace }} + run: | + mkdir -p '${{ env.CCACHE_DIR }}' + + - uses: actions/checkout@v4 + with: + path: velox + + - name: Install Dependencies + run: | + source scripts/setup-ubuntu.sh - name: Clear CCache Statistics run: | @@ -134,8 +146,9 @@ jobs: env: VELOX_DEPENDENCY_SOURCE: BUNDLED MAKEFLAGS: "NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4" + EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON" run: | - make debug EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON" + make debug - name: CCache after run: | diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml index 6ea4d84d0c87..daad6cbb6113 100644 --- a/.github/workflows/scheduled.yml +++ b/.github/workflows/scheduled.yml @@ -24,7 +24,6 @@ on: - "third_party/**" - "scripts/setup-ubuntu.sh" - "scripts/setup-helper-functions.sh" - - ".github/workflows/linux-build.yml" - ".github/workflows/scheduled.yml" schedule: @@ -71,46 +70,133 @@ env: jobs: compile: name: Build - runs-on: 8-core + runs-on: 16-core + container: ghcr.io/facebookincubator/velox-dev:centos8 timeout-minutes: 120 env: - CCACHE_DIR: "${{ github.workspace }}/.ccache/" - CCACHE_BASEDIR: "${{ github.workspace }}" + CCACHE_DIR: "/__w/velox/velox/.ccache" LINUX_DISTRO: "ubuntu" + MAKEFLAGS: "NUM_THREADS=${{ inputs.numThreads || 16 }} MAX_HIGH_MEM_JOBS=${{ inputs.maxHighMemJobs || 8 }} MAX_LINK_JOBS=${{ inputs.maxLinkJobs || 4 }}" + + defaults: + run: + shell: bash + working-directory: velox + outputs: + presto_bias: ${{ steps.sig-check.outputs.presto_functions }} + presto_error: ${{ steps.sig-check.outputs.presto_error }} + spark_bias: ${{ steps.sig-check.outputs.spark_functions }} + spark_error: ${{ steps.sig-check.outputs.spark_error }} + steps: - - name: "Restore ccache" + - name: Get Function Signature Stash + uses: assignUser/stash/restore@v1 + id: get-sig + with: + path: /tmp/signatures + key: function-signatures + + - name: Restore ccache uses: assignUser/stash/restore@v1 with: path: "${{ env.CCACHE_DIR }}" - key: ccache-fuzzer + key: ccache-fuzzer-centos + + - name: Fix git permissions + working-directory: ${{ github.workspace }} + # Usually actions/checkout does this but as we run in a container + # it doesn't work + run: | + git config --global --add safe.directory /__w/velox/velox/velox + git config --global --add safe.directory /__w/velox/velox/velox_main - - name: "Checkout Repo" + - name: Ensure Stash Dirs Exists + working-directory: ${{ github.workspace }} + run: | + mkdir -p '${{ env.CCACHE_DIR }}' + mkdir -p /tmp/signatures + + - name: Checkout Main + if: ${{ steps.get-sig.outputs.stash-hit != 'true' }} + uses: actions/checkout@v4 + with: + # hardcode ref without broken pr + ref: 'main' + path: velox_main + + - name: Build PyVelox + if: ${{ steps.get-sig.outputs.stash-hit != 'true' }} + working-directory: velox_main + run: | + python3 -m venv .venv + source .venv/bin/activate + + make python-build + + - name: Create Baseline Signatures + if: ${{ steps.get-sig.outputs.stash-hit != 'true' }} + working-directory: velox_main + run: | + source .venv/bin/activate + python3 -m pip install deepdiff + python3 scripts/signature.py export --spark /tmp/signatures/spark_signatures_main.json + python3 scripts/signature.py export --presto /tmp/signatures/presto_signatures_main.json + + - name: Save Function Signature Stash + if: ${{ steps.get-sig.outputs.stash-hit != 'true' }} + uses: assignUser/stash/save@v1 + with: + path: /tmp/signatures + key: function-signatures + + - name: Checkout Contender uses: actions/checkout@v4 with: path: velox submodules: 'recursive' ref: "${{ inputs.ref }}" - - name: "Install dependencies" + - name: Zero Ccache Statistics run: | - cd velox - source ./scripts/setup-ubuntu.sh - ccache -vsz + ccache -sz - name: Build + env: + EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_BUILD_PYTHON_PACKAGE=ON ${{ inputs.extraCMakeFlags }}" run: | - cd velox - make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}" + EXTRA_CMAKE_FLAGS="-DPYTHON_EXECUTABLE=$(which python3) $EXTRA_CMAKE_FLAGS" + make debug - name: Ccache after - run: ccache -vs + run: ccache -s - - name: "Save ccache" + - name: Save ccache uses: assignUser/stash/save@v1 with: path: "${{ env.CCACHE_DIR }}" - key: ccache-fuzzer + key: ccache-fuzzer-centos + + - name: Build PyVelox + env: + VELOX_BUILD_DIR: "_build/debug" + run: | + python3 -m venv .venv + source .venv/bin/activate + python3 -m pip install -e . + + - name: Create and test new function signatures + id: sig-check + run: | + source .venv/bin/activate + python3 -m pip install deepdiff + python3 scripts/signature.py gh_bias_check presto spark + + - name: Upload Signature Artifacts + uses: actions/upload-artifact@v4 + with: + name: signatures + path: /tmp/signatures retention-days: "${{ env.RETENTION }}" - name: Upload presto fuzzer @@ -155,21 +241,15 @@ jobs: path: velox/_build/debug//velox/exec/tests/velox_exchange_fuzzer_test retention-days: "${{ env.RETENTION }}" - linux-presto-fuzzer-run: - name: "Presto Fuzzer" + presto-fuzzer-run: + name: Presto Fuzzer + if: ${{ needs.compile.outputs.presto_bias != 'true' }} runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:centos8 needs: compile timeout-minutes: 120 steps: - - name: "Checkout Repo" - uses: actions/checkout@v4 - with: - ref: "${{ inputs.ref }}" - - - name: "Install dependencies" - run: source ./scripts/setup-ubuntu.sh - - uses: dorny/paths-filter@v3 if: github.event_name == 'pull_request' id: changes @@ -204,7 +284,7 @@ jobs: with: name: presto - - name: "Run Presto Fuzzer" + - name: Run Presto Fuzzer run: | mkdir -p /tmp/fuzzer_repro/ chmod -R 777 /tmp/fuzzer_repro @@ -221,7 +301,7 @@ jobs: --enable_dereference \ --duration_sec $DURATION \ --logtostderr=1 \ - --minloglevel=0 \ + --minloglevel=1 \ --repro_persist_path=/tmp/fuzzer_repro \ && echo -e "\n\nFuzzer run finished successfully." @@ -233,27 +313,71 @@ jobs: path: | /tmp/fuzzer_repro - linux-spark-aggregate-fuzzer-run: - name: "Spark Aggregate Fuzzer" + presto-bias-fuzzer: + name: Presto Bias Fuzzer runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:centos8 needs: compile - timeout-minutes: 60 + if: ${{ needs.compile.outputs.presto_bias == 'true' }} + timeout-minutes: 120 steps: - - name: "Checkout Repo" - uses: actions/checkout@v4 + - name: Download presto expression fuzzer + uses: actions/download-artifact@v4 with: - ref: "${{ inputs.ref }}" + name: presto + + - name: Download Signatures + uses: actions/download-artifact@v4 + with: + name: signatures + path: /tmp/signatures + + - name: Run Presto Expression Fuzzer + run: | + ls /tmp/signatures + mkdir -p /tmp/presto_fuzzer_repro/ + chmod -R 777 /tmp/presto_fuzzer_repro + chmod +x velox_expression_fuzzer_test + ./velox_expression_fuzzer_test \ + --seed ${RANDOM} \ + --lazy_vector_generation_ratio 0.2 \ + --assign_function_tickets $(cat /tmp/signatures/presto_bias_functions) \ + --duration_sec 3600 \ + --enable_variadic_signatures \ + --velox_fuzzer_enable_complex_types \ + --velox_fuzzer_enable_column_reuse \ + --velox_fuzzer_enable_expression_reuse \ + --max_expression_trees_per_step 2 \ + --retry_with_try \ + --enable_dereference \ + --logtostderr=1 \ + --minloglevel=1 \ + --repro_persist_path=/tmp/presto_fuzzer_repro \ + && echo -e "\n\nPresto Fuzzer run finished successfully." - - name: "Install dependencies" - run: source ./scripts/setup-ubuntu.sh + - name: Archive Spark expression production artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: presto-bias-fuzzer-failure-artifacts + path: | + /tmp/presto_bias_fuzzer_repro + + spark-aggregate-fuzzer-run: + name: Spark Aggregate Fuzzer + runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:centos8 + needs: compile + timeout-minutes: 60 + steps: - name: Download spark aggregation fuzzer uses: actions/download-artifact@v4 with: name: spark_aggregation_fuzzer - - name: "Run Spark Aggregate Fuzzer" + - name: Run Spark Aggregate Fuzzer run: | mkdir -p /tmp/spark_aggregate_fuzzer_repro/ chmod -R 777 /tmp/spark_aggregate_fuzzer_repro @@ -262,7 +386,7 @@ jobs: --seed ${RANDOM} \ --duration_sec $DURATION \ --logtostderr=1 \ - --minloglevel=0 \ + --minloglevel=1 \ --repro_persist_path=/tmp/spark_aggregate_fuzzer_repro \ && echo -e "\n\nSpark Aggregation Fuzzer run finished successfully." @@ -274,27 +398,64 @@ jobs: path: | /tmp/spark_aggregate_fuzzer_repro - linux-spark-fuzzer-run: - name: "Spark Fuzzer" + spark-bias-fuzzer: + name: Spark Bias Fuzzer runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:centos8 needs: compile + if: ${{ needs.compile.outputs.spark_bias == 'true' }} timeout-minutes: 120 steps: - - name: "Checkout Repo" - uses: actions/checkout@v4 + - name: Download spark expression fuzzer + uses: actions/download-artifact@v4 with: - ref: "${{ inputs.ref }}" + name: spark_expression_fuzzer + + - name: Download Signatures + uses: actions/download-artifact@v4 + with: + name: signatures + path: /tmp/signatures + + - name: Run Spark Expression Fuzzer + run: | + ls /tmp/signatures + mkdir -p /tmp/spark_fuzzer_repro/ + chmod -R 777 /tmp/spark_fuzzer_repro + chmod +x spark_expression_fuzzer_test + ./spark_expression_fuzzer_test \ + --seed ${RANDOM} \ + --duration_sec $DURATION \ + --logtostderr=1 \ + --minloglevel=1 \ + --assign_function_tickets $(cat /tmp/signatures/spark_bias_functions) \ + --repro_persist_path=/tmp/spark_fuzzer_repro \ + && echo -e "\n\nSpark Fuzzer run finished successfully." - - name: "Install dependencies" - run: source ./scripts/setup-ubuntu.sh + - name: Archive Spark expression production artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: spark-fuzzer-failure-artifacts + path: | + /tmp/spark_bias_fuzzer_repro + + spark-fuzzer: + name: Spark Fuzzer + if: ${{ needs.compile.outputs.spark_bias != 'true' }} + runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:centos8 + needs: compile + timeout-minutes: 120 + steps: - name: Download spark expression fuzzer uses: actions/download-artifact@v4 with: name: spark_expression_fuzzer - - name: "Run Spark Expression Fuzzer" + - name: Run Spark Expression Fuzzer run: | mkdir -p /tmp/spark_fuzzer_repro/ chmod -R 777 /tmp/spark_fuzzer_repro @@ -310,7 +471,7 @@ jobs: --enable_dereference \ --duration_sec $DURATION \ --logtostderr=1 \ - --minloglevel=0 \ + --minloglevel=1 \ --repro_persist_path=/tmp/spark_fuzzer_repro \ && echo -e "\n\nSpark Fuzzer run finished successfully." @@ -322,27 +483,20 @@ jobs: path: | /tmp/spark_fuzzer_repro - linux-aggregate-fuzzer-run: - name: "Aggregate Fuzzer" + aggregate-fuzzer-run: + name: Aggregate Fuzzer runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:centos8 needs: compile timeout-minutes: 120 steps: - - name: "Checkout Repo" - uses: actions/checkout@v4 - with: - ref: "${{ inputs.ref }}" - - - name: "Install dependencies" - run: source ./scripts/setup-ubuntu.sh - - name: Download aggregation fuzzer uses: actions/download-artifact@v4 with: name: aggregation - - name: "Run Aggregate Fuzzer" + - name: Run Aggregate Fuzzer run: | mkdir -p /tmp/aggregate_fuzzer_repro/ rm -rfv /tmp/aggregate_fuzzer_repro/* @@ -352,7 +506,7 @@ jobs: --seed ${RANDOM} \ --duration_sec $DURATION \ --logtostderr=1 \ - --minloglevel=0 \ + --minloglevel=1 \ --repro_persist_path=/tmp/aggregate_fuzzer_repro \ && echo -e "\n\nAggregation fuzzer run finished successfully." @@ -364,27 +518,20 @@ jobs: path: | /tmp/aggregate_fuzzer_repro - linux-join-fuzzer-run: - name: "Join Fuzzer" + join-fuzzer-run: + name: Join Fuzzer runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:centos8 needs: compile timeout-minutes: 120 steps: - - name: "Checkout Repo" - uses: actions/checkout@v4 - with: - ref: "${{ inputs.ref }}" - - - name: "Install dependencies" - run: source ./scripts/setup-ubuntu.sh - - name: Download join fuzzer uses: actions/download-artifact@v4 with: name: join - - name: "Run Join Fuzzer" + - name: Run Join Fuzzer run: | mkdir -p /tmp/join_fuzzer_repro/ rm -rfv /tmp/join_fuzzer_repro/* @@ -394,7 +541,7 @@ jobs: --seed ${RANDOM} \ --duration_sec $DURATION \ --logtostderr=1 \ - --minloglevel=0 \ + --minloglevel=1 \ && echo -e "\n\nAggregation fuzzer run finished successfully." - name: Archive aggregate production artifacts @@ -405,20 +552,14 @@ jobs: path: | /tmp/join_fuzzer_repro - linux-exchange-fuzzer-run: + exchange-fuzzer-run: + name: Exchange Fuzzer runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:centos8 needs: compile timeout-minutes: 120 steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - ref: "${{ inputs.ref }}" - - - name: Install dependencies - run: source ./scripts/setup-ubuntu.sh - - name: Download exchange fuzzer uses: actions/download-artifact@v4 with: @@ -426,7 +567,6 @@ jobs: - name: Run exchange Fuzzer run: | - sudo sysctl -w vm.max_map_count=67108864 cat /proc/sys/vm/max_map_count mkdir -p /tmp/exchange_fuzzer_repro/ rm -rfv /tmp/exchange_fuzzer_repro/* @@ -434,9 +574,9 @@ jobs: chmod +x velox_exchange_fuzzer_test ./velox_exchange_fuzzer_test \ --seed ${RANDOM} \ - --duration_sec $DURATION \ + --duration_sec ${{ env.DURATION }} \ --logtostderr=1 \ - --minloglevel=0 \ + --minloglevel=1 \ --repro_path=/tmp/exchange_fuzzer_repro \ && echo -e "\n\Exchange fuzzer run finished successfully." @@ -448,54 +588,22 @@ jobs: path: | /tmp/exchange_fuzzer_repro - presto-java-aggregation-fuzzer-run: name: Aggregation Fuzzer with Presto as source of truth - runs-on: 8-core + needs: compile + runs-on: ubuntu-latest container: ghcr.io/facebookincubator/velox-dev:presto-java timeout-minutes: 120 if: ${{ github.event_name != 'pull_request' }} env: CCACHE_DIR: "${{ github.workspace }}/.ccache/" - CCACHE_BASEDIR: "${{ github.workspace }}" LINUX_DISTRO: "centos" steps: - - name: "Restore ccache" - uses: assignUser/stash/restore@v1 - with: - path: "${{ env.CCACHE_DIR }}" - key: ccache-presto-java-fuzzer - - - name: "Checkout Repo" - uses: actions/checkout@v4 - with: - path: velox - submodules: 'recursive' - ref: "${{ inputs.ref }}" - - - name: Fix git permissions - # Usually actions/checkout does this but as we run in a container - # it doesn't work - run: git config --global --add safe.directory /__w/velox/velox/velox - - - name: Zero Ccache Statistics - run: ccache -sz - - - - name: "Build" - env: - EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}" - run: | - cd velox - make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}" - ccache -s - - - name: "Save ccache" - uses: assignUser/stash/save@v1 + - name: Download aggregation fuzzer + uses: actions/download-artifact@v4 with: - path: "${{ env.CCACHE_DIR }}" - key: ccache-presto-java-fuzzer + name: aggregation - name: "Run Aggregate Fuzzer" run: | @@ -504,16 +612,16 @@ jobs: ls -lR $PRESTO_HOME/etc $PRESTO_HOME/bin/launcher run -v > /tmp/server.log 2>&1 & # Sleep for 60 seconds to allow Presto server to start. - sleep 60 + sleep 60 /opt/presto-cli --server 127.0.0.1:8080 --execute 'CREATE SCHEMA hive.tpch;' mkdir -p /tmp/aggregate_fuzzer_repro/ rm -rfv /tmp/aggregate_fuzzer_repro/* chmod -R 777 /tmp/aggregate_fuzzer_repro - _build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test \ + velox_aggregation_fuzzer_test \ --seed ${RANDOM} \ --duration_sec $DURATION \ --logtostderr=1 \ - --minloglevel=0 \ + --minloglevel=1 \ --repro_persist_path=/tmp/aggregate_fuzzer_repro \ --enable_sorted_aggregations=true \ --presto_url=http://127.0.0.1:8080 \ @@ -527,3 +635,21 @@ jobs: path: | /tmp/aggregate_fuzzer_repro /tmp/server.log + + surface-signature-errors: + name: Signature Changes + needs: compile + runs-on: ubuntu-latest + steps: + - name: Download Signatures + uses: actions/download-artifact@v4 + with: + name: signatures + path: /tmp/signatures + + - name: Surface Presto function signature errors + if: ${{ needs.compile.outputs.presto_error == 'true' }} + run: | + cat /tmp/signatures/presto_errors + exit 1 + diff --git a/CMakeLists.txt b/CMakeLists.txt index fa30c5fa0bd6..e34fca1a7013 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -166,23 +166,12 @@ if(${VELOX_ENABLE_EXAMPLES}) endif() if(${VELOX_BUILD_PYTHON_PACKAGE}) - set(VELOX_BUILD_TESTING OFF) set(VELOX_ENABLE_PRESTO_FUNCTIONS ON) set(VELOX_ENABLE_DUCKDB ON) set(VELOX_ENABLE_EXPRESSION ON) set(VELOX_ENABLE_PARSE ON) set(VELOX_ENABLE_EXEC ON) - set(VELOX_ENABLE_AGGREGATES OFF) - set(VELOX_ENABLE_HIVE_CONNECTOR OFF) - set(VELOX_ENABLE_TPCH_CONNECTOR OFF) set(VELOX_ENABLE_SPARK_FUNCTIONS ON) - set(VELOX_ENABLE_EXAMPLES OFF) - set(VELOX_ENABLE_S3 OFF) - set(VELOX_ENABLE_GCS OFF) - set(VELOX_ENABLE_ABFS OFF) - set(VELOX_ENABLE_SUBSTRAIT OFF) - set(VELOX_ENABLE_BENCHMARKS_BASIC OFF) - set(VELOX_ENABLE_BENCHMARKS OFF) endif() # We look for OpenSSL here to cache the result enforce the version across our diff --git a/Makefile b/Makefile index 02f385b6eb71..2bab08bce1ab 100644 --- a/Makefile +++ b/Makefile @@ -183,8 +183,8 @@ python-clean: DEBUG=1 ${PYTHON_EXECUTABLE} setup.py clean python-build: - DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=4 ${PYTHON_EXECUTABLE} -m pip install -e .$(extras) --verbose + DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=${NUM_THREADS} ${PYTHON_EXECUTABLE} -m pip install -e .$(extras) --verbose -python-test: +python-test: $(MAKE) python-build extras="[tests]" DEBUG=1 ${PYTHON_EXECUTABLE} -m unittest -v diff --git a/pyvelox/CMakeLists.txt b/pyvelox/CMakeLists.txt index 4bffa203b7d4..1dba45c58a7f 100644 --- a/pyvelox/CMakeLists.txt +++ b/pyvelox/CMakeLists.txt @@ -14,18 +14,16 @@ if(VELOX_BUILD_PYTHON_PACKAGE) message("Creating pyvelox module") - include_directories(SYSTEM ${CMAKE_SOURCE_DIR}) - add_definitions(-DCREATE_PYVELOX_MODULE -DVELOX_DISABLE_GOOGLETEST) # Define our Python module: pybind11_add_module( pyvelox MODULE + complex.cpp + conversion.cpp pyvelox.cpp serde.cpp - signatures.cpp - complex.cpp - conversion.cpp) - # Link with Velox: + signatures.cpp) + target_link_libraries( pyvelox PRIVATE velox_type @@ -37,11 +35,8 @@ if(VELOX_BUILD_PYTHON_PACKAGE) velox_functions_prestosql velox_functions_spark) + target_include_directories(pyvelox SYSTEM + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/..) + target_compile_definitions(pyvelox PRIVATE -DCREATE_PYVELOX_MODULE) install(TARGETS pyvelox LIBRARY DESTINATION .) -else() - # Torcharrow will not use pyvelox as an extension module for compatibility - # reasons. - message("Creating pyvelox library") - add_library(pyvelox pyvelox.cpp pyvelox.h) - target_link_libraries(pyvelox velox_type pybind11::module) endif() diff --git a/scripts/adapters.dockerfile b/scripts/adapters.dockerfile index d7da3c088074..1b32fed7d695 100644 --- a/scripts/adapters.dockerfile +++ b/scripts/adapters.dockerfile @@ -38,7 +38,7 @@ RUN npm install -g azurite ENV HADOOP_HOME=/usr/local/hadoop \ HADOOP_ROOT_LOGGER="WARN,DRFA" \ LC_ALL=C \ - LIBHDFS3_CONF=/velox/.circleci/hdfs-client.xml \ + LIBHDFS3_CONF=/velox/scripts/hdfs-client.xml \ PATH=/usr/local/hadoop/bin:${PATH} ENTRYPOINT ["/bin/bash", "-c", "source /opt/rh/gcc-toolset-9/enable && exec \"$@\"", "--"] diff --git a/.circleci/hdfs-client.xml b/scripts/hdfs-client.xml similarity index 100% rename from .circleci/hdfs-client.xml rename to scripts/hdfs-client.xml diff --git a/scripts/signature.py b/scripts/signature.py index 17aa0b33c87e..e7acb1cd0aaa 100644 --- a/scripts/signature.py +++ b/scripts/signature.py @@ -13,11 +13,13 @@ # limitations under the License. import argparse import json +import os import sys +from typing import Any -import pyvelox.pyvelox as pv from deepdiff import DeepDiff +import pyvelox.pyvelox as pv # Utility to export and diff function signatures. @@ -38,9 +40,30 @@ def get_error_string(error_message): {error_message} Changing or removing function signatures breaks backwards compatibility as some users may rely on function signatures that no longer exist. + """ +def set_gh_output(name: str, value: Any): + """Sets a Github Actions output variable. Only single line values are supported. + value will be converted to a lower case string.""" + value = str(value).lower() + + if "\n" in value: + raise ValueError("Only single line values are supported.") + + with open(os.environ["GITHUB_OUTPUT"], "a") as f: + f.write(f"{name}={value}\n") + + +def show_error(error_message, error_path): + if error_path: + with open(error_path, "a+") as f: + f.writelines(get_error_string(error_message)) + + print(get_error_string(error_message)) + + def export(args): """Exports Velox function signatures.""" pv.clear_signatures() @@ -59,11 +82,13 @@ def export(args): jsoned_signatures[key] = [str(value) for value in signatures[key]] # Persist to file - json.dump(jsoned_signatures, args.output_file) + with open(args.output_file, "w") as f: + json.dump(jsoned_signatures, f) + return 0 -def diff_signatures(base_signatures, contender_signatures): +def diff_signatures(base_signatures, contender_signatures, error_path=""): """Diffs Velox function signatures. Returns a tuple of the delta diff and exit status""" delta = DeepDiff( @@ -82,28 +107,28 @@ def diff_signatures(base_signatures, contender_signatures): error_message += ( f"""Function '{dic_removed.get_root_key()}' has been removed.\n""" ) - print(get_error_string(error_message)) + show_error(error_message, error_path) exit_status = 1 if "values_changed" in delta: error_message = "" for value_change in delta["values_changed"]: error_message += f"""'{value_change.get_root_key()}{value_change.t1}' is changed to '{value_change.get_root_key()}{value_change.t2}'.\n""" - print(get_error_string(error_message)) + show_error(error_message, error_path) exit_status = 1 if "repetition_change" in delta: error_message = "" for rep_change in delta["repetition_change"]: error_message += f"""'{rep_change.get_root_key()}{rep_change.t1}' is repeated {rep_change.repetition['new_repeat']} times.\n""" - print(get_error_string(error_message)) + show_error(error_message, error_path) exit_status = 1 if "iterable_item_removed" in delta: error_message = "" for iter_change in delta["iterable_item_removed"]: error_message += f"""{iter_change.get_root_key()} has its function signature '{iter_change.t1}' removed.\n""" - print(get_error_string(error_message)) + show_error(error_message, error_path) exit_status = 1 else: @@ -114,17 +139,24 @@ def diff_signatures(base_signatures, contender_signatures): def diff(args): """Diffs Velox function signatures.""" - base_signatures = json.load(args.base) - contender_signatures = json.load(args.contender) + with open(args.base) as f: + base_signatures = json.load(f) + + with open(args.contender) as f: + contender_signatures = json.load(f) return diff_signatures(base_signatures, contender_signatures)[1] def bias(args): - base_signatures = json.load(args.base) - contender_signatures = json.load(args.contender) + with open(args.base) as f: + base_signatures = json.load(f) + + with open(args.contender) as f: + contender_signatures = json.load(f) + tickets = args.ticket_value bias_output, status = bias_signatures( - base_signatures, contender_signatures, tickets + base_signatures, contender_signatures, tickets, args.error_path ) if bias_output: @@ -134,12 +166,12 @@ def bias(args): return status -def bias_signatures(base_signatures, contender_signatures, tickets): +def bias_signatures(base_signatures, contender_signatures, tickets, error_path): """Returns newly added functions as string and a status flag. Newly added functions are biased like so `fn_name1=,fn_name2=`. If it detects incompatible changes returns 1 in the status. """ - delta, status = diff_signatures(base_signatures, contender_signatures) + delta, status = diff_signatures(base_signatures, contender_signatures, error_path) if not delta: print(f"{bcolors.BOLD} No changes detected: Nothing to do!") @@ -156,6 +188,50 @@ def bias_signatures(base_signatures, contender_signatures, tickets): return "", status +def gh_bias_check(args): + """ + Exports signatures for the given group(s) and checks them for changes compared to a baseline. + Saves the results to a file and sets a Github Actions Output for each group. + """ + if not os.getenv("GITHUB_ACTIONS"): + print("This command is meant to be run in a Github Actions environment.") + return 1 + + # export signatures for each group + for group in args.group: + print(f"Exporting {group} signatures...") + export_args = parse_args( + [ + "export", + f"--{group}", + os.path.join(args.signature_dir, group + args.contender_postfix), + ] + ) + export(export_args) + + # compare signatures for each group + for group in args.group: + print(f"Comparing {group} signatures...") + bias_args = parse_args( + [ + "bias", + os.path.join(args.signature_dir, group + args.base_postfix), + os.path.join(args.signature_dir, group + args.contender_postfix), + os.path.join(args.signature_dir, group + args.output_postfix), + os.path.join(args.signature_dir, group + "_errors"), + ] + ) + + bias_status = bias(bias_args) + set_gh_output(f"{group}_error", bias_status == 1) + + # check if there are any changes that require the bias fuzzer to run + has_tickets = os.path.isfile( + os.path.join(args.signature_dir, group + args.output_postfix) + ) + set_gh_output(f"{group}_functions", has_tickets) + + def get_tickets(val): tickets = int(val) if tickets < 0: @@ -175,19 +251,40 @@ def parse_args(args): export_command_parser = command.add_parser("export") export_command_parser.add_argument("--spark", action="store_true") export_command_parser.add_argument("--presto", action="store_true") - export_command_parser.add_argument("output_file", type=argparse.FileType("w")) + export_command_parser.add_argument("output_file", type=str) diff_command_parser = command.add_parser("diff") - diff_command_parser.add_argument("base", type=argparse.FileType("r")) - diff_command_parser.add_argument("contender", type=argparse.FileType("r")) + diff_command_parser.add_argument("base", type=str) + diff_command_parser.add_argument("contender", type=str) bias_command_parser = command.add_parser("bias") - bias_command_parser.add_argument("base", type=argparse.FileType("r")) - bias_command_parser.add_argument("contender", type=argparse.FileType("r")) - bias_command_parser.add_argument("output_path") + bias_command_parser.add_argument("base", type=str) + bias_command_parser.add_argument("contender", type=str) + bias_command_parser.add_argument("output_path", type=str) bias_command_parser.add_argument( "ticket_value", type=get_tickets, default=10, nargs="?" ) + bias_command_parser.add_argument("error_path", type=str, default="") + gh_command_parser = command.add_parser("gh_bias_check") + gh_command_parser.add_argument( + "group", + nargs="+", + help='One or more group names to check for changed signatures. e.g. "spark" or "presto"', + type=str, + ) + gh_command_parser.add_argument( + "--signature_dir", type=str, default="/tmp/signatures" + ) + gh_command_parser.add_argument( + "--base_postfix", type=str, default="_signatures_main.json" + ) + gh_command_parser.add_argument( + "--contender_postfix", type=str, default="_signatures_contender.json" + ) + gh_command_parser.add_argument( + "--output_postfix", type=str, default="_bias_functions" + ) + parser.set_defaults(command="help") return parser.parse_args(args) diff --git a/setup.py b/setup.py index 465e511bb519..111ede9e5567 100644 --- a/setup.py +++ b/setup.py @@ -109,6 +109,16 @@ def run(self): def build_extension(self, ext): extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + # Allow using a pre-built Velox library (for CI and development) e.g. 'VELOX_BUILD_DIR=_build/velox/debug' + # The build in question must have been built with 'VELOX_BUILD_PYTHON_PACKAGE=ON' and the same python version. + if "VELOX_BUILD_DIR" in os.environ: + velox_dir = os.path.abspath(os.environ["VELOX_BUILD_DIR"]) + + if not os.path.isdir(extdir): + os.symlink(velox_dir, os.path.dirname(extdir), target_is_directory=True) + + print(f"Using pre-built Velox library from {velox_dir}") + return # required for auto-detection of auxiliary "native" libs if not extdir.endswith(os.path.sep): @@ -126,6 +136,7 @@ def build_extension(self, ext): f"-DCMAKE_BUILD_TYPE={cfg}", f"-DCMAKE_INSTALL_PREFIX={extdir}", "-DCMAKE_VERBOSE_MAKEFILE=ON", + "-DVELOX_BUILD_MINIMAL=ON", "-DVELOX_BUILD_PYTHON_PACKAGE=ON", f"-DPYTHON_EXECUTABLE={exec_path} ", ] @@ -148,9 +159,9 @@ def build_extension(self, ext): os.makedirs(self.build_temp) subprocess.check_call( - ["cmake", str(ROOT_DIR)] + cmake_args, cwd=self.build_temp + ["cmake", str(os.path.join(ROOT_DIR, "pyvelox"))] + cmake_args, + cwd=self.build_temp, ) - print(self.build_temp) subprocess.check_call( ["cmake", "--build", "."] + build_args, cwd=self.build_temp )