diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index a22814c3..aa8e2e61 100644 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -18,25 +18,16 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-24.08 + - branch-* types: [closed] jobs: auto-merge: if: github.event.pull_request.merged == true - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - with: - ref: branch-24.08 # force to fetch from latest upstream instead of PR ref - - - name: auto-merge job - uses: ./.github/workflows/auto-merge - env: - OWNER: NVIDIA - REPO_NAME: spark-rapids-ml - HEAD: branch-24.08 - BASE: branch-24.10 - AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR - + uses: NVIDIA/spark-rapids-common/.github/workflows/auto-merge.yml@main + with: + owner: ${{ github.repository_owner }} + repo: spark-rapids-ml + branch: ${{ github.event.pull_request.base.ref }} + secrets: + token: ${{ secrets.AUTOMERGE_TOKEN }} diff --git a/.github/workflows/auto-merge/Dockerfile b/.github/workflows/auto-merge/Dockerfile deleted file mode 100644 index e98b1a48..00000000 --- a/.github/workflows/auto-merge/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -FROM python:alpine - -WORKDIR / -COPY automerge . -RUN pip install requests && chmod +x /automerge - -# require envs: OWNER,REPO_NAME,HEAD,BASE,GITHUB_TOKEN -ENTRYPOINT ["/automerge"] diff --git a/.github/workflows/auto-merge/action.yml b/.github/workflows/auto-merge/action.yml deleted file mode 100644 index ee557731..00000000 --- a/.github/workflows/auto-merge/action.yml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name: 'auto-merge action' -description: 'auto-merge HEAD to BASE' -runs: - using: 'docker' - image: 'Dockerfile' - diff --git a/.github/workflows/auto-merge/automerge b/.github/workflows/auto-merge/automerge deleted file mode 100755 index 948cc30a..00000000 --- a/.github/workflows/auto-merge/automerge +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""A auto-merge tool - -Create a PR to merge HEAD to BASE branch. -NOTE: - The generated PR should be automatically merged if no conflict. Otherwise, manual operation will be required. -""" - -import os -import sys -import time - -import requests - -# ENV -OWNER = os.environ.get('OWNER') -assert OWNER, 'env OWNER should not be empty' -REPO_NAME = os.environ.get('REPO_NAME') -assert REPO_NAME, 'env REPO_NAME should not be empty' -HEAD = os.environ.get('HEAD') -assert HEAD, 'env HEAD should not be empty' -BASE = os.environ.get('BASE') -assert BASE, 'env BASE should not be empty' -AUTOMERGE_TOKEN = os.environ.get('AUTOMERGE_TOKEN') -assert AUTOMERGE_TOKEN, 'env AUTOMERGE_TOKEN should not be empty' -# static -API_URL = 'https://api.github.com' -AUTH_HEADERS = { - 'Authorization': 'token ' + AUTOMERGE_TOKEN -} - - -def create(): - url = f'{API_URL}/repos/{OWNER}/{REPO_NAME}/pulls' - params = { - 'title': f'[auto-merge] {HEAD} to {BASE} [skip ci] [bot]', - 'head': HEAD, - 'base': BASE, - 'body': f'auto-merge triggered by github actions on `{HEAD}` to create a PR keeping `{BASE}` up-to-date. If ' - 'this PR is unable to be merged due to conflicts, it will remain open until manually fix.', - 'maintainer_can_modify': True - } - r = requests.post(url, headers=AUTH_HEADERS, json=params) - if r.status_code == 201: - print('SUCCESS - create PR') - pull = r.json() - number = str(pull['number']) - sha = str(pull['head']['sha']) - return number, sha, False - if r.status_code == 422: # early-terminate if no commits between HEAD and BASE - print('SUCCESS - No commits') - print(r.json()) - return '', '', True - # FAILURE - print('FAILURE - create PR') - print(f'status code: {r.status_code}') - print(r.json()) - sys.exit(1) - - -def auto_merge(number, sha): - url = f'{API_URL}/repos/{OWNER}/{REPO_NAME}/pulls/{number}/merge' - params = { - 'sha': sha, - 'merge_method': 'merge' - } - r = requests.put(url, headers=AUTH_HEADERS, json=params) - if r.status_code == 200: - comment(number, '**SUCCESS** - auto-merge') - print('SUCCESS - auto-merge') - sys.exit(0) - else: - print('FAILURE - auto-merge') - comment(number=number, content=f"""**FAILURE** - Unable to auto-merge. Manual operation is required. -``` -{r.json()} -``` - -Please use the following steps to fix the merge conflicts manually: -``` -# Assume upstream is NVIDIA/spark-rapids-ml remote -git fetch upstream {HEAD} {BASE} -git checkout -b fix-auto-merge-conflict-{number} upstream/{BASE} -git merge upstream/{HEAD} -# Fix any merge conflicts caused by this merge -git commit -am "Merge {HEAD} into {BASE}" -git push fix-auto-merge-conflict-{number} -# Open a PR targets NVIDIA/spark-rapids-ml {BASE} -``` -**IMPORTANT:** Before merging this PR, be sure to change the merging strategy to `Create a merge commit` (repo admin only). 
- -Once this PR is merged, the auto-merge PR should automatically be closed since it contains the same commit hashes -""") - print(f'status code: {r.status_code}') - print(r.json()) - sys.exit(1) - - -def comment(number, content): - url = f'{API_URL}/repos/{OWNER}/{REPO_NAME}/issues/{number}/comments' - params = { - 'body': content - } - r = requests.post(url, headers=AUTH_HEADERS, json=params) - if r.status_code == 201: - print('SUCCESS - create comment') - else: - print('FAILURE - create comment') - print(f'status code: {r.status_code}') - print(r.json()) - - -def main(): - number, sha, term = create() - if term: - sys.exit(0) - - auto_merge(number, sha) - - -if __name__ == '__main__': - main() diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 1d0b77ad..ef720a60 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -46,7 +46,8 @@ jobs: github.actor == 'GaryShen2008' || github.actor == 'NvTimLiu' || github.actor == 'YanxuanLiu' || - github.actor == 'pxLi' + github.actor == 'pxLi' || + github.actor == 'rishic3' ) steps: - name: Check if comment is issued by authorized person diff --git a/ci/Dockerfile b/ci/Dockerfile index 9ded96b2..d264a4d9 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -37,6 +37,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 && conda config --set solver libmamba # install cuML -ARG CUML_VER=24.08 -RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.9 cuda-version=11.8 \ +ARG CUML_VER=24.10 +RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.10 cuda-version=11.8 numpy~=1.0 \ && conda clean --all -f -y diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip index 1355efe4..19146ab0 100644 --- a/docker/Dockerfile.pip +++ b/docker/Dockerfile.pip @@ -18,7 +18,7 @@ ARG CUDA_VERSION=11.8.0 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 ARG PYSPARK_VERSION=3.3.1 -ARG RAPIDS_VERSION=24.8.0 +ARG RAPIDS_VERSION=24.10.0 ARG ARCH=amd64 #ARG ARCH=arm64 # Install packages to build spark-rapids-ml @@ -41,6 +41,7 @@ RUN pip install --no-cache-dir \ cudf-cu11~=${RAPIDS_VERSION} \ cuml-cu11~=${RAPIDS_VERSION} \ cuvs-cu11~=${RAPIDS_VERSION} \ + numpy~=1.0 \ --extra-index-url=https://pypi.nvidia.com # install python dependencies diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python index 71f7113c..b304cd38 100644 --- a/docker/Dockerfile.python +++ b/docker/Dockerfile.python @@ -17,7 +17,7 @@ ARG CUDA_VERSION=11.8.0 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 -ARG CUML_VERSION=24.08 +ARG CUML_VERSION=24.10 # Install packages to build spark-rapids-ml RUN apt update -y \ @@ -38,7 +38,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linu # install cuML -RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.9 cuda-version=11.8 cuml=$CUML_VERSION \ +RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=11.8 cuml=$CUML_VERSION numpy~=1.0 \ && conda clean --all -f -y # install python dependencies diff --git a/docs/site/compatibility.md b/docs/site/compatibility.md index 954103d3..6d41a1ca 100644 --- a/docs/site/compatibility.md +++ b/docs/site/compatibility.md @@ -28,7 +28,7 @@ Note: Spark does not provide a k-Nearest Neighbors (k-NN) implementation, but it | Spark Rapids ML | CUDA | Spark | Python | | :-------------- | :---- | :----- | :----- | -| 1.0.0 | 11.5+ | 3.2.1+ | 3.9+ | +| 1.0.0 | 11.4+ | 
3.3+ | 3.10+ | ## Single vs Double precision inputs diff --git a/docs/source/conf.py b/docs/source/conf.py index 937e7f07..fbe81721 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -9,7 +9,7 @@ project = 'spark-rapids-ml' copyright = '2024, NVIDIA' author = 'NVIDIA' -release = '24.08.0' +release = '24.10.0' # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/notebooks/aws-emr/README.md b/notebooks/aws-emr/README.md index a1919c37..b58bc634 100644 --- a/notebooks/aws-emr/README.md +++ b/notebooks/aws-emr/README.md @@ -41,7 +41,8 @@ If you already have a AWS EMR account, you can run the example notebooks on an E aws emr create-cluster \ --name ${CLUSTER_NAME} \ - --release-label emr-6.10.0 \ + --release-label emr-7.3.0 \ + --ebs-root-volume-size=32 \ --applications Name=Hadoop Name=Livy Name=Spark Name=JupyterEnterpriseGateway \ --service-role EMR_DefaultRole \ --log-uri s3://${S3_BUCKET}/logs \ diff --git a/notebooks/aws-emr/init-bootstrap-action.sh b/notebooks/aws-emr/init-bootstrap-action.sh index 292c5b9b..a0af4c77 100755 --- a/notebooks/aws-emr/init-bootstrap-action.sh +++ b/notebooks/aws-emr/init-bootstrap-action.sh @@ -1,23 +1,28 @@ #!/bin/bash set -ex - -sudo chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct -sudo chmod a+rwx -R /sys/fs/cgroup/devices -sudo yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make mysql-devel -sudo bash -c "wget https://www.python.org/ftp/python/3.9.9/Python-3.9.9.tgz && tar xzf Python-3.9.9.tgz && cd Python-3.9.9 && ./configure --enable-optimizations && make altinstall" +sudo mkdir -p /spark-rapids-cgroup/devices +sudo mount -t cgroup -o devices cgroupv1-devices /spark-rapids-cgroup/devices +sudo chmod a+rwx -R /spark-rapids-cgroup -RAPIDS_VERSION=24.8.0 +sudo yum update -y +sudo yum install -y gcc bzip2-devel libffi-devel tar gzip wget make +sudo yum install -y mysql-devel --skip-broken +sudo bash -c "wget https://www.python.org/ftp/python/3.10.9/Python-3.10.9.tgz && \ +tar xzf Python-3.10.9.tgz && cd Python-3.10.9 && \ +./configure --enable-optimizations && make altinstall" + +RAPIDS_VERSION=24.10.0 + +sudo /usr/local/bin/pip3.10 install --upgrade pip # install scikit-learn -sudo /usr/local/bin/pip3.9 install scikit-learn +sudo /usr/local/bin/pip3.10 install scikit-learn # install cudf and cuml -sudo /usr/local/bin/pip3.9 install --no-cache-dir cudf-cu11==${RAPIDS_VERSION} \ - cuml-cu11==${RAPIDS_VERSION} \ - cuvs-cu11==${RAPIDS_VERSION} \ - pylibraft-cu11==${RAPIDS_VERSION} \ - rmm-cu11==${RAPIDS_VERSION} \ - --extra-index-url=https://pypi.nvidia.com +sudo /usr/local/bin/pip3.10 install --no-cache-dir cudf-cu12 --extra-index-url=https://pypi.nvidia.com --verbose +sudo /usr/local/bin/pip3.10 install --no-cache-dir cuml-cu12 cuvs-cu12 --extra-index-url=https://pypi.nvidia.com --verbose + +sudo /usr/local/bin/pip3.10 list diff --git a/notebooks/aws-emr/init-configurations.json b/notebooks/aws-emr/init-configurations.json index 2f20e074..da0be31f 100644 --- a/notebooks/aws-emr/init-configurations.json +++ b/notebooks/aws-emr/init-configurations.json @@ -13,7 +13,7 @@ "yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices":"auto", "yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables":"/usr/bin", "yarn.nodemanager.linux-container-executor.cgroups.mount":"true", - "yarn.nodemanager.linux-container-executor.cgroups.mount-path":"/sys/fs/cgroup", + 
"yarn.nodemanager.linux-container-executor.cgroups.mount-path":"/spark-rapids-cgroup", "yarn.nodemanager.linux-container-executor.cgroups.hierarchy":"yarn", "yarn.nodemanager.container-executor.class":"org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor" } @@ -33,7 +33,7 @@ { "Classification":"cgroups", "Properties":{ - "root":"/sys/fs/cgroup", + "root":"/spark-rapids-cgroup", "yarn-hierarchy":"yarn" } } @@ -68,14 +68,14 @@ "spark.sql.execution.arrow.pyspark.enabled":"true", "spark.sql.execution.arrow.maxRecordsPerBatch":"100000", "spark.sql.cache.serializer":"com.nvidia.spark.ParquetCachedBatchSerializer", - "spark.pyspark.python":"python3.9", - "spark.pyspark.driver.python":"python3.9", + "spark.pyspark.python":"/usr/local/bin/python3.10", + "spark.pyspark.driver.python":"/usr/local/bin/python3.10", "spark.dynamicAllocation.enabled":"false", "spark.driver.memory":"20g", "spark.rpc.message.maxSize":"512", "spark.executorEnv.CUPY_CACHE_DIR":"/tmp/.cupy", "spark.executorEnv.NCCL_DEBUG":"INFO", - "spark.executorEnv.NCCL_SOCKET_IFNAME":"eth" + "spark.executorEnv.NCCL_SOCKET_IFNAME":"ens" } }, { diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md index 3061d711..1d60a204 100644 --- a/notebooks/databricks/README.md +++ b/notebooks/databricks/README.md @@ -51,7 +51,7 @@ If you already have a Databricks account, you can run the example notebooks on a spark.task.resource.gpu.amount 1 spark.databricks.delta.preview.enabled true spark.python.worker.reuse true - spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.06.1.jar:/databricks/spark/python + spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.08.1.jar:/databricks/spark/python spark.sql.execution.arrow.maxRecordsPerBatch 100000 spark.rapids.memory.gpu.minAllocFraction 0.0001 spark.plugins com.nvidia.spark.SQLPlugin diff --git a/notebooks/databricks/init-pip-cuda-11.8.sh b/notebooks/databricks/init-pip-cuda-11.8.sh index a054e6c4..bf59784e 100644 --- a/notebooks/databricks/init-pip-cuda-11.8.sh +++ b/notebooks/databricks/init-pip-cuda-11.8.sh @@ -4,8 +4,8 @@ SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2) -RAPIDS_VERSION=24.8.0 -SPARK_RAPIDS_VERSION=24.06.1 +RAPIDS_VERSION=24.10.0 +SPARK_RAPIDS_VERSION=24.08.1 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar diff --git a/notebooks/dataproc/README.md b/notebooks/dataproc/README.md index 3af37c98..5051e581 100644 --- a/notebooks/dataproc/README.md +++ b/notebooks/dataproc/README.md @@ -29,7 +29,7 @@ If you already have a Dataproc account, you can run the example notebooks on a D - Create a cluster with at least two single-gpu workers. 
**Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS: ``` export CUDA_VERSION=11.8 - export RAPIDS_VERSION=24.8.0 + export RAPIDS_VERSION=24.10.0 gcloud dataproc clusters create $USER-spark-rapids-ml \ --image-version=2.1-ubuntu \ diff --git a/notebooks/dataproc/spark_rapids_ml.sh b/notebooks/dataproc/spark_rapids_ml.sh index 093259e8..0e9a1e5e 100644 --- a/notebooks/dataproc/spark_rapids_ml.sh +++ b/notebooks/dataproc/spark_rapids_ml.sh @@ -1,6 +1,6 @@ #!/bin/bash -RAPIDS_VERSION=24.8.0 +RAPIDS_VERSION=24.10.0 # patch existing packages mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2" diff --git a/python/README.md b/python/README.md index fb7ffdd2..31718ab0 100644 --- a/python/README.md +++ b/python/README.md @@ -8,9 +8,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 11.8: ```bash -conda create -n rapids-24.08 \ +conda create -n rapids-24.10 \ -c rapidsai -c conda-forge -c nvidia \ - cuml=24.08 cuvs=24.08 python=3.9 cuda-version=11.8 + cuml=24.10 cuvs=24.10 python=3.10 cuda-version=11.8 ``` **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary. @@ -19,7 +19,7 @@ conda create -n rapids-24.08 \ Once you have the conda environment, activate it and install the required packages. ```bash -conda activate rapids-24.08 +conda activate rapids-24.10 ## for development access to notebooks, tests, and benchmarks git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git diff --git a/python/benchmark/aws-emr/README.md b/python/benchmark/aws-emr/README.md index 074be171..43f0438b 100644 --- a/python/benchmark/aws-emr/README.md +++ b/python/benchmark/aws-emr/README.md @@ -21,8 +21,14 @@ This directory contains shell scripts for running larger-scale benchmarks on an ``` **Note**: this step should be repeated for each new version of the spark-rapids-ml package that you want to test. +## Create an ssh key pair +- The benchmark script needs ssh access to the EMR cluster and this requires creating an [EC2 key pair](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/create-key-pairs.html). Choose the **pem** format. After saving the private key locally with `.pem` as the file extension, set the following environment variable to point to its location. + ``` + export KEYPAIR=/path/to/private/key.pem + ``` + ## Prepare Subnet -- Print out available subnets in CLI then pick a SubnetId of your region (e.g. subnet-0744566f of AvailabilityZone us-east-2a in region Ohio). A subnet is required to start an EMR cluster. +- Print out available subnets in CLI then pick a SubnetId of your region (e.g. subnet-0744566f of AvailabilityZone us-east-2a in region Ohio). A subnet is required to start an EMR cluster. Make sure that your selected subnet allows SSH access (port 22) from your local host where you will be invoking the benchmarking script. The public subnet in the default VPC in your account might be a suitable choice. 
See AWS EMR documentation for more info on [VPCs for EMR](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-vpc-host-job-flows.html) and related info on SSH access in [managed security groups used by EMR](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html). ``` aws ec2 describe-subnets diff --git a/python/benchmark/aws-emr/cpu-init-configurations.json b/python/benchmark/aws-emr/cpu-init-configurations.json index 55905347..cd867733 100644 --- a/python/benchmark/aws-emr/cpu-init-configurations.json +++ b/python/benchmark/aws-emr/cpu-init-configurations.json @@ -2,8 +2,8 @@ { "Classification":"spark-defaults", "Properties":{ - "spark.pyspark.python":"python3.9", - "spark.pyspark.driver.python":"python3.9" + "spark.pyspark.python":"/usr/local/bin/python3.10", + "spark.pyspark.driver.python":"/usr/local/bin/python3.10" } } ] diff --git a/python/benchmark/aws-emr/run_benchmark.sh b/python/benchmark/aws-emr/run_benchmark.sh index 0d77836c..59655987 100755 --- a/python/benchmark/aws-emr/run_benchmark.sh +++ b/python/benchmark/aws-emr/run_benchmark.sh @@ -89,20 +89,67 @@ if [[ $? != 0 ]]; then exit 1 fi +ssh_command () { + aws emr wait cluster-running --cluster-id $CLUSTER_ID + if [[ $? != 0 ]]; then + echo "cluster terminated, exiting" + exit 1 + fi + ssh -i $KEYPAIR -o StrictHostKeyChecking=no ec2-user@$masternode $1 +} + +get_masternode () { + aws emr list-instances --cluster-id $CLUSTER_ID --instance-group-type MASTER | grep PublicDnsName | grep -oP 'ec2[^"]*' +} + +get_appid () { + ssh_command "hdfs dfs -text $stderr_path" | grep -oP "application_[0-9]*_[0-9]*" | head -n 1 +} + +get_appstatus () { + ssh_command "yarn application -status $app_id" | grep -P "\tState :" | grep -oP FINISHED +} + poll_stdout () { stdout_path=s3://${BENCHMARK_HOME}/logs/$1/steps/$2/stdout.gz - res="PENDING" - while [[ ${res} != *"COMPLETED"* ]] + stderr_path=s3://${BENCHMARK_HOME}/logs/$1/steps/$2/stderr.gz + masternode=$( get_masternode ) + + while [[ -z $masternode ]]; do + sleep 30 + masternode=$( get_masternode ) + done + + echo masternode: $masternode + app_id="" + + app_id=$( get_appid ) + echo app_id: $app_id + while [[ -z $app_id ]] do sleep 30 - res=$(aws emr describe-step --cluster-id $1 --step-id $2 | grep "State") - echo ${res} - if [[ ${res} == *"FAILED"* ]]; then - echo "Failed to finish step $2." 
- exit 1 - fi + app_id=$( get_appid ) + echo app_id: $app_id done + res=$( get_appstatus ) + echo res: $res + while [[ ${res} != FINISHED ]] + do + sleep 30 + res=$( get_appstatus ) + echo res: ${res} + done + + aws emr cancel-steps --cluster-id $1 --step-ids $2 --step-cancellation-option SEND_INTERRUPT + + res=$( ssh_command "yarn application -status $app_id" | grep -P "\tFinal-State :" | sed -e 's/.*: *//g' ) + + if [[ $res != SUCCEEDED ]]; then + echo "benchmark step failed" + exit 1 + fi + # check if EMR stdout.gz is complete res="" while [[ ${res} != *"datetime"* ]] diff --git a/python/benchmark/aws-emr/start_cluster.sh b/python/benchmark/aws-emr/start_cluster.sh index 8b7828f8..0b1eb7bb 100755 --- a/python/benchmark/aws-emr/start_cluster.sh +++ b/python/benchmark/aws-emr/start_cluster.sh @@ -1,4 +1,6 @@ -#!/bin/bash +#!/bin/bash -ex +set -o pipefail + cluster_type=${1:-gpu} # configure arguments @@ -12,15 +14,22 @@ if [[ -z ${BENCHMARK_HOME} ]]; then exit 1 fi +if [[ -z ${KEYPAIR} ]]; then + echo "Please export KEYPAIR per README.md" + exit 1 +fi + cluster_name=spark-rapids-ml-${cluster_type} cur_dir=$(pwd) if [[ ${cluster_type} == "gpu" ]]; then - core_type=g4dn.2xlarge + core_type=g5.2xlarge config_json="file://${cur_dir}/../../../notebooks/aws-emr/init-configurations.json" + bootstrap_actions="--bootstrap-actions Name='Spark Rapids ML Bootstrap action',Path=s3://${BENCHMARK_HOME}/init-bootstrap-action.sh" elif [[ ${cluster_type} == "cpu" ]]; then - core_type=m4.2xlarge + core_type=m6gd.2xlarge config_json="file://${cur_dir}/cpu-init-configurations.json" + bootstrap_actions="" else echo "unknown cluster type ${cluster_type}" echo "usage: ./${script_name} cpu|gpu" @@ -29,17 +38,22 @@ fi start_cmd="aws emr create-cluster \ --name ${cluster_name} \ ---release-label emr-6.10.0 \ +--release-label emr-7.3.0 \ --applications Name=Hadoop Name=Spark \ --service-role EMR_DefaultRole \ --log-uri s3://${BENCHMARK_HOME}/logs \ ---ec2-attributes SubnetId=${SUBNET_ID},InstanceProfile=EMR_EC2_DefaultRole \ +--ec2-attributes KeyName=$(basename ${KEYPAIR} | sed -e 's/\.pem//g' ),SubnetId=${SUBNET_ID},InstanceProfile=EMR_EC2_DefaultRole \ +--ebs-root-volume-size=32 \ --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m4.2xlarge \ InstanceGroupType=CORE,InstanceCount=3,InstanceType=${core_type} \ ---configurations ${config_json} \ ---bootstrap-actions Name='Spark Rapids ML Bootstrap action',Path=s3://${BENCHMARK_HOME}/init-bootstrap-action.sh +--configurations ${config_json} $bootstrap_actions " -CLUSTER_ID=$(eval ${start_cmd} | tee /dev/tty | grep "ClusterId" | grep -o 'j-[0-9|A-Z]*') +CLUSTER_ID=$( eval ${start_cmd} | tee /dev/tty | grep "ClusterId" | grep -o 'j-[0-9|A-Z]*') aws emr put-auto-termination-policy --cluster-id ${CLUSTER_ID} --auto-termination-policy IdleTimeout=1800 -echo "${CLUSTER_ID}" +echo "waiting for cluster ${CLUSTER_ID} to start ... " 1>&2 + +aws emr wait cluster-running --cluster-id $CLUSTER_ID + +echo "cluster started." 
1>&2 +echo $CLUSTER_ID diff --git a/python/benchmark/benchmark/bench_kmeans.py b/python/benchmark/benchmark/bench_kmeans.py index e1a0776f..5c2753fc 100644 --- a/python/benchmark/benchmark/bench_kmeans.py +++ b/python/benchmark/benchmark/bench_kmeans.py @@ -192,17 +192,6 @@ def gpu_cache_df(df: DataFrame) -> DataFrame: cluster_centers = gpu_model.cluster_centers_ - # temporary patch for DB with spark-rapids plugin - # this part is not timed so overhead is not critical, but should be reverted - # once https://github.com/NVIDIA/spark-rapids/issues/10770 is fixed - db_version = os.environ.get("DATABRICKS_RUNTIME_VERSION") - if db_version: - dim = len(cluster_centers[0]) - # inject unsupported expr (slice) that is essentially a noop - df_for_scoring = df_for_scoring.select( - F.slice(feature_col, 1, dim).alias(feature_col), output_col - ) - if num_cpus > 0: from pyspark.ml.clustering import KMeans as SparkKMeans diff --git a/python/benchmark/benchmark/bench_umap.py b/python/benchmark/benchmark/bench_umap.py index 94c33d36..246cb27a 100644 --- a/python/benchmark/benchmark/bench_umap.py +++ b/python/benchmark/benchmark/bench_umap.py @@ -19,10 +19,10 @@ import numpy as np from pandas import DataFrame as PandasDataFrame -from pyspark.ml.feature import VectorAssembler +from pyspark.ml.feature import StandardScaler, VectorAssembler from pyspark.ml.functions import array_to_vector, vector_to_array from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.functions import col, sum +from pyspark.sql.functions import array, col, sum from benchmark.base import BenchmarkBase from benchmark.utils import inspect_default_params_from_func, with_benchmark @@ -105,7 +105,7 @@ def score( pdf: PandasDataFrame = transformed_df.toPandas() embedding = np.array(pdf[transformed_col].to_list()) - input = np.array(pdf[data_col].to_list()) + input = np.array(pdf[data_col].to_list()).astype(np.float32) score = trustworthiness(input, embedding, n_neighbors=15) return score @@ -162,39 +162,45 @@ def gpu_cache_df(df: DataFrame) -> DataFrame: else: gpu_estimator = gpu_estimator.setFeaturesCols(input_cols) - output_col = "embedding" - gpu_estimator = gpu_estimator.setOutputCol(output_col) - gpu_model, fit_time = with_benchmark( "gpu fit", lambda: gpu_estimator.fit(train_df) ) - def transform(model: UMAPModel, df: DataFrame) -> DataFrame: - transformed_df = model.transform(df) - transformed_df.count() - return transformed_df - - transformed_df, transform_time = with_benchmark( - "gpu transform", lambda: transform(gpu_model, train_df) + output_col = "embedding" + transformed_df = gpu_model.setOutputCol(output_col).transform(train_df) + _, transform_time = with_benchmark( + "gpu transform", lambda: transformed_df.foreach(lambda _: None) ) + total_time = round(time.time() - func_start_time, 2) print(f"gpu total took: {total_time} sec") - data_col = "features" + + df_for_scoring = transformed_df + feature_col = first_col + if not is_single_col: + feature_col = "features_array" + df_for_scoring = transformed_df.select( + array(*input_cols).alias("features_array"), output_col + ) + elif is_vector_col: + df_for_scoring = transformed_df.select( + vector_to_array(col(feature_col)).alias(feature_col), output_col + ) if num_cpus > 0: from pyspark.ml.feature import PCA as SparkPCA assert num_gpus <= 0 + if is_array_col: vector_df = train_df.select( array_to_vector(train_df[first_col]).alias(first_col) ) elif not is_vector_col: - vector_assembler = VectorAssembler(outputCol="features").setInputCols( + vector_assembler = 
VectorAssembler(outputCol=first_col).setInputCols( input_cols ) vector_df = vector_assembler.transform(train_df).drop(*input_cols) - first_col = "features" else: vector_df = train_df @@ -209,11 +215,10 @@ def cpu_cache_df(df: DataFrame) -> DataFrame: "prepare dataset", lambda: cpu_cache_df(vector_df) ) - output_col = "pca_features" - params = self.class_params print(f"Passing {params} to SparkPCA") + output_col = "pca_features" cpu_pca = SparkPCA(**params).setInputCol(first_col).setOutputCol(output_col) cpu_model, fit_time = with_benchmark( @@ -233,9 +238,27 @@ def cpu_transform(df: DataFrame) -> None: total_time = round(time.time() - func_start_time, 2) print(f"cpu total took: {total_time} sec") - data_col = first_col - score = self.score(transformed_df, data_col, output_col) + # spark ml does not remove the mean in the transformed features, so do that here + # needed for scoring + standard_scaler = ( + StandardScaler() + .setWithStd(False) + .setWithMean(True) + .setInputCol(output_col) + .setOutputCol(output_col + "_mean_removed") + ) + + scaler_model = standard_scaler.fit(transformed_df) + transformed_df = scaler_model.transform(transformed_df).drop(output_col) + + feature_col = first_col + output_col = output_col + "_mean_removed" + df_for_scoring = transformed_df.select( + vector_to_array(col(output_col)).alias(output_col), feature_col + ) + + score = self.score(df_for_scoring, feature_col, output_col) print(f"trustworthiness score: {score}") report_dict = { diff --git a/python/benchmark/databricks/gpu_etl_cluster_spec.sh b/python/benchmark/databricks/gpu_etl_cluster_spec.sh index 4609aafd..812ae044 100644 --- a/python/benchmark/databricks/gpu_etl_cluster_spec.sh +++ b/python/benchmark/databricks/gpu_etl_cluster_spec.sh @@ -9,7 +9,7 @@ cat <=0.39.0dev0" "numba>=0.56.2" diff --git a/python/benchmark/gen_data_distributed.py b/python/benchmark/gen_data_distributed.py index 2a318792..84fef7de 100644 --- a/python/benchmark/gen_data_distributed.py +++ b/python/benchmark/gen_data_distributed.py @@ -464,9 +464,7 @@ def gen_dataframe_and_meta( ground_truth = ground_truth[col_indices] # Create different partition seeds for sample generation. - random.seed(params["random_state"]) - seed_maxval = 100 * num_partitions - partition_seeds = random.sample(range(1, seed_maxval), num_partitions) + global_random_state = params["random_state"] # UDF for distributed generation of X and y. def make_regression_udf(iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]: @@ -479,13 +477,14 @@ def make_regression_udf(iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]: logging.warning("cupy import failed; falling back to numpy.") partition_index = pyspark.TaskContext().partitionId() + my_seed = global_random_state + 100 * partition_index if use_cupy: - generator_p = cp.random.RandomState(partition_seeds[partition_index]) + generator_p = cp.random.RandomState(my_seed) ground_truth_cp = cp.asarray(ground_truth) col_indices_cp = cp.asarray(col_indices) bias_p = cp.asarray(bias) else: - generator_p = np.random.RandomState(partition_seeds[partition_index]) + generator_p = np.random.RandomState(my_seed) bias_p = np.array(bias) for pdf in iter: @@ -741,9 +740,7 @@ def gen_dataframe_and_meta( ground_truth = ground_truth[col_indices] # Create different partition seeds for sample generation. 
- random.seed(params["random_state"]) - seed_maxval = 100 * num_partitions - partition_seeds = random.sample(range(1, seed_maxval), num_partitions) + global_random_seed = params["random_state"] # UDF for distributed generation of X and y. def make_sparse_regression_udf( @@ -835,10 +832,11 @@ def make_sparse_regression_udf( sparse_matrix.sum_duplicates() # Support parameters and library adaptation + my_seed = global_random_seed + 100 * partition_index if use_cupy: - generator_p = cp.random.RandomState(partition_seeds[partition_index]) + generator_p = cp.random.RandomState(my_seed) else: - generator_p = np.random.RandomState(partition_seeds[partition_index]) + generator_p = np.random.RandomState(my_seed) # Label Calculation y = sparse_matrix.dot(ground_truth) + bias diff --git a/python/pyproject.toml b/python/pyproject.toml index 6bbcf361..54d9a565 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "spark-rapids-ml" -version = "24.8.0" +version = "24.10.0" authors = [ { name="Jinfeng Li", email="jinfeng@nvidia.com" }, { name="Bobby Wang", email="bobwang@nvidia.com" }, @@ -9,16 +9,26 @@ authors = [ ] description = "Apache Spark integration with RAPIDS and cuML" readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.10" classifiers = [ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Environment :: GPU :: NVIDIA CUDA :: 11", + "Environment :: GPU :: NVIDIA CUDA :: 11.4", "Environment :: GPU :: NVIDIA CUDA :: 11.5", + "Environment :: GPU :: NVIDIA CUDA :: 11.6", + "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "Environment :: GPU :: NVIDIA CUDA :: 12.0", + "Environment :: GPU :: NVIDIA CUDA :: 12.1", + "Environment :: GPU :: NVIDIA CUDA :: 12.2", + "Environment :: GPU :: NVIDIA CUDA :: 12.3", + "Environment :: GPU :: NVIDIA CUDA :: 12.4", + "Environment :: GPU :: NVIDIA CUDA :: 12.5", ] [project.scripts] diff --git a/python/requirements_dev.txt b/python/requirements_dev.txt index def9bfb9..3730bf8a 100644 --- a/python/requirements_dev.txt +++ b/python/requirements_dev.txt @@ -7,5 +7,6 @@ numpydoc pydata-sphinx-theme pylint pytest +pytest-xdist sphinx<6.0 twine>=4.0.0 diff --git a/python/run_benchmark.sh b/python/run_benchmark.sh index bc9e929d..fef7840a 100755 --- a/python/run_benchmark.sh +++ b/python/run_benchmark.sh @@ -107,7 +107,7 @@ EOF if [[ $cluster_type == "gpu_etl" ]] then -SPARK_RAPIDS_VERSION=24.06.1 +SPARK_RAPIDS_VERSION=24.08.1 rapids_jar=${rapids_jar:-rapids-4-spark_2.12-$SPARK_RAPIDS_VERSION.jar} if [ ! 
-f $rapids_jar ]; then echo "downloading spark rapids jar" diff --git a/python/run_test.sh b/python/run_test.sh index 750cf03c..6702b6e8 100755 --- a/python/run_test.sh +++ b/python/run_test.sh @@ -19,8 +19,26 @@ spark-rapids-submit --master local[1] tests_no_import_change/test_no_import_chan # runs on cpu with spark-submit spark-submit --master local[1] tests_no_import_change/test_no_import_change.py 0.2 + +# calculate pytest parallelism by following https://github.com/NVIDIA/spark-rapids/blob/branch-24.12/integration_tests/run_pyspark_from_build.sh +MAX_PARALLEL=3 +NVIDIA_SMI_ARGS="" +if [ ${CUDA_VISIBLE_DEVICES} ]; then + NVIDIA_SMI_ARGS="${NVIDIA_SMI_ARGS} -i ${CUDA_VISIBLE_DEVICES}" +fi +GPU_MEM_PARALLEL=`nvidia-smi ${NVIDIA_SMI_ARGS} --query-gpu=memory.free --format=csv,noheader | awk 'NR == 1 { MIN = $1 } { if ($1 < MIN) { MIN = $1 } } END { print int((MIN - 2 * 1024) / ((3 * 1024) + 750)) }'` +CPU_CORES=`nproc` +TMP_PARALLEL=$(( $GPU_MEM_PARALLEL > $CPU_CORES ? $CPU_CORES : $GPU_MEM_PARALLEL )) +TMP_PARALLEL=$(( $TMP_PARALLEL > $MAX_PARALLEL ? $MAX_PARALLEL : $TMP_PARALLEL )) +if (( $TMP_PARALLEL <= 1 )); then + TEST_PARALLEL=1 + else + TEST_PARALLEL=$TMP_PARALLEL +fi +echo "Test functions in benchmark/test_gen_data.py and tests/ directory will be executed in parallel with ${TEST_PARALLEL} pytest workers" + echo "use --runslow to run all tests" -pytest "$@" benchmark/test_gen_data.py -PYTHONPATH=`pwd`/benchmark pytest -ra "$@" --durations=10 tests -#PYTHONPATH=`pwd`/benchmark pytest -ra --runslow --durations=10 tests +pytest "$@" -n ${TEST_PARALLEL} benchmark/test_gen_data.py +PYTHONPATH=`pwd`/benchmark pytest -ra "$@" -n ${TEST_PARALLEL} --durations=10 tests +#PYTHONPATH=`pwd`/benchmark pytest -ra --runslow -n ${TEST_PARALLEL} --durations=10 tests #PYTHONPATH=`pwd`/benchmark pytest -ra "$@" --durations=10 tests_large diff --git a/python/src/spark_rapids_ml/__init__.py b/python/src/spark_rapids_ml/__init__.py index 21866191..c7f1bc65 100644 --- a/python/src/spark_rapids_ml/__init__.py +++ b/python/src/spark_rapids_ml/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__version__ = "24.08.0" +__version__ = "24.10.0" import pandas as pd import pyspark diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index eb01ac8c..9d2e5e8a 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -14,6 +14,7 @@ # limitations under the License. 
# from abc import ABCMeta +from collections import Counter from typing import ( TYPE_CHECKING, Any, @@ -226,19 +227,15 @@ def _transformEvaluate( num_models = self._this_model._get_num_models() if eval_metric_info.eval_metric == transform_evaluate_metric.accuracy_like: - tp_by_class: List[Dict[float, float]] = [{} for _ in range(num_models)] - fp_by_class: List[Dict[float, float]] = [{} for _ in range(num_models)] - label_count_by_class: List[Dict[float, float]] = [ - {} for _ in range(num_models) + # if we ever implement weights, Counter supports float values, but + # type checking might fail https://github.com/python/typeshed/issues/3438 + tp_by_class: List[Counter[float]] = [Counter() for _ in range(num_models)] + fp_by_class: List[Counter[float]] = [Counter() for _ in range(num_models)] + label_count_by_class: List[Counter[float]] = [ + Counter() for _ in range(num_models) ] label_count = [0 for _ in range(num_models)] - for i in range(num_models): - for j in range(self._this_model._num_classes): - tp_by_class[i][float(j)] = 0.0 - label_count_by_class[i][float(j)] = 0.0 - fp_by_class[i][float(j)] = 0.0 - for row in rows: label_count[row.model_index] += row.total label_count_by_class[row.model_index][row.label] += row.total @@ -250,10 +247,16 @@ def _transformEvaluate( scores = [] for i in range(num_models): + # match spark mllib behavior in the below cases + for l in label_count_by_class[i]: + if l not in tp_by_class[i]: + tp_by_class[i][l] = 0 + if l not in fp_by_class[i]: + fp_by_class[i][l] = 0 metrics = MulticlassMetrics( - tp=tp_by_class[i], - fp=fp_by_class[i], - label=label_count_by_class[i], + tp=dict(tp_by_class[i]), + fp=dict(fp_by_class[i]), + label=dict(label_count_by_class[i]), label_count=label_count[i], ) scores.append(metrics.evaluate(evaluator)) @@ -339,31 +342,31 @@ class RandomForestClassifier( Parameters ---------- - featuresCol: str or List[str] + featuresCol: str or List[str] (default = "features") The feature column names, spark-rapids-ml supports vector, array and columnar as the input.\n * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. * When the value is a list of strings, the feature columns must be numeric types. - labelCol: + labelCol: str (default = "label") The label column name. - predictionCol: + predictionCol: str (default = "prediction") The prediction column name. - probabilityCol: + probabilityCol: str (default = "probability") The column name for predicted class conditional probabilities. - rawPredictionCol: + rawPredictionCol: str (default = "rawPrediction") The column name for class raw predictions - this is currently set equal to probabilityCol values. - maxDepth: + maxDepth: int (default = 5) Maximum tree depth. Must be greater than 0. - maxBins: + maxBins: int (default = 32) Maximum number of bins used by the split algorithm per feature. - minInstancesPerNode: + minInstancesPerNode: int (default = 1) The minimum number of samples (rows) in each leaf node. - impurity: str = "gini", + impurity: str (default = "gini") The criterion used to split nodes.\n * ``'gini'`` for gini impurity * ``'entropy'`` for information gain (entropy) - numTrees: + numTrees: int (default = 20) Total number of trees in the forest. 
- featureSubsetStrategy: + featureSubsetStrategy: str (default = "auto") Ratio of number of features (columns) to consider per node split.\n The supported options:\n ``'auto'``: If numTrees == 1, set to 'all', If numTrees > 1 (forest), set to 'sqrt'\n @@ -373,9 +376,9 @@ class RandomForestClassifier( ``'log2'``: log2(number of features)\n ``'n'``: when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features. - seed: + seed: int (default = None) Seed for the random number generator. - bootstrap: + bootstrap: bool (default = True) Control bootstrapping.\n * If ``True``, each tree in the forest is built on a bootstrapped sample with replacement. @@ -393,11 +396,11 @@ class RandomForestClassifier( * ``4 or False`` - Enables all messages up to and including information messages. * ``5 or True`` - Enables all messages up to and including debug messages. * ``6`` - Enables all messages up to and including trace messages. - n_streams: + n_streams: int (default = 1) Number of parallel streams used for forest building. Please note that there is a bug running spark-rapids-ml on a node with multi-gpus when n_streams > 1. See https://github.com/rapidsai/cuml/issues/5402. - min_samples_split: + min_samples_split: int or float (default = 2) The minimum number of samples required to split an internal node.\n * If type ``int``, then ``min_samples_split`` represents the minimum number. @@ -405,11 +408,11 @@ class RandomForestClassifier( and ``ceil(min_samples_split * n_rows)`` is the minimum number of samples for each split. max_samples: Ratio of dataset rows used while fitting each tree. - max_leaves: + max_leaves: int (default = -1) Maximum leaf nodes per tree. Soft constraint. Unlimited, if -1. - min_impurity_decrease: + min_impurity_decrease: float (default = 0.0) Minimum decrease in impurity required for node to be split. - max_batch_size: + max_batch_size: int (default = 4096) Maximum number of nodes that can be processed in a given batch. Examples @@ -835,26 +838,26 @@ class LogisticRegression( Parameters ---------- - featuresCol: str or List[str] + featuresCol: str or List[str] (default = "features") The feature column names, spark-rapids-ml supports vector, array and columnar as the input.\n * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. * When the value is a list of strings, the feature columns must be numeric types. - labelCol: + labelCol: (default = "label") The label column name. - predictionCol: + predictionCol: (default = "prediction") The class prediction column name. - probabilityCol: + probabilityCol: (default = "probability") The probability prediction column name. - rawPredictionCol: + rawPredictionCol: (default = "rawPrediction") The column name for class raw predictions - this is currently set equal to probabilityCol values. - maxIter: + maxIter: (default = 100) The maximum number of iterations of the underlying L-BFGS algorithm. - regParam: + regParam: (default = 0.0) The regularization parameter. - elasticNetParam: + elasticNetParam: (default = 0.0) The ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - tol: + tol: (default = 1e-6) The convergence tolerance. enable_sparse_data_optim: None or boolean, optional (default=None) If features column is VectorUDT type, Spark rapids ml relies on this parameter to decide whether to use dense array or sparse array in cuml. 
@@ -862,9 +865,9 @@ class LogisticRegression( If False, always uses dense array. This is favorable if the majority of VectorUDT vectors are DenseVector. If True, always uses sparse array. This is favorable if the majority of the VectorUDT vectors are SparseVector. Note this is only supported in spark >= 3.4. - fitIntercept: + fitIntercept: (default = True) Whether to fit an intercept term. - standardization: + standardization: (default = True) Whether to standardize the training data before fit. num_workers: Number of cuML workers, where each cuML worker corresponds to one Spark task diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index 9d97dd52..28d38048 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -179,17 +179,17 @@ class KMeans(KMeansClass, _CumlEstimator, _KMeansCumlParams): Parameters ---------- - k: int (default = 8) + k: int (default = 2) the number of centers. Set this parameter to enable KMeans to learn k centers from input vectors. initMode: str (default = "k-means||") the algorithm to select initial centroids. It can be "k-means||" or "random". - maxIter: int (default = 300) + maxIter: int (default = 20) the maximum iterations the algorithm will run to learn the k centers. More iterations help generate more accurate centers. - seed: int (default = 1) + seed: int (default = None) the random seed used by the algorithm to initialize a set of k random centers to start with. tol: float (default = 1e-4) @@ -298,6 +298,12 @@ def __init__( super().__init__() self._set_params(**self._input_kwargs) + def setInitMode(self, value: str) -> "KMeans": + """ + Sets the value of :py:attr:`initMode`. + """ + return self._set_params(initMode=value) + def setK(self, value: int) -> "KMeans": """ Sets the value of :py:attr:`k`. @@ -483,7 +489,7 @@ def _construct_kmeans() -> CumlT: kmeans = CumlKMeansMG(output_type="cudf", **cuml_alg_params) from spark_rapids_ml.utils import cudf_to_cuml_array - kmeans.n_cols = n_cols + kmeans.n_features_in_ = n_cols kmeans.dtype = np.dtype(dtype) kmeans.cluster_centers_ = cudf_to_cuml_array( np.array(cluster_centers_).astype(dtype), order=array_order @@ -512,7 +518,6 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "algorithm": "brute", "verbose": False, "max_mbytes_per_batch": None, - "calc_core_sample_indices": False, } def _pyspark_class(self) -> Optional[ABCMeta]: @@ -528,7 +533,6 @@ def __init__(self) -> None: metric="euclidean", algorithm="brute", max_mbytes_per_batch=None, - calc_core_sample_indices=True, idCol=alias.row_number, ) @@ -578,16 +582,6 @@ def __init__(self) -> None: typeConverter=TypeConverters.toInt, ) - calc_core_sample_indices = Param( - Params._dummy(), - "calc_core_sample_indices", - ( - f"Indicates whether the indices of the core samples should be calculated." - f"Setting this to False will avoid unnecessary kernel launches" - ), - typeConverter=TypeConverters.toBoolean, - ) - idCol = Param( Params._dummy(), "idCol", @@ -645,12 +639,12 @@ class DBSCAN(DBSCANClass, _CumlEstimator, _DBSCANCumlParams): Parameters ---------- - featuresCol: str or List[str] + featuresCol: str or List[str] (default = "features") The feature column names, spark-rapids-ml supports vector, array and columnar as the input.\n * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. * When the value is a list of strings, the feature columns must be numeric types. 
- predictionCol: str + predictionCol: str (default = "prediction") the name of the column that stores cluster indices of input vectors. predictionCol should be set when users expect to apply the transform function of a learned model. num_workers: @@ -687,14 +681,12 @@ class DBSCAN(DBSCANClass, _CumlEstimator, _DBSCANCumlParams): This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance computations more tractable for large numbers of samples. If you are experiencing out of memory errors when running DBSCAN, you can set this value based on the memory size of your device. - calc_core_sample_indices(optional): boolean (default = True) - Indicates whether the indices of the core samples should be calculated. - Setting this to False will avoid unnecessary kernel launches - idCol: str (default = 'unique_id') The internal unique id column name for label matching, will not reveal in the output. Need to be set to a name that does not conflict with an existing column name in the original input data. + Note: We currently do not support calculating and storing the indices of the core samples via the parameter calc_core_sample_indices=True. + Examples ---------- >>> from spark_rapids_ml.clustering import DBSCAN @@ -765,7 +757,6 @@ def __init__( metric: str = "euclidean", algorithm: str = "brute", max_mbytes_per_batch: Optional[int] = None, - calc_core_sample_indices: bool = True, verbose: Union[int, bool] = False, **kwargs: Any, ) -> None: @@ -778,8 +769,8 @@ def __init__( assert max_records_per_batch_str is not None self.max_records_per_batch = int(max_records_per_batch_str) self.BROADCAST_LIMIT = 8 << 30 - self.verbose = verbose + self.cuml_params["calc_core_sample_indices"] = False # currently not supported def setEps(self: P, value: float) -> P: return self._set_params(eps=value) @@ -811,12 +802,6 @@ def setMaxMbytesPerBatch(self: P, value: Optional[int]) -> P: def getMaxMbytesPerBatch(self) -> Optional[int]: return self.getOrDefault("max_mbytes_per_batch") - def setCalcCoreSampleIndices(self: P, value: bool) -> P: - return self._set_params(calc_core_sample_indices=value) - - def getCalcCoreSampleIndices(self) -> bool: - return self.getOrDefault("calc_core_sample_indices") - def _fit(self, dataset: DataFrame) -> _CumlModel: if self.getMetric() == "precomputed": raise ValueError( @@ -829,6 +814,7 @@ def _fit(self, dataset: DataFrame) -> _CumlModel: model._num_workers = self.num_workers self._copyValues(model) + self._copy_cuml_params(model) # type: ignore return model @@ -970,13 +956,7 @@ def _cuml_fit( dbscan = CumlDBSCANMG( handle=params[param_alias.handle], output_type="cudf", - eps=self.getOrDefault("eps"), - min_samples=self.getOrDefault("min_samples"), - metric=self.getOrDefault("metric"), - algorithm=self.getOrDefault("algorithm"), - max_mbytes_per_batch=self.getOrDefault("max_mbytes_per_batch"), - calc_core_sample_indices=self.getOrDefault("calc_core_sample_indices"), - verbose=self.verbose, + **params[param_alias.cuml_init], ) dbscan.n_cols = params[param_alias.num_cols] dbscan.dtype = np.dtype(dtype) diff --git a/python/src/spark_rapids_ml/core.py b/python/src/spark_rapids_ml/core.py index 064091aa..455aa6e7 100644 --- a/python/src/spark_rapids_ml/core.py +++ b/python/src/spark_rapids_ml/core.py @@ -751,7 +751,9 @@ def _train_udf(pdf_iter: Iterator[pd.DataFrame]) -> pd.DataFrame: concated_nnz = sum(triplet[0].nnz for triplet in inputs) # type: ignore if concated_nnz > np.iinfo(np.int32).max: logger.warn( - "the number of non-zero values of a partition 
is larger than the int32 index dtype of cupyx csr_matrix" + f"The number of non-zero values of a partition exceeds the int32 index dtype. \ + cupyx csr_matrix currently does not support int64 indices (https://github.com/cupy/cupy/issues/3513); \ + keeping as scipy csr_matrix to avoid overflow." ) else: inputs = [ @@ -775,6 +777,14 @@ def _train_udf(pdf_iter: Iterator[pd.DataFrame]) -> pd.DataFrame: logger.info("Invoking cuml fit") + # pyspark uses sighup to kill python workers gracefully, and for some reason + # the signal handler for sighup needs to be explicitly reset at this point + # to avoid having SIGHUP be swallowed during a usleep call in the nccl library. + # this helps avoid zombie surviving python workers when some workers fail. + import signal + + signal.signal(signal.SIGHUP, signal.SIG_DFL) + # call the cuml fit function # *note*: cuml_fit_func may delete components of inputs to free # memory. do not rely on inputs after this call. @@ -1398,7 +1408,8 @@ def process_pdf_iter( yield pdf else: pdfs = [pdf for pdf in pdf_iter] - yield pd.concat(pdfs, ignore_index=True) + if (len(pdfs)) > 0: + yield pd.concat(pdfs, ignore_index=True) processed_pdf_iter = process_pdf_iter(pdf_iter) has_row_number = None diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index 43369465..81d3dcaf 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -15,6 +15,7 @@ # import asyncio +import inspect import math from abc import ABCMeta, abstractmethod from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union @@ -61,7 +62,7 @@ param_alias, ) from .metrics import EvalMetricInfo -from .params import HasIDCol, P, _CumlClass, _CumlParams +from .params import DictTypeConverters, HasIDCol, P, _CumlClass, _CumlParams from .utils import ( _concat_and_free, _get_class_or_callable_name, @@ -601,7 +602,20 @@ def kneighbors( alias.label, lit(self._label_isquery) ) - union_df = self._processed_item_df.union(processed_query_df) + def select_cols_for_cuml_fit(df_origin: DataFrame) -> DataFrame: + cols_for_nns = [self._getIdColOrDefault(), alias.label] + input_col, input_cols = self._get_input_columns() + if input_col is not None: + cols_for_nns.append(input_col) + else: + assert input_cols is not None + cols_for_nns += input_cols + + return df_origin.select(cols_for_nns) + + df_item_for_nns = select_cols_for_cuml_fit(self._processed_item_df) + df_query_for_nns = select_cols_for_cuml_fit(processed_query_df) + union_df = df_item_for_nns.union(df_query_for_nns) pipelinedrdd = self._call_cuml_fit_func(union_df, partially_collect=False) pipelinedrdd = pipelinedrdd.repartition(query_default_num_partitions) # type: ignore @@ -808,17 +822,6 @@ def _pyspark_class(self) -> Optional[ABCMeta]: return None -class DictTypeConverters(TypeConverters): - @staticmethod - def _toDict(value: Any) -> Dict[str, Any]: - """ - Convert a value to a Dict type for Param typeConverter, if possible. - """ - if isinstance(value, Dict): - return {TypeConverters.toString(k): v for k, v in value.items()} - raise TypeError("Could not convert %s to Dict[str, Any]" % value) - - class _ApproximateNearestNeighborsParams(_NearestNeighborsCumlParams): def __init__(self) -> None: super().__init__() @@ -922,6 +925,12 @@ class ApproximateNearestNeighbors( k: int (default = 5) the default number of approximate nearest neighbors to retrieve for each query. 
+ If fewer than k neighbors are found for a query (for example, due to a small nprobe value): + (1)In ivfflat and ivfpq: + (a) If no item vector is probed, the indices are filled with long_max (9,223,372,036,854,775,807) and distances are set to infinity. + (b) If at least one item vector is probed, the indices are filled with the top-1 neighbor's ID, and distances are filled with infinity. + (2) cagra does not have this problem, as at least itopk_size (where itopk_size ≥ k) items are always probed. + algorithm: str (default = 'ivfflat') the algorithm parameter to be passed into cuML. It currently must be 'ivfflat', 'ivfpq' or 'cagra'. Other algorithms are expected to be supported later. @@ -1096,6 +1105,12 @@ def __init__( "ivfpq", "cagra", }, "currently only ivfflat, ivfpq, and cagra are supported" + if not self._input_kwargs.get("float32_inputs", True): + get_logger(self.__class__).warning( + "This estimator supports only float32 inputs on GPU and will convert all other data types to float32. Setting float32_inputs to False will be ignored." + ) + self._input_kwargs.pop("float32_inputs") + self._set_params(**self._input_kwargs) def _fit(self, item_df: DataFrame) -> "ApproximateNearestNeighborsModel": # type: ignore @@ -1301,6 +1316,49 @@ def _cal_cagra_params_and_check( return (cagra_index_params, cagra_search_params) + @classmethod + def _cal_cuvs_ivf_flat_params_and_check( + cls, algoParams: Optional[Dict[str, Any]], metric: str, topk: int + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + ivfflat_index_params: Dict[str, Any] = {"metric": metric} + ivfflat_search_params: Dict[str, Any] = {} + + # support both cuml names (nlist, nprobe) and cuvs names (n_lists, n_probes) + if algoParams is not None: + for p in algoParams: + if p in {"n_probes", "nprobe"}: + ivfflat_search_params["n_probes"] = algoParams[p] + elif p in {"n_lists", "nlist"}: + ivfflat_index_params["n_lists"] = algoParams[p] + else: + ivfflat_index_params[p] = algoParams[p] + + return (ivfflat_index_params, ivfflat_search_params) + + @classmethod + def _cal_cuvs_ivf_pq_params_and_check( + cls, algoParams: Optional[Dict[str, Any]], metric: str, topk: int + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + pq_index_params: Dict[str, Any] = {"metric": metric} + pq_search_params: Dict[str, Any] = {} + + if algoParams is not None: + for p in algoParams: + if p in {"n_probes", "nprobe"}: + pq_search_params["n_probes"] = algoParams[p] + elif p in {"lut_dtype", "internal_distance_dtype"}: + pq_search_params[p] = algoParams[p] + elif p in {"n_lists", "nlist"}: + pq_index_params["n_lists"] = algoParams[p] + elif p in {"M", "pq_dim"}: + pq_index_params["pq_dim"] = algoParams[p] + elif p in {"n_bits", "pq_bits"}: + pq_index_params["pq_bits"] = algoParams[p] + else: + pq_index_params[p] = algoParams[p] + + return (pq_index_params, pq_search_params) + def kneighbors( self, query_df: DataFrame, sort_knn_df_by_query_id: bool = True ) -> Tuple[DataFrame, DataFrame, DataFrame]: @@ -1385,10 +1443,21 @@ def _get_cuml_transform_func( "sqeuclidean", "inner_product", "l2", + "cosine", } - if cuml_alg_params["algorithm"] == "cagra": - cagra_index_params, cagra_search_params = self._cal_cagra_params_and_check( + if ( + cuml_alg_params["algorithm"] != "brute" + ): # brute links to CPUNearestNeighborsModel of benchmark.bench_nearest_neighbors + if cuml_alg_params["algorithm"] == "cagra": + check_fn = self._cal_cagra_params_and_check + elif cuml_alg_params["algorithm"] in {"ivf_flat", "ivfflat"}: + check_fn = self._cal_cuvs_ivf_flat_params_and_check + 
else: + assert cuml_alg_params["algorithm"] in {"ivf_pq", "ivfpq"} + check_fn = self._cal_cuvs_ivf_pq_params_and_check + + index_params, search_params = check_fn( algoParams=self.cuml_params["algo_params"], metric=self.cuml_params["metric"], topk=cuml_alg_params["n_neighbors"], @@ -1396,25 +1465,19 @@ def _get_cuml_transform_func( def _construct_sgnn() -> CumlT: - if cuml_alg_params["algorithm"] in {"ivfflat", "ivfpq"}: - from cuml.neighbors import NearestNeighbors as SGNN - - # Currently 'usePrecomputedTables' is required by cuml cython API, though the value is ignored in C++. - if ( - cuml_alg_params["algorithm"] == "ivfpq" - and cuml_alg_params["algo_params"] - ): - if "usePrecomputedTables" not in cuml_alg_params["algo_params"]: - cuml_alg_params["algo_params"]["usePrecomputedTables"] = False + if cuml_alg_params["algorithm"] in {"ivf_pq", "ivfpq"}: + from cuvs.neighbors import ivf_pq - nn_object = SGNN(output_type="cupy", **cuml_alg_params) + return ivf_pq + elif cuml_alg_params["algorithm"] in {"ivfflat" or "ivf_flat"}: + from cuvs.neighbors import ivf_flat - return nn_object + return ivf_flat else: assert cuml_alg_params["algorithm"] == "cagra" from cuvs.neighbors import cagra - return "cagra" + return cagra row_number_col = alias.row_number input_col, input_cols = self._get_input_columns() @@ -1432,20 +1495,21 @@ def _transform_internal( nn_object: CumlT, df: Union[pd.DataFrame, np.ndarray] ) -> pd.DataFrame: - item_row_number = df[row_number_col].to_numpy() + item_row_number = df[row_number_col].to_numpy(dtype=np.int64) item = df.drop(row_number_col, axis=1) # type: ignore if input_col is not None: assert len(item.columns) == 1 item = np.array(list(item[item.columns[0]]), order="C") - if len(item) == 0: - return pd.DataFrame( + if len(item) == 0 or len(bcast_qfeatures.value) == 0: + res = pd.DataFrame( { - f"query_{id_col_name}": [], - "indices": [], - "distances": [], + f"query_{id_col_name}": pd.Series(dtype="int64"), + "indices": pd.Series(dtype="object"), + "distances": pd.Series(dtype="object"), } ) + return res import cupy as cp from pyspark import TaskContext @@ -1459,17 +1523,40 @@ def _transform_internal( start_time = time.time() - if nn_object != "cagra": + if not inspect.ismodule( + nn_object + ): # derived class (e.g. benchmark.bench_nearest_neighbors.CPUNearestNeighborsModel) nn_object.fit(item) - else: - from cuvs.neighbors import cagra - - build_params = cagra.IndexParams(**cagra_index_params) + else: # cuvs ivf_flat or cagra + build_params = nn_object.IndexParams(**index_params) # cuvs does not take pd.DataFrame as input if isinstance(item, pd.DataFrame): item = cp.array(item.to_numpy(), order="C", dtype="float32") - cagra_index_obj = cagra.build(build_params, item) + if isinstance(item, np.ndarray): + item = cp.array(item, dtype="float32") + + try: + index_obj = nn_object.build(build_params, item) + except Exception as e: + if "k must be less than topk::kMaxCapacity (256)" in str(e): + from cuvs.neighbors import cagra + + assert nn_object == cagra + assert ( + "build_algo" not in index_params + or index_params["build_algo"] == "ivf_pq" + ) + + intermediate_graph_degree = ( + build_params.intermediate_graph_degree + ) + assert intermediate_graph_degree >= 256 + + error_msg = f"cagra with ivf_pq build_algo expects intermediate_graph_degree ({intermediate_graph_degree}) to be smaller than 256" + raise ValueError(error_msg) + else: + raise e logger.info( f"partition {pid} indexing finished in {time.time() - start_time} seconds." 
@@ -1477,35 +1564,59 @@ def _transform_internal( start_time = time.time() - if nn_object != "cagra": + if not inspect.ismodule( + nn_object + ): # derived class (e.g. benchmark.bench_nearest_neighbors.CPUNearestNeighborsModel) distances, indices = nn_object.kneighbors(bcast_qfeatures.value) - else: + else: # cuvs ivf_flat cagra ivf_pq gpu_qfeatures = cp.array( bcast_qfeatures.value, order="C", dtype="float32" ) - distances, indices = cagra.search( - cagra.SearchParams(**cagra_search_params), - cagra_index_obj, + assert cuml_alg_params["n_neighbors"] <= len( + item + ), "k is larger than the number of item vectors on a GPU. Please increase the dataset size or use less GPUs" + + distances, indices = nn_object.search( + nn_object.SearchParams(**search_params), + index_obj, gpu_qfeatures, cuml_alg_params["n_neighbors"], ) + + if cuml_alg_params["algorithm"] in {"ivf_pq", "ivfpq"}: + from cuvs.neighbors import refine + + distances, indices = refine( + dataset=item, + queries=gpu_qfeatures, + candidates=indices, + k=cuml_alg_params["n_neighbors"], + metric=cuml_alg_params["metric"], + ) + distances = cp.asarray(distances) indices = cp.asarray(indices) - # Note cuML kneighbors applys an extra square root on the l2 distances. - # Here applies square to obtain the actual l2 distances. - if cuml_alg_params["algorithm"] in {"ivfflat", "ivfpq"}: - if ( - cuml_alg_params["metric"] == "euclidean" - or cuml_alg_params["metric"] == "l2" - ): - distances = distances * distances + # in case refine API reset inf distances to 0. + if cuml_alg_params["algorithm"] in {"ivf_pq", "ivfpq"}: + distances[indices >= len(item)] = float("inf") + + # for the case top-1 nn got filled into indices + top1_ind = indices[:, 0] + rest_indices = indices[:, 1:] + rest_distances = distances[:, 1:] + rest_distances[rest_indices == top1_ind[:, cp.newaxis]] = float( + "inf" + ) if isinstance(distances, cp.ndarray): distances = distances.get() + # in case a query did not probe any items, indices are filled with int64 max and distances are filled with inf + item_row_number = np.append(item_row_number, np.iinfo("int64").max) if isinstance(indices, cp.ndarray): + indices[indices >= len(item)] = len(item) indices = indices.get() indices_global = item_row_number[indices] diff --git a/python/src/spark_rapids_ml/params.py b/python/src/spark_rapids_ml/params.py index be94bc14..156d7279 100644 --- a/python/src/spark_rapids_ml/params.py +++ b/python/src/spark_rapids_ml/params.py @@ -145,6 +145,9 @@ def _param_mapping(cls) -> Dict[str, Optional[str]]: - empty string, if a defined Spark Param should just be silently ignored, or - None, if a defined Spark Param should raise an error. + For algorithms without a Spark equivalent, the mapping can be left empty, with the exception + of parameters for which we override the cuML default value with our own: these should include an identity mapping, e.g. {"param": "param"}. + Note: standard Spark column Params, e.g. inputCol, featureCol, etc, should not be listed in this mapping, since they are handled differently. 
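To make the mapping convention above concrete, here is a minimal sketch of what such a `_param_mapping` could look like for a hypothetical estimator with no Spark ML equivalent. The class name `_MyAlgoClass` and the `eps` parameter are purely illustrative and are not part of this change.

    from typing import Dict, Optional

    from spark_rapids_ml.params import _CumlClass


    class _MyAlgoClass(_CumlClass):
        @classmethod
        def _param_mapping(cls) -> Dict[str, Optional[str]]:
            # No Spark ML counterpart exists, so most cuML params need no entry;
            # "eps" is listed only because its cuML default is overridden here,
            # hence the identity mapping described in the docstring above.
            return {"eps": "eps"}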
@@ -331,7 +334,7 @@ def _set_params(self: P, **kwargs: Any) -> P: elif isinstance(v, List): self._set(**{"featuresCols": v}) elif self.hasParam(k): - # standard Spark ML Param + # Param is declared as a Spark ML Param self._set(**{str(k): v}) # type: ignore self._set_cuml_param(k, v, silent=False) elif k in self.cuml_params: @@ -479,6 +482,9 @@ def _get_cuml_param(self, spark_param: str, silent: bool = True) -> Optional[str cuml_param = None return cuml_param + elif spark_param in self.cuml_params: + # cuML param that is declared as a Spark param (e.g., for algos w/out Spark equivalents) + return spark_param else: return None @@ -486,6 +492,8 @@ def _set_cuml_param( self, spark_param: str, spark_value: Any, silent: bool = True ) -> None: """Set a cuml_params parameter for a given Spark Param and value. + The Spark Param may be a cuML param that is declared as a Spark param (e.g., for algos w/out Spark equivalents), + in which case the cuML param will be returned from _get_cuml_param. Parameters ---------- @@ -552,3 +560,15 @@ def _set_cuml_value(self, k: str, v: Any) -> None: """ value_map = self._get_cuml_mapping_value(k, v) self._cuml_params[k] = value_map + + +class DictTypeConverters(TypeConverters): + @staticmethod + def _toDict(value: Any) -> Dict[str, Any]: + """ + Convert a value to a Dict type for Param typeConverter, if possible. + Used to support Dict types with the Spark ML Param API. + """ + if isinstance(value, Dict): + return {TypeConverters.toString(k): v for k, v in value.items()} + raise TypeError("Could not convert %s to Dict[str, Any]" % value) diff --git a/python/src/spark_rapids_ml/regression.py b/python/src/spark_rapids_ml/regression.py index 997c3915..1bed8a1f 100644 --- a/python/src/spark_rapids_ml/regression.py +++ b/python/src/spark_rapids_ml/regression.py @@ -311,31 +311,31 @@ class LinearRegression( Parameters ---------- - featuresCol: str or List[str] + featuresCol: str or List[str] (default = "features") The feature column names, spark-rapids-ml supports vector, array and columnar as the input.\n * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. * When the value is a list of strings, the feature columns must be numeric types. - labelCol: + labelCol: str (default = "label") The label column name. - predictionCol: + predictionCol: str (default = "prediction") The prediction column name. - maxIter: + maxIter: int (default = 100) Max number of iterations (>= 0). - regParam: - Regularization parameter (>= 0) - elasticNetParam: + regParam: float (default = 0.0) + Regularization parameter (>= 0). + elasticNetParam: float (default = 0.0) The ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - tol: + tol: float (default = 1e-6) The convergence tolerance for iterative algorithms (>= 0). - fitIntercept: - whether to fit an intercept term. - standardization: + fitIntercept: bool (default = True) + Whether to fit an intercept term. + standardization: bool (default = True) Whether to standardize the training features before fitting the model. - solver: + solver: str (default = "auto") The solver algorithm for optimization. If this is not set or empty, default value is 'auto'.\n The supported options: 'auto', 'normal' and 'eig', all of them will be mapped to 'eig' in cuML. - loss: + loss: str (default = "squaredError") The loss function to be optimized. 
The supported options: 'squaredError' num_workers: @@ -840,25 +840,25 @@ class RandomForestRegressor( Parameters ---------- - featuresCol: str or List[str] + featuresCol: str or List[str] (default = "features") The feature column names, spark-rapids-ml supports vector, array and columnar as the input.\n * When the value is a string, the feature columns must be assembled into 1 column with vector or array type. * When the value is a list of strings, the feature columns must be numeric types. - labelCol: + labelCol: str (default = "label") The label column name. - predictionCol: + predictionCol: str (default = "prediction") The prediction column name. - maxDepth: + maxDepth: int (default = 5) Maximum tree depth. Must be greater than 0. - maxBins: + maxBins: int (default = 32) Maximum number of bins used by the split algorithm per feature. - minInstancesPerNode: + minInstancesPerNode: int (default = 1) The minimum number of samples (rows) in each leaf node. - impurity: str = "variance", + impurity: str (default = "variance") The criterion used to split nodes. - numTrees: + numTrees: int (default = 20) Total number of trees in the forest. - featureSubsetStrategy: + featureSubsetStrategy: str (default = "auto") Ratio of number of features (columns) to consider per node split.\n The supported options:\n ``'auto'``: If numTrees == 1, set to 'all', If numTrees > 1 (forest), set to 'onethird'\n @@ -868,9 +868,9 @@ class RandomForestRegressor( ``'log2'``: log2(number of features)\n ``'n'``: when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features. - seed: + seed: int (default = None) Seed for the random number generator. - bootstrap: + bootstrap: bool (default = True) Control bootstrapping.\n * If ``True``, each tree in the forest is built on a bootstrapped sample with replacement. @@ -888,11 +888,11 @@ class RandomForestRegressor( * ``4 or False`` - Enables all messages up to and including information messages. * ``5 or True`` - Enables all messages up to and including debug messages. * ``6`` - Enables all messages up to and including trace messages. - n_streams: + n_streams: int (default = 1) Number of parallel streams used for forest building. Please note that there is a bug running spark-rapids-ml on a node with multi-gpus when n_streams > 1. See https://github.com/rapidsai/cuml/issues/5402. - min_samples_split: + min_samples_split: int or float (default = 2) The minimum number of samples required to split an internal node.\n * If type ``int``, then ``min_samples_split`` represents the minimum number. @@ -900,11 +900,11 @@ class RandomForestRegressor( and ``ceil(min_samples_split * n_rows)`` is the minimum number of samples for each split. max_samples: Ratio of dataset rows used while fitting each tree. - max_leaves: + max_leaves: int (default = -1) Maximum leaf nodes per tree. Soft constraint. Unlimited, if -1. - min_impurity_decrease: + min_impurity_decrease: float (default = 0.0) Minimum decrease in impurity required for node to be split. - max_batch_size: + max_batch_size: int (default = 4096) Maximum number of nodes that can be processed in a given batch. 
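As a quick, illustrative sketch of the parameters documented above (the DataFrames, column names, and values are assumptions, not part of this change):

    from spark_rapids_ml.regression import RandomForestRegressor

    rf = RandomForestRegressor(
        featuresCol="features",        # assembled vector/array column
        labelCol="label",
        maxDepth=5,
        numTrees=20,
        featureSubsetStrategy="auto",  # 'all' if numTrees == 1, else 'onethird'
    )
    model = rf.fit(train_df)                 # train_df: an assumed Spark DataFrame
    predictions = model.transform(test_df)   # adds the "prediction" column by default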
diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index ed9fabf1..2ce0abf5 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -34,6 +34,7 @@ import numpy as np import pandas as pd import pyspark +import scipy from pandas import DataFrame as PandasDataFrame from pyspark.ml.param.shared import ( HasFeaturesCol, @@ -50,13 +51,12 @@ ArrayType, DoubleType, FloatType, + IntegerType, Row, StructField, StructType, ) -from spark_rapids_ml.core import FitInputType, _CumlModel - from .core import ( CumlT, FitInputType, @@ -66,19 +66,30 @@ _CumlEstimatorSupervised, _CumlModel, _CumlModelReader, + _CumlModelWithColumns, _CumlModelWriter, _EvaluateFunc, + _read_csr_matrix_from_unwrapped_spark_vec, _TransformFunc, + _use_sparse_in_cuml, alias, param_alias, ) from .metrics import EvalMetricInfo -from .params import HasFeaturesCols, P, _CumlClass, _CumlParams +from .params import ( + DictTypeConverters, + HasEnableSparseDataOptim, + HasFeaturesCols, + P, + _CumlClass, + _CumlParams, +) from .utils import ( _ArrayOrder, _concat_and_free, _get_spark_session, _is_local, + dtype_to_pyspark_type, get_logger, ) @@ -97,6 +108,7 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "n_neighbors": 15, "n_components": 2, "metric": "euclidean", + "metric_kwds": None, "n_epochs": None, "learning_rate": 1.0, "init": "spectral", @@ -112,6 +124,8 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "precomputed_knn": None, "random_state": None, "verbose": False, + "build_algo": "auto", + "build_kwds": None, } def _pyspark_class(self) -> Optional[ABCMeta]: @@ -119,7 +133,12 @@ def _pyspark_class(self) -> Optional[ABCMeta]: class _UMAPCumlParams( - _CumlParams, HasFeaturesCol, HasFeaturesCols, HasLabelCol, HasOutputCol + _CumlParams, + HasFeaturesCol, + HasFeaturesCols, + HasLabelCol, + HasOutputCol, + HasEnableSparseDataOptim, ): def __init__(self) -> None: super().__init__() @@ -127,6 +146,7 @@ def __init__(self) -> None: n_neighbors=15, n_components=2, metric="euclidean", + metric_kwds=None, n_epochs=None, learning_rate=1.0, init="spectral", @@ -141,6 +161,8 @@ def __init__(self) -> None: b=None, precomputed_knn=None, random_state=None, + build_algo="auto", + build_kwds=None, sample_fraction=1.0, outputCol="embedding", ) @@ -172,12 +194,22 @@ def __init__(self) -> None: ( f"Distance metric to use. Supported distances are ['l1', 'cityblock', 'taxicab', 'manhattan', 'euclidean', 'l2'," f" 'sqeuclidean', 'canberra', 'minkowski', 'chebyshev', 'linf', 'cosine', 'correlation', 'hellinger', 'hamming'," - f" 'jaccard']. Metrics that take arguments (such as minkowski) can have arguments passed via the metric_kwds" - f" dictionary." + f" 'jaccard'] Metrics that take arguments (such as minkowski) can have arguments passed via the metric_kwds dictionary." + f" Note: The 'jaccard' distance metric is only supported for sparse inputs." ), typeConverter=TypeConverters.toString, ) + metric_kwds = Param( + Params._dummy(), + "metric_kwds", + ( + f"Additional keyword arguments for the metric function. If the metric function takes additional arguments, they" + f" should be passed in this dictionary." + ), + typeConverter=DictTypeConverters._toDict, + ) + n_epochs = Param( Params._dummy(), "n_epochs", @@ -329,6 +361,27 @@ def __init__(self) -> None: typeConverter=TypeConverters.toInt, ) + build_algo = Param( + Params._dummy(), + "build_algo", + ( + f"How to build the knn graph. Supported build algorithms are ['auto', 'brute_force_knn', 'nn_descent']. 
'auto' chooses" + f" to run with brute force knn if number of data rows is smaller than or equal to 50K. Otherwise, runs with nn descent." + ), + typeConverter=TypeConverters.toString, + ) + + build_kwds = Param( + Params._dummy(), + "build_kwds", + ( + f"Build algorithm argument {{'nnd_graph_degree': 64, 'nnd_intermediate_graph_degree': 128, 'nnd_max_iterations': 20," + f" 'nnd_termination_threshold': 0.0001, 'nnd_return_distances': True, 'nnd_n_clusters': 1}} Note that nnd_n_clusters > 1" + f" will result in batch-building with NN Descent." + ), + typeConverter=DictTypeConverters._toDict, + ) + sample_fraction = Param( Params._dummy(), "sample_fraction", @@ -340,7 +393,7 @@ def __init__(self) -> None: typeConverter=TypeConverters.toFloat, ) - def getNNeighbors(self) -> float: + def getNNeighbors(self: P) -> float: """ Gets the value of `n_neighbors`. """ @@ -352,7 +405,7 @@ def setNNeighbors(self: P, value: float) -> P: """ return self._set_params(n_neighbors=value) - def getNComponents(self) -> int: + def getNComponents(self: P) -> int: """ Gets the value of `n_components`. """ @@ -364,7 +417,7 @@ def setNComponents(self: P, value: int) -> P: """ return self._set_params(n_components=value) - def getMetric(self) -> str: + def getMetric(self: P) -> str: """ Gets the value of `metric`. """ @@ -376,7 +429,19 @@ def setMetric(self: P, value: str) -> P: """ return self._set_params(metric=value) - def getNEpochs(self) -> int: + def getMetricKwds(self: P) -> Optional[Dict[str, Any]]: + """ + Gets the value of `metric_kwds`. + """ + return self.getOrDefault("metric_kwds") + + def setMetricKwds(self: P, value: Dict[str, Any]) -> P: + """ + Sets the value of `metric_kwds`. + """ + return self._set_params(metric_kwds=value) + + def getNEpochs(self: P) -> int: """ Gets the value of `n_epochs`. """ @@ -388,7 +453,7 @@ def setNEpochs(self: P, value: int) -> P: """ return self._set_params(n_epochs=value) - def getLearningRate(self) -> float: + def getLearningRate(self: P) -> float: """ Gets the value of `learning_rate`. """ @@ -400,7 +465,7 @@ def setLearningRate(self: P, value: float) -> P: """ return self._set_params(learning_rate=value) - def getInit(self) -> str: + def getInit(self: P) -> str: """ Gets the value of `init`. """ @@ -412,7 +477,7 @@ def setInit(self: P, value: str) -> P: """ return self._set_params(init=value) - def getMinDist(self) -> float: + def getMinDist(self: P) -> float: """ Gets the value of `min_dist`. """ @@ -424,7 +489,7 @@ def setMinDist(self: P, value: float) -> P: """ return self._set_params(min_dist=value) - def getSpread(self) -> float: + def getSpread(self: P) -> float: """ Gets the value of `spread`. """ @@ -436,7 +501,7 @@ def setSpread(self: P, value: float) -> P: """ return self._set_params(spread=value) - def getSetOpMixRatio(self) -> float: + def getSetOpMixRatio(self: P) -> float: """ Gets the value of `set_op_mix_ratio`. """ @@ -448,7 +513,7 @@ def setSetOpMixRatio(self: P, value: float) -> P: """ return self._set_params(set_op_mix_ratio=value) - def getLocalConnectivity(self) -> float: + def getLocalConnectivity(self: P) -> float: """ Gets the value of `local_connectivity`. """ @@ -460,7 +525,7 @@ def setLocalConnectivity(self: P, value: float) -> P: """ return self._set_params(local_connectivity=value) - def getRepulsionStrength(self) -> float: + def getRepulsionStrength(self: P) -> float: """ Gets the value of `repulsion_strength`. 
""" @@ -472,7 +537,7 @@ def setRepulsionStrength(self: P, value: float) -> P: """ return self._set_params(repulsion_strength=value) - def getNegativeSampleRate(self) -> int: + def getNegativeSampleRate(self: P) -> int: """ Gets the value of `negative_sample_rate`. """ @@ -484,7 +549,7 @@ def setNegativeSampleRate(self: P, value: int) -> P: """ return self._set_params(negative_sample_rate=value) - def getTransformQueueSize(self) -> float: + def getTransformQueueSize(self: P) -> float: """ Gets the value of `transform_queue_size`. """ @@ -496,7 +561,7 @@ def setTransformQueueSize(self: P, value: float) -> P: """ return self._set_params(transform_queue_size=value) - def getA(self) -> float: + def getA(self: P) -> float: """ Gets the value of `a`. """ @@ -508,7 +573,7 @@ def setA(self: P, value: float) -> P: """ return self._set_params(a=value) - def getB(self) -> float: + def getB(self: P) -> float: """ Gets the value of `b`. """ @@ -520,7 +585,7 @@ def setB(self: P, value: float) -> P: """ return self._set_params(b=value) - def getPrecomputedKNN(self) -> List[List[float]]: + def getPrecomputedKNN(self: P) -> List[List[float]]: """ Gets the value of `precomputed_knn`. """ @@ -532,7 +597,7 @@ def setPrecomputedKNN(self: P, value: List[List[float]]) -> P: """ return self._set_params(precomputed_knn=value) - def getRandomState(self) -> int: + def getRandomState(self: P) -> int: """ Gets the value of `random_state`. """ @@ -544,7 +609,31 @@ def setRandomState(self: P, value: int) -> P: """ return self._set_params(random_state=value) - def getSampleFraction(self) -> float: + def getBuildAlgo(self: P) -> str: + """ + Gets the value of `build_algo`. + """ + return self.getOrDefault("build_algo") + + def setBuildAlgo(self: P, value: str) -> P: + """ + Sets the value of `build_algo`. + """ + return self._set_params(build_algo=value) + + def getBuildKwds(self: P) -> Optional[Dict[str, Any]]: + """ + Gets the value of `build_kwds`. + """ + return self.getOrDefault("build_kwds") + + def setBuildKwds(self: P, value: Dict[str, Any]) -> P: + """ + Sets the value of `build_kwds`. + """ + return self._set_params(build_kwds=value) + + def getSampleFraction(self: P) -> float: """ Gets the value of `sample_fraction`. """ @@ -590,7 +679,7 @@ def setLabelCol(self: P, value: str) -> P: """ return self._set_params(labelCol=value) - def getOutputCol(self) -> str: + def getOutputCol(self: P) -> str: """ Gets the value of :py:attr:`outputCol`. Contains the embeddings of the input data. """ @@ -629,6 +718,10 @@ class UMAP(UMAPClass, _CumlEstimatorSupervised, _UMAPCumlParams): 'hamming', 'jaccard']. Metrics that take arguments (such as minkowski) can have arguments passed via the metric_kwds dictionary. + metric_kwds : dict (optional, default=None) + Additional keyword arguments for the metric function. If the metric function takes additional arguments, + they should be passed in this dictionary. + n_epochs : int (optional, default=None) The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If None is specified a value will be selected based on the size of the input dataset @@ -708,6 +801,15 @@ class UMAP(UMAPClass, _CumlEstimatorSupervised, _UMAPCumlParams): * ``5 or True`` - Enables all messages up to and including debug messages. * ``6`` - Enables all messages up to and including trace messages. + build_algo : str (optional, default='auto') + How to build the knn graph. 
Supported build algorithms are ['auto', 'brute_force_knn', 'nn_descent']. 'auto' chooses + to run with brute force knn if number of data rows is smaller than or equal to 50K. Otherwise, runs with nn descent. + + build_kwds : dict (optional, default=None) + Build algorithm argument {'nnd_graph_degree': 64, 'nnd_intermediate_graph_degree': 128, 'nnd_max_iterations': 20, + 'nnd_termination_threshold': 0.0001, 'nnd_return_distances': True, 'nnd_n_clusters': 1} Note that nnd_n_clusters > 1 + will result in batch-building with NN Descent. + sample_fraction : float (optional, default=1.0) The fraction of the dataset to be used for fitting the model. Since fitting is done on a single node, very large datasets must be subsampled to fit within the node's memory and execute in a reasonable time. Smaller fractions @@ -788,6 +890,7 @@ def __init__( n_neighbors: Optional[float] = 15, n_components: Optional[int] = 15, metric: str = "euclidean", + metric_kwds: Optional[Dict[str, Any]] = None, n_epochs: Optional[int] = None, learning_rate: Optional[float] = 1.0, init: Optional[str] = "spectral", @@ -802,12 +905,16 @@ def __init__( b: Optional[float] = None, precomputed_knn: Optional[List[List[float]]] = None, random_state: Optional[int] = None, + build_algo: Optional[str] = "auto", + build_kwds: Optional[Dict[str, Any]] = None, sample_fraction: Optional[float] = 1.0, featuresCol: Optional[Union[str, List[str]]] = None, labelCol: Optional[str] = None, outputCol: Optional[str] = None, num_workers: Optional[int] = None, - verbose: Union[int, bool] = False, + enable_sparse_data_optim: Optional[ + bool + ] = None, # will enable SparseVector inputs if first row is sparse (for any metric). **kwargs: Any, ) -> None: super().__init__() @@ -822,7 +929,6 @@ def __init__( ) assert max_records_per_batch_str is not None self.max_records_per_batch = int(max_records_per_batch_str) - self.BROADCAST_LIMIT = 8 << 30 def _create_pyspark_model(self, result: Row) -> _CumlModel: raise NotImplementedError("UMAP does not support model creation from Row") @@ -851,54 +957,36 @@ def _fit(self, dataset: DataFrame) -> "UMAPModel": pdf_output: PandasDataFrame = df_output.toPandas() - # Collect and concatenate row-by-row fit results - embeddings = np.array( - list( - pd.concat( - [pd.Series(x) for x in pdf_output["embedding_"]], ignore_index=True - ) - ), - dtype=np.float32, - ) - raw_data = np.array( - list( - pd.concat( - [pd.Series(x) for x in pdf_output["raw_data_"]], ignore_index=True - ) - ), - dtype=np.float32, - ) - del pdf_output - - def _chunk_arr( - arr: np.ndarray, BROADCAST_LIMIT: int = self.BROADCAST_LIMIT - ) -> List[np.ndarray]: - """Chunk an array, if oversized, into smaller arrays that can be broadcasted.""" - if arr.nbytes <= BROADCAST_LIMIT: - return [arr] - - rows_per_chunk = BROADCAST_LIMIT // (arr.nbytes // arr.shape[0]) - num_chunks = (arr.shape[0] + rows_per_chunk - 1) // rows_per_chunk - chunks = [ - arr[i * rows_per_chunk : (i + 1) * rows_per_chunk] - for i in range(num_chunks) - ] - - return chunks + if self._sparse_fit: + embeddings = np.array( + list( + pd.concat( + [pd.Series(x) for x in pdf_output["embedding_"]], + ignore_index=True, + ) + ), + dtype=np.float32, + ) + pdf_output["raw_data_"] = pdf_output.apply( + lambda row: scipy.sparse.csr_matrix( + (row["data"], row["indices"], row["indptr"]), + shape=row["shape"], + ).astype(np.float32), + axis=1, + ) + raw_data = scipy.sparse.vstack(pdf_output["raw_data_"], format="csr") + else: + embeddings = np.vstack(pdf_output["embedding_"]).astype(np.float32) + 
raw_data = np.vstack(pdf_output["raw_data_"]).astype(np.float32) # type: ignore - spark = _get_spark_session() - broadcast_embeddings = [ - spark.sparkContext.broadcast(chunk) for chunk in _chunk_arr(embeddings) - ] - broadcast_raw_data = [ - spark.sparkContext.broadcast(chunk) for chunk in _chunk_arr(raw_data) - ] + del pdf_output model = UMAPModel( - embedding_=broadcast_embeddings, - raw_data_=broadcast_raw_data, - n_cols=len(raw_data[0]), - dtype=type(raw_data[0][0]).__name__, + embedding_=embeddings, + raw_data_=raw_data, + sparse_fit=self._sparse_fit, + n_cols=self._n_cols, + dtype="float32", # UMAP only supports float ) model._num_workers = input_num_workers @@ -979,7 +1067,8 @@ def _call_cuml_fit_func_dataframe( cls = self.__class__ - select_cols, multi_col_names, _, _ = self._pre_process_data(dataset) + select_cols, multi_col_names, dimension, _ = self._pre_process_data(dataset) + self._n_cols = dimension dataset = dataset.select(*select_cols) @@ -1005,6 +1094,11 @@ def _call_cuml_fit_func_dataframe( cuml_verbose = self.cuml_params.get("verbose", False) + use_sparse_array = _use_sparse_in_cuml(dataset) + self._sparse_fit = use_sparse_array # param stored internally by cuml model + if self.cuml_params.get("metric") == "jaccard" and not use_sparse_array: + raise ValueError("Metric 'jaccard' not supported for dense inputs.") + chunk_size = self.max_records_per_batch def _train_udf(pdf_iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]: @@ -1014,6 +1108,7 @@ def _train_udf(pdf_iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]: logger.info("Initializing cuml context") import cupy as cp + import cupyx if cuda_managed_mem_enabled: import rmm @@ -1032,17 +1127,20 @@ def _train_udf(pdf_iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]: # handle the input # inputs = [(X, Optional(y)), (X, Optional(y))] logger.info("Loading data into python worker memory") - inputs = [] - sizes = [] + inputs: List[Any] = [] + sizes: List[int] = [] + for pdf in pdf_iter: sizes.append(pdf.shape[0]) if multi_col_names: features = np.array(pdf[multi_col_names], order=array_order) + elif use_sparse_array: + # sparse vector input + features = _read_csr_matrix_from_unwrapped_spark_vec(pdf) else: + # dense input features = np.array(list(pdf[alias.data]), order=array_order) - # experiments indicate it is faster to convert to numpy array and then to cupy array than directly - # invoking cupy array on the list - if cuda_managed_mem_enabled: + if cuda_managed_mem_enabled and not use_sparse_array: features = cp.array(features) label = pdf[alias.label] if alias.label in pdf.columns else None @@ -1051,10 +1149,25 @@ def _train_udf(pdf_iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]: ) inputs.append((features, label, row_number)) + if cuda_managed_mem_enabled and use_sparse_array: + concated_nnz = sum(triplet[0].nnz for triplet in inputs) # type: ignore + if concated_nnz > np.iinfo(np.int32).max: + logger.warn( + f"The number of non-zero values of a partition exceeds the int32 index dtype. \ + cupyx csr_matrix currently does not support int64 indices (https://github.com/cupy/cupy/issues/3513); \ + keeping as scipy csr_matrix to avoid overflow." + ) + else: + inputs = [ + (cupyx.scipy.sparse.csr_matrix(row[0]), row[1], row[2]) + for row in inputs + ] + # call the cuml fit function # *note*: cuml_fit_func may delete components of inputs to free # memory. do not rely on inputs after this call. 
embedding, raw_data = cuml_fit_func(inputs, params).values() + logger.info("Cuml fit complete") num_sections = (len(embedding) + chunk_size - 1) // chunk_size @@ -1062,15 +1175,29 @@ def _train_udf(pdf_iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]: for i in range(num_sections): start = i * chunk_size end = min((i + 1) * chunk_size, len(embedding)) - - yield pd.DataFrame( - data=[ + if use_sparse_array: + csr_chunk = raw_data[start:end] + indices = csr_chunk.indices + indptr = csr_chunk.indptr + data = csr_chunk.data + yield pd.DataFrame( + data=[ + { + "embedding_": embedding[start:end].tolist(), + "indices": indices.tolist(), + "indptr": indptr.tolist(), + "data": data.tolist(), + "shape": [end - start, dimension], + } + ] + ) + else: + yield pd.DataFrame( { "embedding_": embedding[start:end].tolist(), "raw_data_": raw_data[start:end].tolist(), } - ] - ) + ) output_df = dataset.mapInPandas(_train_udf, schema=self._out_schema()) @@ -1080,20 +1207,27 @@ def _require_nccl_ucx(self) -> Tuple[bool, bool]: return (False, False) def _out_schema(self) -> Union[StructType, str]: - return StructType( - [ - StructField( - "embedding_", - ArrayType(ArrayType(FloatType(), False), False), - False, - ), - StructField( - "raw_data_", - ArrayType(ArrayType(FloatType(), False), False), - False, - ), - ] - ) + if self._sparse_fit: + return StructType( + [ + StructField( + "embedding_", + ArrayType(ArrayType(FloatType(), False), False), + False, + ), + StructField("indices", ArrayType(IntegerType(), False), False), + StructField("indptr", ArrayType(IntegerType(), False), False), + StructField("data", ArrayType(FloatType(), False), False), + StructField("shape", ArrayType(IntegerType(), False), False), + ] + ) + else: + return StructType( + [ + StructField("embedding_", ArrayType(FloatType()), False), + StructField("raw_data_", ArrayType(FloatType()), False), + ] + ) def _pre_process_data( self, dataset: DataFrame @@ -1115,36 +1249,47 @@ def _pre_process_data( return select_cols, multi_col_names, dimension, feature_type -class UMAPModel(_CumlModel, UMAPClass, _UMAPCumlParams): +class UMAPModel(_CumlModelWithColumns, UMAPClass, _UMAPCumlParams): def __init__( self, - embedding_: List[pyspark.broadcast.Broadcast], - raw_data_: List[pyspark.broadcast.Broadcast], + embedding_: np.ndarray, + raw_data_: Union[ + np.ndarray, + scipy.sparse.csr_matrix, + ], + sparse_fit: bool, n_cols: int, dtype: str, ) -> None: super(UMAPModel, self).__init__( embedding_=embedding_, raw_data_=raw_data_, + sparse_fit=sparse_fit, n_cols=n_cols, dtype=dtype, ) self.embedding_ = embedding_ self.raw_data_ = raw_data_ + self._sparse_fit = sparse_fit # If true, raw data is a sparse CSR matrix + self.BROADCAST_LIMIT = 8 << 30 # Spark broadcast limit: 8GiB @property - def embedding(self) -> List[List[float]]: - res = [] - for chunk in self.embedding_: - res.extend(chunk.value.tolist()) - return res + def embedding(self) -> np.ndarray: + """ + Returns the model embeddings. + """ + return ( + self.embedding_ + ) # TBD: return a more Spark-like object, e.g. DenseMatrix? @property - def raw_data(self) -> List[List[float]]: - res = [] - for chunk in self.raw_data_: - res.extend(chunk.value.tolist()) - return res + def rawData(self) -> Union[np.ndarray, scipy.sparse.csr_matrix]: + """ + Returns the raw data used to fit the model. If the input data was sparse, this will be a scipy csr matrix. + """ + return ( + self.raw_data_ + ) # TBD: return a more Spark-like object, e.g. DenseMatrix or SparseMatrix? 
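For context, a short sketch of how the embedding and rawData properties above might be inspected after fitting (the DataFrame and column names are assumptions, not part of this change):

    from spark_rapids_ml.umap import UMAP

    umap = UMAP(n_components=2, metric="euclidean", featuresCol="features")
    model = umap.fit(train_df)     # train_df: an assumed Spark DataFrame

    emb = model.embedding          # np.ndarray, one row per fitted sample
    raw = model.rawData            # np.ndarray, or a scipy csr_matrix after a sparse fit

    embedded = model.transform(train_df)   # appends the "embedding" output column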
def _get_cuml_transform_func( self, dataset: DataFrame, eval_metric_info: Optional[EvalMetricInfo] = None @@ -1154,9 +1299,53 @@ def _get_cuml_transform_func( Optional[_EvaluateFunc], ]: cuml_alg_params = self.cuml_params - driver_embedding = self.embedding_ - driver_raw_data = self.raw_data_ - outputCol = self.getOutputCol() + sparse_fit = self._sparse_fit + n_cols = self.n_cols + + def _chunk_and_broadcast( + sc: pyspark.SparkContext, + arr: np.ndarray, + BROADCAST_LIMIT: int, + ) -> List[pyspark.broadcast.Broadcast]: + """ + Broadcast the input array, chunking it into smaller arrays if it exceeds the broadcast limit. + """ + if arr.nbytes < BROADCAST_LIMIT: + return [sc.broadcast(arr)] + + rows_per_chunk = BROADCAST_LIMIT // (arr.nbytes // arr.shape[0]) + if rows_per_chunk == 0: + raise ValueError( + f"Array cannot be chunked into broadcastable pieces: \ + single row exceeds broadcast limit ({BROADCAST_LIMIT} bytes)" + ) + num_chunks = (arr.shape[0] + rows_per_chunk - 1) // rows_per_chunk + return [ + sc.broadcast(arr[i * rows_per_chunk : (i + 1) * rows_per_chunk]) + for i in range(num_chunks) + ] + + spark = _get_spark_session() + broadcast_embeddings = _chunk_and_broadcast( + spark.sparkContext, self.embedding_, self.BROADCAST_LIMIT + ) + + if isinstance(self.raw_data_, scipy.sparse.csr_matrix): + broadcast_raw_data = { + "indices": _chunk_and_broadcast( + spark.sparkContext, self.raw_data_.indices, self.BROADCAST_LIMIT + ), + "indptr": _chunk_and_broadcast( + spark.sparkContext, self.raw_data_.indptr, self.BROADCAST_LIMIT + ), + "data": _chunk_and_broadcast( + spark.sparkContext, self.raw_data_.data, self.BROADCAST_LIMIT + ), + } # NOTE: CSR chunks are not independently meaningful; do not use until recombined. + else: + broadcast_raw_data = _chunk_and_broadcast( + spark.sparkContext, self.raw_data_, self.BROADCAST_LIMIT + ) # type: ignore def _construct_umap() -> CumlT: import cupy as cp @@ -1166,28 +1355,52 @@ def _construct_umap() -> CumlT: from .utils import cudf_to_cuml_array - nonlocal driver_embedding, driver_raw_data + nonlocal broadcast_embeddings, broadcast_raw_data + assert isinstance(broadcast_embeddings, list) embedding = ( - driver_embedding[0].value - if len(driver_embedding) == 1 - else np.concatenate([chunk.value for chunk in driver_embedding]) - ) - raw_data = ( - driver_raw_data[0].value - if len(driver_raw_data) == 1 - else np.concatenate([chunk.value for chunk in driver_raw_data]) + broadcast_embeddings[0].value + if len(broadcast_embeddings) == 1 + else np.concatenate([chunk.value for chunk in broadcast_embeddings]) ) - del driver_embedding - del driver_raw_data + if sparse_fit: + if not isinstance(broadcast_raw_data, dict): + raise ValueError("Expected raw data as a CSR dict for sparse fit.") + indices = np.concatenate( + [chunk.value for chunk in broadcast_raw_data["indices"]] + ) + indptr = np.concatenate( + [chunk.value for chunk in broadcast_raw_data["indptr"]] + ) + data = np.concatenate( + [chunk.value for chunk in broadcast_raw_data["data"]] + ) + raw_data = scipy.sparse.csr_matrix( + (data, indices, indptr), shape=(len(indptr) - 1, n_cols) + ) + else: + if not isinstance(broadcast_raw_data, list): + raise ValueError( + "Expected raw data as list (of lists) for dense fit." 
+ ) + raw_data = ( + broadcast_raw_data[0].value + if len(broadcast_raw_data) == 1 + else np.concatenate([chunk.value for chunk in broadcast_raw_data]) + ) + + del broadcast_embeddings + del broadcast_raw_data if embedding.dtype != np.float32: embedding = embedding.astype(np.float32) raw_data = raw_data.astype(np.float32) if is_sparse(raw_data): - raw_data_cuml = SparseCumlArray(raw_data, convert_format=False) + raw_data_cuml = SparseCumlArray( + raw_data, + ) else: raw_data_cuml = cudf_to_cuml_array( raw_data, @@ -1197,35 +1410,28 @@ def _construct_umap() -> CumlT: internal_model = CumlUMAP(**cuml_alg_params) internal_model.embedding_ = cp.array(embedding).data internal_model._raw_data = raw_data_cuml + internal_model.sparse_fit = sparse_fit return internal_model def _transform_internal( umap: CumlT, - df: Union[pd.DataFrame, np.ndarray], - ) -> pd.Series: - embedding = umap.transform(df) + df: Union[pd.DataFrame, np.ndarray, scipy.sparse._csr.csr_matrix], + ) -> pd.DataFrame: - is_df_np = isinstance(df, np.ndarray) - is_emb_np = isinstance(embedding, np.ndarray) + embedding = umap.transform(df) # Input is either numpy array or pandas dataframe - input_list = [ - df[i, :] if is_df_np else df.iloc[i, :] for i in range(df.shape[0]) # type: ignore - ] emb_list = [ - embedding[i, :] if is_emb_np else embedding.iloc[i, :] + ( + embedding[i, :] + if isinstance(embedding, np.ndarray) + else embedding.iloc[i, :] + ) for i in range(embedding.shape[0]) ] - result = pd.DataFrame( - { - "features": input_list, - outputCol: emb_list, - } - ) - - return result + return pd.Series(emb_list) return _construct_umap, _transform_internal, None @@ -1233,23 +1439,9 @@ def _require_nccl_ucx(self) -> Tuple[bool, bool]: return (False, False) def _out_schema(self, input_schema: StructType) -> Union[StructType, str]: - return StructType( - [ - StructField("features", ArrayType(FloatType(), False), False), - StructField(self.getOutputCol(), ArrayType(FloatType(), False), False), - ] - ) - - def _get_model_attributes(self) -> Optional[Dict[str, Any]]: - """ - Override parent method to bring broadcast variables to driver before JSON serialization. 
- """ - - self._model_attributes["embedding_"] = [ - chunk.value for chunk in self.embedding_ - ] - self._model_attributes["raw_data_"] = [chunk.value for chunk in self.raw_data_] - return self._model_attributes + assert self.dtype is not None + pyspark_type = dtype_to_pyspark_type(self.dtype) + return f"array<{pyspark_type}>" def write(self) -> MLWriter: return _CumlModelWriterNumpy(self) @@ -1281,14 +1473,16 @@ def saveImpl(self, path: str) -> None: if not os.path.exists(data_path): os.makedirs(data_path) assert model_attributes is not None - for key, value in model_attributes.items(): - if isinstance(value, list) and isinstance(value[0], np.ndarray): - paths = [] - for idx, chunk in enumerate(value): - array_path = os.path.join(data_path, f"{key}_{idx}.npy") - np.save(array_path, chunk) - paths.append(array_path) - model_attributes[key] = paths + + for key in ["embedding_", "raw_data_"]: + array = model_attributes[key] + if isinstance(array, scipy.sparse.csr_matrix): + npz_path = os.path.join(data_path, f"{key}csr_.npz") + scipy.sparse.save_npz(npz_path, array) + else: + npz_path = os.path.join(data_path, f"{key}.npz") + np.savez_compressed(npz_path, array) + model_attributes[key] = npz_path metadata_file_path = os.path.join(data_path, "metadata.json") model_attributes_str = json.dumps(model_attributes) @@ -1310,14 +1504,13 @@ def load(self, path: str) -> "_CumlEstimator": model_attr_str = self.sc.textFile(metadata_file_path).collect()[0] model_attr_dict = json.loads(model_attr_str) - for key, value in model_attr_dict.items(): - if isinstance(value, list) and value[0].endswith(".npy"): - arrays = [] - spark = _get_spark_session() - for array_path in value: - array = np.load(array_path) - arrays.append(spark.sparkContext.broadcast(array)) - model_attr_dict[key] = arrays + for key in ["embedding_", "raw_data_"]: + npz_path = model_attr_dict[key] + if npz_path.endswith("csr_.npz"): + model_attr_dict[key] = scipy.sparse.load_npz(npz_path) + else: + with np.load(npz_path) as data: + model_attr_dict[key] = data["arr_0"] instance = self.model_cls(**model_attr_dict) DefaultParamsReader.getAndSetParams(instance, metadata) diff --git a/python/src/spark_rapids_ml/utils.py b/python/src/spark_rapids_ml/utils.py index 48389a0a..0edcf7fa 100644 --- a/python/src/spark_rapids_ml/utils.py +++ b/python/src/spark_rapids_ml/utils.py @@ -271,8 +271,12 @@ def dtype_to_pyspark_type(dtype: Union[np.dtype, str]) -> str: return "double" elif dtype == np.int32: return "int" + elif dtype == np.int64: + return "long" elif dtype == np.int16: return "short" + elif dtype == np.int64: + return "long" else: raise RuntimeError("Unsupported dtype, found ", dtype) diff --git a/python/tests/test_approximate_nearest_neighbors.py b/python/tests/test_approximate_nearest_neighbors.py index 4cc9b600..387cf4f8 100644 --- a/python/tests/test_approximate_nearest_neighbors.py +++ b/python/tests/test_approximate_nearest_neighbors.py @@ -1,4 +1,5 @@ -from typing import Any, Callable, Dict, Optional, Tuple +import math +from typing import Any, Callable, Dict, List, Optional, Tuple import numpy as np import pandas as pd @@ -25,6 +26,7 @@ ) from .utils import ( array_equal, + assert_params, create_pyspark_dataframe, get_default_cuml_parameters, idfn, @@ -41,17 +43,29 @@ def cal_dist(v1: np.ndarray, v2: np.ndarray, metric: str) -> float: return dist * dist else: return dist + elif metric == "cosine": + v1_l2norm = np.linalg.norm(v1) + v2_l2norm = np.linalg.norm(v2) + if v1_l2norm == 0 or v2_l2norm == 0: + return 0.0 + return 1 - np.dot(v1, 
v2) / (v1_l2norm * v2_l2norm) else: assert False, f"Does not recognize metric '{metric}'" -def test_params() -> None: +@pytest.mark.parametrize("default_params", [True, False]) +def test_params(default_params: bool) -> None: from cuml import NearestNeighbors as CumlNearestNeighbors + spark_params = { + param.name: value + for param, value in ApproximateNearestNeighbors().extractParamMap().items() + } + # obtain n_neighbors, verbose, algorithm, algo_params, metric cuml_params = get_default_cuml_parameters( - [CumlNearestNeighbors], - [ + cuml_classes=[CumlNearestNeighbors], + excludes=[ "handle", "p", "metric_expanded", @@ -60,15 +74,29 @@ def test_params() -> None: ], ) - spark_params = ApproximateNearestNeighbors()._get_cuml_params_default() cuml_params["algorithm"] = "ivfflat" # change cuml default 'auto' to 'ivfflat' - assert cuml_params == spark_params + + # Ensure internal cuml defaults match actual cuml defaults + assert ApproximateNearestNeighbors()._get_cuml_params_default() == cuml_params + + if default_params: + knn = ApproximateNearestNeighbors() + else: + knn = ApproximateNearestNeighbors(k=7) + cuml_params["n_neighbors"] = 7 + spark_params["k"] = 7 + + # Ensure both Spark API params and internal cuml_params are set correctly + assert_params(knn, spark_params, cuml_params) + assert knn.cuml_params == cuml_params # setter/getter from .test_common_estimator import _test_input_setter_getter _test_input_setter_getter(ApproximateNearestNeighbors) + +def test_search_index_params() -> None: # test cagra index params and search params cagra_index_param: Dict[str, Any] = { "intermediate_graph_degree": 80, @@ -123,6 +151,61 @@ def test_example( assert obj._cuml_params["algo_params"] == algoParams +@pytest.mark.slow +def test_empty_dataframe() -> None: + gpu_knn = ApproximateNearestNeighbors() + gpu_knn = gpu_knn.setInputCol("features").setK(1) + with CleanSparkSession() as spark: + schema = f"features array, metadata string" + item_df = spark.createDataFrame([], schema) + gpu_model = gpu_knn.fit(item_df) + + query_df = spark.createDataFrame([], schema="features array") + (_, _, knn_df_empty) = gpu_model.kneighbors(query_df) + knn_df_empty.show() + + +def test_example_cosine() -> None: + gpu_number = 1 + X = [ + (0, (1.0, 0.0)), + (1, (1.0, 1.0)), + (2, (-1.0, 1.0)), + ] + + topk = 2 + metric = "cosine" + algoParams = {"nlist": 1, "nprobe": 1} + + with CleanSparkSession() as spark: + schema = f"id int, features array" + df = spark.createDataFrame(X, schema) + gpu_knn = ApproximateNearestNeighbors( + algorithm="ivfflat", + algoParams=algoParams, + k=topk, + metric=metric, + idCol="id", + inputCol="features", + num_workers=gpu_number, + ) + gpu_model = gpu_knn.fit(df) + _, _, knn_df = gpu_model.kneighbors(df) + knn_collect = knn_df.collect() + + from sklearn.neighbors import NearestNeighbors + + X_features = np.array([row[1] for row in X]) + exact_nn = NearestNeighbors( + algorithm="brute", metric="cosine", n_neighbors=topk + ) + exact_nn.fit(X_features) + distances, indices = exact_nn.kneighbors(X_features) + + assert array_equal([row["distances"] for row in knn_collect], distances) + assert array_equal([row["indices"] for row in knn_collect], indices) + + class ANNEvaluator: """ obtain exact knn distances and indices @@ -183,26 +266,22 @@ def compare_with_cuml_or_cuvs_sg( tolerance: float, ) -> None: # compare with cuml sg ANN on avg_recall and avg_dist_gap - if algorithm in {"ivfflat", "ivfpq"}: - cumlsg_distances, cumlsg_indices = self.get_cuml_sg_results( - algorithm, algoParams - ) - 
else: - assert algorithm == "cagra" - cumlsg_distances, cumlsg_indices = self.get_cuvs_sg_results(algoParams) + cuvssg_distances, cuvssg_indices = self.get_cuvs_sg_results( + algorithm=algorithm, algoParams=algoParams + ) # compare cuml sg with given results - avg_recall_cumlann = self.cal_avg_recall(cumlsg_indices) + avg_recall_cumlann = self.cal_avg_recall(cuvssg_indices) avg_recall = self.cal_avg_recall(given_indices) assert (avg_recall > avg_recall_cumlann) or abs( avg_recall - avg_recall_cumlann - ) < tolerance + ) <= tolerance - avg_dist_gap_cumlann = self.cal_avg_dist_gap(cumlsg_distances) + avg_dist_gap_cumlann = self.cal_avg_dist_gap(cuvssg_distances) avg_dist_gap = self.cal_avg_dist_gap(given_distances) - assert (avg_dist_gap < avg_dist_gap_cumlann) or abs( + assert (avg_dist_gap <= avg_dist_gap_cumlann) or abs( avg_dist_gap - avg_dist_gap_cumlann - ) < tolerance + ) <= tolerance def get_cuml_sg_results( self, @@ -231,25 +310,51 @@ def get_cuml_sg_results( def get_cuvs_sg_results( self, - algoParams: Optional[Dict[str, Any]], + algorithm: str = "cagra", + algoParams: Optional[Dict[str, Any]] = None, ) -> Tuple[np.ndarray, np.ndarray]: - assert self.metric == "sqeuclidean" + if algorithm == "cagra": + assert self.metric == "sqeuclidean" + index_params, search_params = ( + ApproximateNearestNeighborsModel._cal_cagra_params_and_check( + algoParams=algoParams, metric=self.metric, topk=self.n_neighbors + ) + ) + + from cuvs.neighbors import cagra as cuvs_algo + elif algorithm == "ivf_flat" or algorithm == "ivfflat": + + index_params, search_params = ( + ApproximateNearestNeighborsModel._cal_cuvs_ivf_flat_params_and_check( + algoParams=algoParams, metric=self.metric, topk=self.n_neighbors + ) + ) + from cuvs.neighbors import ivf_flat as cuvs_algo + elif algorithm in {"ivf_pq", "ivfpq"}: + index_params, search_params = ( + ApproximateNearestNeighborsModel._cal_cuvs_ivf_pq_params_and_check( + algoParams=algoParams, metric=self.metric, topk=self.n_neighbors + ) + ) + from cuvs.neighbors import ivf_pq as cuvs_algo + else: + assert False, f"unrecognized algorithm {algorithm}" import cupy as cp gpu_X = cp.array(self.X, dtype="float32") - from cuvs.neighbors import cagra - index_params, search_params = ( - ApproximateNearestNeighborsModel._cal_cagra_params_and_check( - algoParams=algoParams, metric=self.metric, topk=self.n_neighbors - ) + index = cuvs_algo.build(cuvs_algo.IndexParams(**index_params), gpu_X) + sg_distances, sg_indices = cuvs_algo.search( + cuvs_algo.SearchParams(**search_params), index, gpu_X, self.n_neighbors ) - index = cagra.build(cagra.IndexParams(**index_params), gpu_X) - sg_distances, sg_indices = cagra.search( - cagra.SearchParams(**search_params), index, gpu_X, self.n_neighbors - ) + if algorithm in {"ivf_pq", "ivfpq"}: + from cuvs.neighbors import refine + + sg_distances, sg_indices = refine( + gpu_X, gpu_X, sg_indices, self.n_neighbors, metric=self.metric + ) # convert results to cp array then to np array sg_distances = cp.array(sg_distances).get() @@ -258,29 +363,23 @@ def get_cuvs_sg_results( return (sg_distances, sg_indices) -@pytest.mark.parametrize( - "combo", - [ - ("ivfflat", "array", 10000, None, "euclidean"), - ("ivfflat", "vector", 2000, {"nlist": 10, "nprobe": 2}, "euclidean"), - ("ivfflat", "multi_cols", 5000, {"nlist": 20, "nprobe": 4}, "euclidean"), - ("ivfflat", "array", 2000, {"nlist": 10, "nprobe": 2}, "sqeuclidean"), - ("ivfflat", "vector", 5000, {"nlist": 20, "nprobe": 4}, "l2"), - ("ivfflat", "multi_cols", 2000, {"nlist": 10, "nprobe": 2}, 
"inner_product"), - ], -) # vector feature type will be converted to float32 to be compatible with cuml single-GPU NearestNeighbors Class -@pytest.mark.parametrize("data_shape", [(10000, 50)], ids=idfn) -@pytest.mark.parametrize("data_type", [np.float32]) -def test_ann_algorithm( +def ann_algorithm_test_func( combo: Tuple[str, str, int, Optional[Dict[str, Any]], str], data_shape: Tuple[int, int], data_type: np.dtype, expected_avg_recall: float = 0.95, + expected_avg_dist_gap: float = 1e-4, distances_are_exact: bool = True, tolerance: float = 1e-4, n_neighbors: int = 50, + cluster_std: float = 1.0, ) -> None: + assert data_type in { + np.float32, + np.float64, + }, "the test function applies to float dataset dtype only, as it scales the dataset by the average norm of rows" + algorithm = combo[0] assert algorithm in {"ivfflat", "ivfpq", "cagra"} @@ -310,6 +409,7 @@ def test_ann_algorithm( n_features=data_shape[1], centers=n_clusters, random_state=0, + cluster_std=cluster_std, ) # make_blobs creates a random dataset of isotropic gaussian blobs. # set average norm sq to be 1 to allow comparisons with default error thresholds @@ -363,12 +463,12 @@ def test_ann_algorithm( # test kneighbors: compare top-1 nn indices(self) and distances(self) - if metric != "inner_product" and distances_are_exact: + if metric != "inner_product": self_index = [knn[0] for knn in indices] assert np.all(self_index == y) self_distance = [dist[0] for dist in distances] - assert self_distance == [0.0] * len(X) + assert array_equal(self_distance, [0.0] * len(X)) # test kneighbors: compare with single-GPU cuml ann_evaluator.compare_with_cuml_or_cuvs_sg( @@ -380,7 +480,7 @@ def test_ann_algorithm( # test kneighbors: compare with sklearn brute NN on avg_recall and avg_dist_gap assert avg_recall >= expected_avg_recall if distances_are_exact: - assert np.all(np.abs(avg_dist_gap) < tolerance) + assert np.all(np.abs(avg_dist_gap) < expected_avg_dist_gap) # test exactNearestNeighborsJoin knnjoin_df = knn_model.approxSimilarityJoin(query_df_withid) @@ -424,8 +524,9 @@ def assert_row_equal(r1: Row, r2: Row) -> None: ) assert len(reconstructed_collect) == len(knn_df_collect) - if algorithm != "ivfpq": + if algorithm != "ivfpq" and not (algorithm == "ivfflat" and algoParams == None): # it is fine to skip ivfpq as long as other algorithms assert the same results of approxSimilarityJoin and kneighbors. + # Also skip ivfflat when algoParams == None. Ivfflat probes only 1/50 of the clusters, leading to unstable results. # ivfpq shows non-deterministic distances due to kmeans initialization uses GPU memory runtime values. 
for i in range(len(reconstructed_collect)): r1 = reconstructed_collect[i] @@ -436,6 +537,76 @@ def assert_row_equal(r1: Row, r2: Row) -> None: assert knn_model._cuml_params["metric"] == metric +@pytest.mark.parametrize( + "combo", + [ + ( + "ivfflat", + "array", + 10000, + None, + "euclidean", + ), + ( + "ivfflat", + "vector", + 2000, + {"nlist": 10, "nprobe": 2}, + "euclidean", + ), + ( + "ivfflat", + "multi_cols", + 5000, + {"nlist": 20, "nprobe": 4}, + "euclidean", + ), + ( + "ivfflat", + "array", + 2000, + {"nlist": 10, "nprobe": 2}, + "sqeuclidean", + ), + ("ivfflat", "vector", 5000, {"nlist": 20, "nprobe": 4}, "l2"), + ( + "ivfflat", + "multi_cols", + 2000, + {"nlist": 10, "nprobe": 2}, + "inner_product", + ), + ( + "ivfflat", + "array", + 2000, + {"nlist": 10, "nprobe": 2}, + "cosine", + ), + ], +) # vector feature type will be converted to float32 to be compatible with cuml single-GPU NearestNeighbors Class +@pytest.mark.parametrize("data_type", [np.float32]) +def test_ivfflat( + combo: Tuple[str, str, int, Optional[Dict[str, Any]], str], + data_type: np.dtype, +) -> None: + algoParams = combo[3] + + # cuvs ivf_flat None sets nlist to 1000 and nprobe to 20, leading to unstable results when run multiple times + expected_avg_recall: float = 0.95 if algoParams != None else 0.5 + expected_avg_dist_gap: float = 1e-4 if algoParams != None else 1e-2 + tolerance: float = 1e-4 if algoParams != None else 1e-2 + data_shape: Tuple[int, int] = (10000, 50) + ann_algorithm_test_func( + combo=combo, + data_shape=data_shape, + data_type=data_type, + expected_avg_recall=expected_avg_recall, + expected_avg_dist_gap=expected_avg_dist_gap, + tolerance=tolerance, + ) + + @pytest.mark.parametrize( "algorithm,feature_type,max_records_per_batch,algo_params,metric", [ @@ -444,11 +615,10 @@ def assert_row_equal(r1: Row, r2: Row) -> None: "array", 10000, { - "nlist": 10, - "nprobe": 2, - "M": 2, + "nlist": 100, + "nprobe": 20, + "M": 20, "n_bits": 4, - "usePrecomputedTables": False, }, "euclidean", ), @@ -457,11 +627,10 @@ def assert_row_equal(r1: Row, r2: Row) -> None: "vector", 200, { - "nlist": 10, - "nprobe": 2, - "M": 4, + "nlist": 100, + "nprobe": 20, + "M": 40, "n_bits": 4, - "usePrecomputedTables": True, }, "sqeuclidean", ), @@ -470,11 +639,10 @@ def assert_row_equal(r1: Row, r2: Row) -> None: "multi_cols", 5000, { - "nlist": 10, - "nprobe": 2, - "M": 1, + "nlist": 100, + "nprobe": 20, + "M": 10, "n_bits": 8, - "usePrecomputedTables": False, }, "l2", ), @@ -483,13 +651,25 @@ def assert_row_equal(r1: Row, r2: Row) -> None: "array", 2000, { - "nlist": 10, - "nprobe": 2, - "M": 2, + "nlist": 100, + "nprobe": 20, + "M": 20, "n_bits": 4, }, "inner_product", ), + ( + "ivfpq", + "array", + 3000, + { + "nlist": 100, + "nprobe": 20, + "M": 20, + "n_bits": 4, + }, + "cosine", + ), ], ) @pytest.mark.parametrize("data_shape", [(10000, 50)], ids=idfn) @@ -504,20 +684,32 @@ def test_ivfpq( data_type: np.dtype, ) -> None: """ - Currently the usePrecomputedTables is not used in cuml C++. + (1) Currently the usePrecomputedTables is not used in cuml C++. + + (2) ivfpq has become unstable in 24.10. It does not get passed with algoParam {"nlist" : 10, "nprobe" : 2, "M": 2, "n_bits": 4} in ci where test_ivfflat is run beforehand. avg_recall shows large variance, depending on the quantization accuracy. This can be fixed by increasing nlist, nprobe, M, and n_bits. Note ivf_pq is non-deterministic, and it seems due to kmeans initialization leveraging runtime values of GPU memory. 
+ + (3) In ivfpq, when the dataset itself is used as queries, it is observed sometimes that the top-1 indice may not be self, and top-1 distance may not be zero. + This is because ivfpq internally uses approximated distance, i.e. the distance of the query vector to the center of quantized item. """ combo = (algorithm, feature_type, max_records_per_batch, algo_params, metric) - expected_avg_recall = 0.1 - distances_are_exact = False - tolerance = 5e-3 # tolerance increased to be more stable due to quantization and randomness in ivfpq + expected_avg_recall = 0.4 + distances_are_exact = True + expected_avg_dist_gap = 0.05 + tolerance = 0.05 # tolerance increased to be more stable due to quantization and randomness in ivfpq, especially when expected_recall is low. - test_ann_algorithm( + cluster_std = ( + 1.0 if metric != "cosine" else 10.0 + ) # Increasing cluster_std for cosine to make dataset more randomized and separable. + + ann_algorithm_test_func( combo=combo, data_shape=data_shape, data_type=data_type, expected_avg_recall=expected_avg_recall, + expected_avg_dist_gap=expected_avg_dist_gap, distances_are_exact=distances_are_exact, tolerance=tolerance, + cluster_std=cluster_std, ) @@ -591,7 +783,7 @@ def test_cagra( distances_are_exact = True tolerance = 2e-3 - test_ann_algorithm( + ann_algorithm_test_func( combo=combo, data_shape=data_shape, data_type=data_type, @@ -602,6 +794,62 @@ def test_cagra( ) +@pytest.mark.parametrize( + "feature_type,data_type", + [ + ("vector", np.float64), + ("multi_cols", np.float64), + ("multi_cols", np.int16), + ("array", np.int64), + ], +) +@pytest.mark.slow +def test_cagra_dtype( + feature_type: str, + data_type: np.dtype, +) -> None: + + algorithm = "cagra" + algo_params = { + "intermediate_graph_degree": 128, + "graph_degree": 64, + "build_algo": "ivf_pq", + } + + gpu_number = 1 + n_neighbors = 2 + metric = "sqeuclidean" + X = np.array( + [ + [10.0, 10.0], + [20.0, 20.0], + [40.0, 40.0], + [50.0, 50.0], + ], + dtype="int32", + ) + X = X.astype(data_type) + y = np.array(range(len(X))) + with CleanSparkSession() as spark: + data_df, features_col, label_col = create_pyspark_dataframe( + spark, feature_type, data_type, X, y + ) + + gpu_knn = ApproximateNearestNeighbors( + num_workers=gpu_number, + inputCol=features_col, + idCol=label_col, + k=n_neighbors, + metric=metric, + algorithm=algorithm, + algoParams=algo_params, + ) + + gpu_model = gpu_knn.fit(data_df) + (_, _, knn_df) = gpu_model.kneighbors(data_df) + knn_df.show() + + @pytest.mark.parametrize( "algorithm,feature_type,max_records_per_batch,algo_params,metric", [ @@ -617,7 +865,6 @@ def test_cagra( ), ], ) -@pytest.mark.parametrize("data_shape", [(10000, 50)], ids=idfn) @pytest.mark.parametrize("data_type", [np.float32]) def test_cagra_params( algorithm: str, @@ -625,12 +872,12 @@ def test_cagra_params( max_records_per_batch: int, algo_params: Dict[str, Any], metric: str, - data_shape: Tuple[int, int], data_type: np.dtype, + caplog: LogCaptureFixture, ) -> None: + data_shape = (1000, 20) itopk_size = 64 if "itopk_size" not in algo_params else algo_params["itopk_size"] - import math internal_topk_size = math.ceil(itopk_size / 32) * 32 n_neighbors = 50 @@ -650,6 +897,23 @@ def test_cagra_params( n_neighbors=n_neighbors, ) + # test intermediate_graph_degree restriction on ivf_pq + algo_params["itopk_size"] = 64 + algo_params["intermediate_graph_degree"] = 257 + error_msg = f"cagra with ivf_pq build_algo expects intermediate_graph_degree (257) to be smaller than 256." 
+ with pytest.raises(Exception): + test_cagra( + algorithm, + feature_type, + max_records_per_batch, + algo_params, + metric, + data_shape, + data_type, + n_neighbors=n_neighbors, + ) + assert error_msg in caplog.text + @pytest.mark.parametrize( "combo", @@ -667,9 +931,126 @@ def test_ivfflat_wide_matrix( data_shape: Tuple[int, int], data_type: np.dtype, ) -> None: + """ + It seems adding a column with df.withColumn can be very slow, if df already has many columns (e.g. 3000). + One strategy is to avoid df.withColumn on wide df and use df.select instead. + """ import time start = time.time() - test_ann_algorithm(combo=combo, data_shape=data_shape, data_type=data_type) + ann_algorithm_test_func(combo=combo, data_shape=data_shape, data_type=data_type) duration_sec = time.time() - start - assert duration_sec < 10 * 60 + assert duration_sec < 3 * 60 + + +@pytest.mark.parametrize( + "algorithm,feature_type", + [ + ( + "ivfpq", + "array", + ), + ( + "ivfflat", + "vector", + ), + ], +) +@pytest.mark.parametrize("data_type", [np.float32]) +def test_return_fewer_k( + algorithm: str, + feature_type: str, + data_type: np.dtype, +) -> None: + """ + This tests the corner case where there are less than k neighbors found due to nprobe too small. + More details can be found at the docstring of class ApproximateNearestNeighbors. + """ + assert algorithm in {"ivfpq", "ivfflat"} + metric = "euclidean" + gpu_number = 1 + k = 4 + algo_params = { + "nlist": k, + "nprobe": 1, + } + + if algorithm == "ivfpq": + algo_params.update({"M": 2, "n_bits": 4}) + + X = np.array( + [ + ( + 0.0, + 0.0, + ), + ( + 0.0, + 0.0, + ), + ( + 2.0, + 2.0, + ), + ( + 2.0, + 2.0, + ), + ] + ) + y = np.arange(len(X)) # use label column as id column + + with CleanSparkSession() as spark: + df, features_col, label_col = create_pyspark_dataframe( + spark, feature_type, data_type, X, y, label_dtype=np.dtype(np.int64) + ) + + est = ApproximateNearestNeighbors( + num_workers=gpu_number, + algorithm=algorithm, + algoParams=algo_params, + metric=metric, + k=k, + inputCol="features", + idCol=label_col, + ) + model = est.fit(df) + _, _, knn_df = model.kneighbors(df) + knn_df_collect = knn_df.collect() + + int64_max = np.iinfo("int64").max + float_inf = float("inf") + + # ensure consistency with cuvs for ivfflat, and ivfpq > 24.10 + import cuvs + from packaging import version + + if algorithm == "ivfflat" or version.parse(cuvs.__version__) > version.parse( + "24.10.00" + ): + ann_evaluator = ANNEvaluator(X, k, metric) + spark_indices = np.array([row["indices"] for row in knn_df_collect]) + spark_distances = np.array([row["distances"] for row in knn_df_collect]) + ann_evaluator.compare_with_cuml_or_cuvs_sg( + algorithm, algo_params, spark_indices, spark_distances, tolerance=0.0 + ) + + # check result details + indices_none_probed = [int64_max, int64_max, int64_max, int64_max] + distances_none_probed = [float_inf, float_inf, float_inf, float_inf] + + def check_row_results( + i: int, indices_if_probed: List[int], distances_if_probed: List[float] + ) -> None: + assert i == 0 or i == 2 + j = i + 1 + assert knn_df_collect[i]["indices"] == knn_df_collect[j]["indices"] + assert knn_df_collect[i]["distances"] == knn_df_collect[j]["distances"] + if knn_df_collect[i]["indices"] == indices_none_probed: + assert knn_df_collect[i]["distances"] == distances_none_probed + else: + assert knn_df_collect[i]["indices"] == indices_if_probed + assert knn_df_collect[i]["distances"] == distances_if_probed + + check_row_results(0, [0, 1, 0, 0], [0.0, 0.0, float_inf, 
float_inf]) + check_row_results(2, [2, 3, 2, 2], [0.0, 0.0, float_inf, float_inf]) diff --git a/python/tests/test_dbscan.py b/python/tests/test_dbscan.py index da008346..fb1c4e12 100644 --- a/python/tests/test_dbscan.py +++ b/python/tests/test_dbscan.py @@ -46,35 +46,61 @@ ) -def test_default_cuml_params() -> None: - from cuml import DBSCAN as CumlDBSCAN - - cuml_params = get_default_cuml_parameters([CumlDBSCAN], ["handle", "output_type"]) - cuml_params["calc_core_sample_indices"] = False - spark_params = DBSCAN()._get_cuml_params_default() - assert cuml_params == spark_params - - -def test_params(gpu_number: int, tmp_path: str, caplog: LogCaptureFixture) -> None: - # Default constructor - default_spark_params: Dict[str, Any] = {} - default_cuml_params = { - "eps": 0.5, - "min_samples": 5, - "metric": "euclidean", - "verbose": False, - "max_mbytes_per_batch": None, - "calc_core_sample_indices": False, +@pytest.mark.parametrize("default_params", [True, False]) +def test_params( + default_params: bool, + tmp_path: str, +) -> None: + from cuml import DBSCAN as cumlDBSCAN + + spark_params = { + param.name: value for param, value in DBSCAN().extractParamMap().items() } - default_dbscan = DBSCAN() - assert_params(default_dbscan, default_spark_params, default_cuml_params) + + cuml_params = get_default_cuml_parameters( + cuml_classes=[cumlDBSCAN], + excludes=[ + "handle", + "output_type", + "calc_core_sample_indices", + ], + ) + + # Ensure internal cuml defaults match actual cuml defaults + assert DBSCAN()._get_cuml_params_default() == cuml_params + + with pytest.raises( + ValueError, match="Unsupported param 'calc_core_sample_indices'" + ): + dbscan_dummy = DBSCAN(calc_core_sample_indices=True) + + if default_params: + dbscan = DBSCAN() + else: + nondefault_params = { + "eps": 0.4, + "metric": "cosine", + "min_samples": 4, + } + dbscan = DBSCAN(**nondefault_params) # type: ignore + cuml_params.update(nondefault_params) + spark_params.update(nondefault_params) + + cuml_params["calc_core_sample_indices"] = ( + False # we override this param to False internally + ) + + # Ensure both Spark API params and internal cuml_params are set correctly + assert_params(dbscan, spark_params, cuml_params) + assert dbscan.cuml_params == cuml_params # Estimator persistence path = tmp_path + "/dbscan_tests" estimator_path = f"{path}/dbscan" - default_dbscan.write().overwrite().save(estimator_path) + dbscan.write().overwrite().save(estimator_path) loaded_dbscan = DBSCAN.load(estimator_path) - assert_params(loaded_dbscan, default_spark_params, default_cuml_params) + assert_params(loaded_dbscan, spark_params, cuml_params) + assert loaded_dbscan.cuml_params == cuml_params # setter/getter from .test_common_estimator import _test_input_setter_getter diff --git a/python/tests/test_kmeans.py b/python/tests/test_kmeans.py index eabb5842..1c7f88a5 100644 --- a/python/tests/test_kmeans.py +++ b/python/tests/test_kmeans.py @@ -63,14 +63,49 @@ def assert_centers_equal( assert a_center == pytest.approx(b_center, tolerance) -def test_params() -> None: +@pytest.mark.parametrize("default_params", [True, False]) +def test_params(default_params: bool) -> None: from cuml import KMeans as CumlKMeans + from pyspark.ml.clustering import KMeans as SparkKMeans + + spark_params = { + param.name: value for param, value in SparkKMeans().extractParamMap().items() + } cuml_params = get_default_cuml_parameters( - [CumlKMeans], ["handle", "output_type", "convert_dtype"] + cuml_classes=[CumlKMeans], excludes=["handle", "output_type", 
"convert_dtype"] ) - spark_params = KMeans()._get_cuml_params_default() - assert cuml_params == spark_params + + # Ensure internal cuml defaults match actual cuml defaults + assert KMeans()._get_cuml_params_default() == cuml_params + + # Our algorithm overrides the following cuml parameters with their spark defaults: + spark_default_overrides = { + "n_clusters": spark_params["k"], + "max_iter": spark_params["maxIter"], + "init": spark_params["initMode"], + } + + cuml_params.update(spark_default_overrides) + + if default_params: + kmeans = KMeans() + seed = kmeans.getSeed() # get the random seed that Spark generates + spark_params["seed"] = seed + cuml_params["random_state"] = seed + else: + kmeans = KMeans( + k=10, + seed=42, + ) + cuml_params["n_clusters"] = 10 + cuml_params["random_state"] = 42 + spark_params["k"] = 10 + spark_params["seed"] = 42 + + # Ensure both Spark API params and internal cuml_params are set correctly + assert_params(kmeans, spark_params, cuml_params) + assert kmeans.cuml_params == cuml_params # setter/getter from .test_common_estimator import _test_input_setter_getter @@ -368,6 +403,7 @@ def test_kmeans_spark_compat( kmeans.setSeed(1) kmeans.setMaxIter(10) + kmeans.setInitMode("k-means||") if isinstance(kmeans, SparkKMeans): kmeans.setWeightCol("weighCol") else: @@ -377,6 +413,7 @@ def test_kmeans_spark_compat( assert kmeans.getMaxIter() == 10 assert kmeans.getK() == 2 assert kmeans.getSeed() == 1 + assert kmeans.getInitMode() == "k-means||" kmeans.clear(kmeans.maxIter) assert kmeans.getMaxIter() == 20 diff --git a/python/tests/test_linear_model.py b/python/tests/test_linear_model.py index 6441798f..5585ddae 100644 --- a/python/tests/test_linear_model.py +++ b/python/tests/test_linear_model.py @@ -97,24 +97,54 @@ def train_with_cuml_linear_regression( return lr -def test_params() -> None: +@pytest.mark.parametrize("default_params", [True, False]) +def test_params(default_params: bool) -> None: from cuml.linear_model.linear_regression import ( LinearRegression as CumlLinearRegression, ) from cuml.linear_model.ridge import Ridge from cuml.solvers import CD + from pyspark.ml.regression import LinearRegression as SparkLinearRegression + + spark_params = { + param.name: value + for param, value in SparkLinearRegression().extractParamMap().items() + } cuml_params = get_default_cuml_parameters( - [CumlLinearRegression, Ridge, CD], ["handle", "output_type"] + cuml_classes=[CumlLinearRegression, Ridge, CD], + excludes=["handle", "output_type"], ) - spark_params = LinearRegression()._get_cuml_params_default() - import cuml - from packaging import version + # Ensure internal cuml defaults match actual cuml defaults + assert cuml_params == LinearRegression()._get_cuml_params_default() + + # Our algorithm overrides the following cuml parameters with their spark defaults: + spark_default_overrides = { + "alpha": spark_params["regParam"], + "l1_ratio": spark_params["elasticNetParam"], + "max_iter": spark_params["maxIter"], + "normalize": spark_params["standardization"], + "tol": spark_params["tol"], + } + + cuml_params.update(spark_default_overrides) - if version.parse(cuml.__version__) < version.parse("23.08.00"): - spark_params.pop("copy_X") - assert cuml_params == spark_params + if default_params: + lr = LinearRegression() + else: + lr = LinearRegression( + regParam=0.001, + maxIter=500, + ) + cuml_params["alpha"] = 0.001 + cuml_params["max_iter"] = 500 + spark_params["regParam"] = 0.001 + spark_params["maxIter"] = 500 + + # Ensure both Spark API params and internal 
cuml_params are set correctly + assert_params(lr, spark_params, cuml_params) + assert cuml_params == lr.cuml_params # setter/getter from .test_common_estimator import _test_input_setter_getter diff --git a/python/tests/test_logistic_regression.py b/python/tests/test_logistic_regression.py index 6c974f1f..8ae9094c 100644 --- a/python/tests/test_logistic_regression.py +++ b/python/tests/test_logistic_regression.py @@ -55,6 +55,7 @@ assert_params, create_pyspark_dataframe, feature_types, + get_default_cuml_parameters, idfn, make_classification_dataset, ) @@ -172,29 +173,51 @@ def assert_transform(model: LogisticRegressionModel) -> None: def test_params(tmp_path: str, caplog: LogCaptureFixture) -> None: + from cuml import LogisticRegression as CumlLogisticRegression + from pyspark.ml.classification import LogisticRegression as SparkLogisticRegression + # Default params: no regularization default_spark_params = { - "maxIter": 100, - "regParam": 0.0, - "elasticNetParam": 0.0, - "tol": 1e-06, - "fitIntercept": True, - "standardization": True, + param.name: value + for param, value in SparkLogisticRegression().extractParamMap().items() } - default_cuml_params: Dict[str, Any] = { - "max_iter": 100, - "penalty": None, - "C": 0.0, - "l1_ratio": 0.0, - "tol": 1e-6, - "fit_intercept": True, - "standardization": True, + default_cuml_params = get_default_cuml_parameters( + cuml_classes=[CumlLogisticRegression], + excludes=[ + "class_weight", + "linesearch_max_iter", + "solver", + "handle", + "output_type", + ], + ) + + default_cuml_params["standardization"] = ( + False # Standardization param exists in LogisticRegressionMG (default = False) but not in SG, and we support it. Add it in manually for this check. + ) + + # Ensure internal cuml defaults match actual cuml defaults + assert default_cuml_params == LogisticRegression()._get_cuml_params_default() + + # Our algorithm overrides the following cuml parameters with their spark defaults: + spark_default_overrides = { + "tol": default_spark_params["tol"], + "max_iter": default_spark_params["maxIter"], + "standardization": default_spark_params["standardization"], + "C": default_spark_params["regParam"], + "l1_ratio": default_spark_params[ + "elasticNetParam" + ], # set to 0.0 when reg_param == 0.0 + "penalty": None, # set to None when reg_param == 0.0 } + default_cuml_params.update(spark_default_overrides) + default_lr = LogisticRegression() assert_params(default_lr, default_spark_params, default_cuml_params) + assert default_lr.cuml_params == default_cuml_params # L2 regularization spark_params: Dict[str, Any] = { @@ -222,6 +245,7 @@ def test_params(tmp_path: str, caplog: LogCaptureFixture) -> None: } ) assert_params(spark_lr, expected_spark_params, expected_cuml_params) + assert spark_lr.cuml_params == expected_cuml_params # L1 regularization spark_params = { @@ -249,6 +273,7 @@ def test_params(tmp_path: str, caplog: LogCaptureFixture) -> None: } ) assert_params(spark_lr, expected_spark_params, expected_cuml_params) + assert spark_lr.cuml_params == expected_cuml_params # elasticnet(L1 + L2) regularization spark_params = { @@ -2186,3 +2211,42 @@ def test_sparse_int64() -> None: total_tol=tolerance, accuracy_and_probability_only=True, ) + + +@pytest.mark.slow +@pytest.mark.parametrize("standardization", [True, False]) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_sparse_all_zeroes( + standardization: bool, + fit_intercept: bool, +) -> None: + tolerance = 0.001 + + with CleanSparkSession() as spark: + data = [ + Row(label=1.0, 
weight=1.0, features=Vectors.sparse(2, {})), + Row(label=1.0, weight=1.0, features=Vectors.sparse(2, {})), + Row(label=0.0, weight=1.0, features=Vectors.sparse(2, {})), + Row(label=0.0, weight=1.0, features=Vectors.sparse(2, {})), + ] + + bdf = spark.createDataFrame(data) + + params: Dict[str, Any] = { + "regParam": 0.1, + "fitIntercept": fit_intercept, + "standardization": standardization, + "featuresCol": "features", + "labelCol": "label", + } + + if version.parse(pyspark.__version__) < version.parse("3.4.0"): + return + + gpu_lr = LogisticRegression(enable_sparse_data_optim=True, **params) + gpu_model = gpu_lr.fit(bdf) + check_sparse_model_preprocess(gpu_model, bdf) + + cpu_lr = SparkLogisticRegression(**params) + cpu_model = cpu_lr.fit(bdf) + compare_model(gpu_model, cpu_model, bdf) diff --git a/python/tests/test_nearest_neighbors.py b/python/tests/test_nearest_neighbors.py index 061f0c61..bf0ec74f 100644 --- a/python/tests/test_nearest_neighbors.py +++ b/python/tests/test_nearest_neighbors.py @@ -4,7 +4,9 @@ import pandas as pd import pytest from _pytest.logging import LogCaptureFixture +from pyspark.ml.linalg import VectorUDT from pyspark.sql import DataFrame +from pyspark.sql.types import LongType, StructField, StructType from sklearn.datasets import make_blobs from spark_rapids_ml.core import alias @@ -18,6 +20,7 @@ from .sparksession import CleanSparkSession from .utils import ( array_equal, + assert_params, create_pyspark_dataframe, get_default_cuml_parameters, idfn, @@ -28,15 +31,21 @@ NNModel = Union[NearestNeighborsModel, ApproximateNearestNeighborsModel] -def test_params(caplog: LogCaptureFixture) -> None: +@pytest.mark.parametrize("default_params", [True, False]) +def test_params(default_params: bool, caplog: LogCaptureFixture) -> None: from cuml import NearestNeighbors as CumlNearestNeighbors from cuml.neighbors.nearest_neighbors_mg import ( NearestNeighborsMG, # to include the batch_size parameter that exists in the MG class ) + spark_params = { + param.name: value + for param, value in NearestNeighbors().extractParamMap().items() + } + cuml_params = get_default_cuml_parameters( - [CumlNearestNeighbors, NearestNeighborsMG], - [ + cuml_classes=[CumlNearestNeighbors, NearestNeighborsMG], + excludes=[ "handle", "algorithm", "metric", @@ -47,8 +56,18 @@ def test_params(caplog: LogCaptureFixture) -> None: "output_type", ], ) - spark_params = NearestNeighbors()._get_cuml_params_default() - assert cuml_params == spark_params + assert cuml_params == NearestNeighbors()._get_cuml_params_default() + + if default_params: + knn = NearestNeighbors() + else: + knn = NearestNeighbors(k=7) + cuml_params["n_neighbors"] = 7 + spark_params["k"] = 7 + + # Ensure both Spark API params and internal cuml_params are set correctly + assert_params(knn, spark_params, cuml_params) + assert knn.cuml_params == cuml_params # float32_inputs warn, NearestNeighbors only accepts float32 nn_float32 = NearestNeighbors(float32_inputs=False) @@ -107,9 +126,16 @@ def func_test_example_no_id( with pytest.raises(NotImplementedError): gpu_model.save(tmp_path + "/knn_model") + # test kneighbors on empty query dataframe + df_empty = spark.createDataFrame([], schema="features array") + (_, _, knn_df_empty) = gpu_model.kneighbors(df_empty) + knn_df_empty.show() + + # test kneighbors on normal query dataframe (item_df_withid, query_df_withid, knn_df) = gpu_model.kneighbors(query_df) item_df_withid.show() query_df_withid.show() + knn_df = knn_df.cache() knn_df.show() # check knn results @@ -161,6 +187,7 @@ def 
assert_indices_equal(indices: List[List[int]]) -> None: else: knnjoin_df = gpu_model.approxSimilarityJoin(query_df, distCol="distCol") + knnjoin_df = knnjoin_df.cache() knnjoin_df.show() assert len(knnjoin_df.dtypes) == 3 @@ -250,6 +277,9 @@ def assert_knn_metadata_equal(knn_metadata: List[List[str]]) -> None: (_, _, knn_df_v2) = gpu_model_v2.kneighbors(query_df) assert knn_df_v2.collect() == knn_df.collect() + knn_df.unpersist() + knnjoin_df.unpersist() + return gpu_knn, gpu_model @@ -296,9 +326,18 @@ def func_test_example_with_id( gpu_knn = gpu_knn.setK(topk) gpu_model = gpu_knn.fit(data_df) + + # test kneighbors on empty query dataframe with id column + df_empty = spark.createDataFrame([], schema="id long, features array") + (_, _, knn_df_empty) = gpu_model.kneighbors(df_empty) + knn_df_empty.show() + + # test kneighbors on normal query dataframe item_df_withid, query_df_withid, knn_df = gpu_model.kneighbors(query_df) item_df_withid.show() query_df_withid.show() + + knn_df = knn_df.cache() knn_df.show() distances_df = knn_df.select("distances") @@ -322,6 +361,7 @@ def assert_indices_equal(indices: List[List[int]]) -> None: else: knnjoin_df = gpu_model.approxSimilarityJoin(query_df, distCol="distCol") + knnjoin_df = knnjoin_df.cache() knnjoin_df.show() assert len(knnjoin_df.dtypes) == 3 @@ -345,6 +385,8 @@ def assert_indices_equal(indices: List[List[int]]) -> None: reconstructed_query_ids = [r.query_id for r in reconstructed_rows] assert reconstructed_query_ids == [201, 202, 203, 204, 205] + knn_df.unpersist() + knnjoin_df.unpersist() return (gpu_knn, gpu_model) diff --git a/python/tests/test_pca.py b/python/tests/test_pca.py index 5964b972..46ec0f2d 100644 --- a/python/tests/test_pca.py +++ b/python/tests/test_pca.py @@ -52,12 +52,20 @@ PCAModelType = TypeVar("PCAModelType", Type[SparkPCAModel], Type[PCAModel]) -def test_params(caplog: LogCaptureFixture) -> None: +@pytest.mark.parametrize("default_params", [True, False]) +def test_params(default_params: bool, caplog: LogCaptureFixture) -> None: from cuml import PCA as CumlPCA + from pyspark.ml.feature import PCA as SparkPCA + + spark_params = { + param.name: value for param, value in SparkPCA().extractParamMap().items() + } + # Ignore output col, as it is linked to the object id by default (e.g., 'PCA_ac9c581af6b3__output') + spark_params.pop("outputCol", None) cuml_params = get_default_cuml_parameters( - [CumlPCA], - [ + cuml_classes=[CumlPCA], + excludes=[ "copy", "handle", "iterated_power", @@ -66,8 +74,20 @@ def test_params(caplog: LogCaptureFixture) -> None: "tol", ], ) - spark_params = PCA()._get_cuml_params_default() - assert cuml_params == spark_params + + # Ensure internal cuml defaults match actual cuml defaults + assert cuml_params == PCA()._get_cuml_params_default() + + if default_params: + pca = PCA() + else: + pca = PCA(k=4) + cuml_params["n_components"] = 4 + spark_params["k"] = 4 + + # Ensure both Spark API params and internal cuml_params are set correctly + assert_params(pca, spark_params, cuml_params) + assert cuml_params == pca.cuml_params # make sure no warning when enabling float64 inputs pca_float32 = PCA(float32_inputs=False) diff --git a/python/tests/test_random_forest.py b/python/tests/test_random_forest.py index 1e294249..92bc434f 100644 --- a/python/tests/test_random_forest.py +++ b/python/tests/test_random_forest.py @@ -104,12 +104,28 @@ @pytest.mark.parametrize("Estimator", [RandomForestClassifier, RandomForestRegressor]) -def test_params(Estimator: RandomForest) -> None: 
+@pytest.mark.parametrize("default_params", [True, False]) +def test_params(default_params: bool, Estimator: RandomForest) -> None: from cuml.ensemble.randomforest_common import BaseRandomForestModel + from pyspark.ml.classification import ( + RandomForestClassificationModel as SparkRandomForestClassifier, + ) + from pyspark.ml.regression import ( + RandomForestRegressionModel as SparkRandomForestRegressor, + ) + + SparkEstimator = ( + SparkRandomForestClassifier + if Estimator == RandomForestClassifier + else SparkRandomForestRegressor + ) + spark_params = { + param.name: value for param, value in SparkEstimator().extractParamMap().items() + } cuml_params = get_default_cuml_parameters( - [BaseRandomForestModel], - [ + cuml_classes=[BaseRandomForestModel], + excludes=[ "handle", "output_type", "accuracy_metric", @@ -124,8 +140,42 @@ def test_params(Estimator: RandomForest) -> None: "class_weight", ], ) - spark_params = Estimator()._get_cuml_params_default() - assert cuml_params == spark_params + + # Ensure internal cuml defaults match actual cuml defaults + assert cuml_params == Estimator()._get_cuml_params_default() + + # Our algorithm overrides the following cuml parameters with their spark defaults: + spark_default_overrides = { + "n_streams": 1, + "n_estimators": spark_params["numTrees"], + "max_depth": spark_params["maxDepth"], + "n_bins": spark_params["maxBins"], + "max_features": spark_params["featureSubsetStrategy"], + "split_criterion": {"gini": "gini", "variance": "mse"}.get( + spark_params["impurity"] + ), + } + + cuml_params.update(spark_default_overrides) + + if default_params: + est = Estimator() + seed = est.getSeed() + cuml_params["random_state"] = seed + spark_params["seed"] = seed + else: + est = Estimator( + maxDepth=7, + seed=42, + ) + cuml_params["max_depth"] = 7 + cuml_params["random_state"] = 42 + spark_params["maxDepth"] = 7 + spark_params["seed"] = 42 + + # Ensure both Spark API params and internal cuml_params are set correctly + assert_params(est, spark_params, cuml_params) + assert est.cuml_params == cuml_params # setter/getter from .test_common_estimator import _test_input_setter_getter @@ -413,12 +463,21 @@ def test_random_forest_classifier( labelCol=spark_rf_model.getLabelCol(), ) - spark_cuml_f1_score = spark_rf_model._transformEvaluate(test_df, evaluator) + y_test_fewer_classes = np.maximum(y_test - 1, 0) + + test_df_fewer_classes, _, _ = create_pyspark_dataframe( + spark, feature_type, data_type, X_test, y_test_fewer_classes + ) + + for _test_df in [test_df, test_df_fewer_classes]: + spark_cuml_f1_score = spark_rf_model._transformEvaluate( + _test_df, evaluator + ) - transformed_df = spark_rf_model.transform(test_df) - pyspark_f1_score = evaluator.evaluate(transformed_df) + transformed_df = spark_rf_model.transform(_test_df) + pyspark_f1_score = evaluator.evaluate(transformed_df) - assert math.fabs(pyspark_f1_score - spark_cuml_f1_score[0]) < 1e-6 + assert math.fabs(pyspark_f1_score - spark_cuml_f1_score[0]) < 1e-6 @pytest.mark.parametrize("feature_type", pyspark_supported_feature_types) diff --git a/python/tests/test_ucx.py b/python/tests/test_ucx.py index 79431372..b8abfd2b 100644 --- a/python/tests/test_ucx.py +++ b/python/tests/test_ucx.py @@ -71,6 +71,13 @@ def _train_udf(pdf_iter: Iterator[pd.DataFrame]) -> pd.DataFrame: enable=True, require_ucx=True, ) as cc: + # pyspark uses sighup to kill python workers gracefully, and for some reason + # the signal handler for sighup needs to be explicitly reset at this point + # to avoid having SIGHUP be 
swallowed during a usleep call in the nccl library. + # this helps avoid zombie surviving python workers when some workers fail. + import signal + + signal.signal(signal.SIGHUP, signal.SIG_DFL) async def do_allGather() -> List[str]: loop = asyncio.get_running_loop() diff --git a/python/tests/test_umap.py b/python/tests/test_umap.py index 2af2700b..2e93d8ab 100644 --- a/python/tests/test_umap.py +++ b/python/tests/test_umap.py @@ -15,14 +15,16 @@ # import math -from typing import List, Tuple, Union +from typing import Any, Dict, List, Tuple, Union import cupy as cp import numpy as np import pytest from _pytest.logging import LogCaptureFixture from cuml.metrics import trustworthiness +from pyspark.ml.linalg import SparseVector from pyspark.sql.functions import array +from scipy.sparse import csr_matrix from sklearn.datasets import load_digits, load_iris from spark_rapids_ml.umap import UMAP, UMAPModel @@ -32,10 +34,62 @@ assert_params, create_pyspark_dataframe, cuml_supported_data_types, + get_default_cuml_parameters, pyspark_supported_feature_types, ) +def _load_sparse_binary_data( + n_rows: int, n_cols: int, nnz: int +) -> Tuple[List[Tuple[SparseVector]], csr_matrix]: + # TODO: Replace this function by adding to SparseDataGen + # Generate binary sparse data compatible with Jaccard, with nnz non-zero values per row. + data = [] + for i in range(n_rows): + indices = [(i + j) % n_cols for j in range(nnz)] + values = [1] * nnz + sparse_vector = SparseVector(n_cols, dict(zip(indices, values))) + data.append((sparse_vector,)) + + csr_data: List[float] = [] + csr_indices: List[int] = [] + csr_indptr: List[int] = [0] + for row in data: + sparse_vector = row[0] + csr_data.extend(sparse_vector.values) + csr_indices.extend(sparse_vector.indices) + csr_indptr.append(csr_indptr[-1] + len(sparse_vector.indices)) + csr_mat = csr_matrix((csr_data, csr_indices, csr_indptr), shape=(n_rows, n_cols)) + + return data, csr_mat + + +def _assert_umap_model( + model: UMAPModel, input_raw_data: Union[np.ndarray, csr_matrix] +) -> None: + embedding = model.embedding + raw_data = model.rawData + assert embedding.shape == ( + input_raw_data.shape[0], + model.cuml_params["n_components"], + ) + assert raw_data.shape == input_raw_data.shape + if isinstance(input_raw_data, csr_matrix): + assert isinstance(raw_data, csr_matrix) + assert model._sparse_fit + assert (raw_data != input_raw_data).nnz == 0 + assert ( + np.all(raw_data.indices == input_raw_data.indices) + and np.all(raw_data.indptr == input_raw_data.indptr) + and np.allclose(raw_data.data, input_raw_data.data) + ) + else: + assert not model._sparse_fit + assert np.array_equal(raw_data, input_raw_data) + assert model.dtype == "float32" + assert model.n_cols == input_raw_data.shape[1] + + def _load_dataset(dataset: str, n_rows: int) -> Tuple[np.ndarray, np.ndarray]: if dataset == "digits": local_X, local_y = load_digits(return_X_y=True) @@ -56,18 +110,29 @@ def _load_dataset(dataset: str, n_rows: int) -> Tuple[np.ndarray, np.ndarray]: def _local_umap_trustworthiness( - local_X: np.ndarray, + local_X: Union[np.ndarray, csr_matrix], local_y: np.ndarray, n_neighbors: int, supervised: bool, + sparse: bool = False, ) -> float: from cuml.manifold import UMAP - local_model = UMAP(n_neighbors=n_neighbors, random_state=42, init="random") + if sparse: + local_model = UMAP( + n_neighbors=n_neighbors, random_state=42, init="random", metric="jaccard" + ) + else: + local_model = UMAP(n_neighbors=n_neighbors, random_state=42, init="random") + y_train = local_y if supervised else 
None local_model.fit(local_X, y=y_train) embedding = local_model.transform(local_X) + if sparse: + assert isinstance(local_X, csr_matrix) + local_X = local_X.toarray() + return trustworthiness(local_X, embedding, n_neighbors=n_neighbors, batch_size=5000) @@ -78,7 +143,6 @@ def _spark_umap_trustworthiness( supervised: bool, n_parts: int, gpu_number: int, - sampling_ratio: float, dtype: np.dtype, feature_type: str, ) -> float: @@ -91,22 +155,32 @@ def _spark_umap_trustworthiness( with CleanSparkSession() as spark: if supervised: - data_df, features_col, label_col = create_pyspark_dataframe( + data_df, feature_cols, label_col = create_pyspark_dataframe( spark, feature_type, dtype, local_X, local_y ) assert label_col is not None umap_estimator.setLabelCol(label_col) else: - data_df, features_col, _ = create_pyspark_dataframe( + data_df, feature_cols, _ = create_pyspark_dataframe( spark, feature_type, dtype, local_X, None ) data_df = data_df.repartition(n_parts) - umap_estimator.setFeaturesCol(features_col).setSampleFraction(sampling_ratio) + if isinstance(feature_cols, list): + umap_estimator.setFeaturesCols(feature_cols) + else: + umap_estimator.setFeaturesCol(feature_cols) + umap_model = umap_estimator.fit(data_df) pdf = umap_model.transform(data_df).toPandas() + embedding = cp.asarray(pdf["embedding"].to_list()).astype(cp.float32) - input = cp.asarray(pdf["features"].to_list()).astype(cp.float32) + if isinstance(feature_cols, list): + input = pdf[feature_cols].to_numpy() + else: + input = pdf[feature_cols].to_list() + + input = cp.asarray(input).astype(cp.float32) return trustworthiness(input, embedding, n_neighbors=n_neighbors, batch_size=5000) @@ -115,7 +189,6 @@ def _run_spark_test( n_parts: int, gpu_number: int, n_rows: int, - sampling_ratio: float, supervised: bool, dataset: str, n_neighbors: int, @@ -131,15 +204,14 @@ def _run_spark_test( supervised, n_parts, gpu_number, - sampling_ratio, dtype, feature_type, ) loc_umap = _local_umap_trustworthiness(local_X, local_y, n_neighbors, supervised) - print("Local UMAP trustworthiness score : {:.2f}".format(loc_umap)) - print("Spark UMAP trustworthiness score : {:.2f}".format(dist_umap)) + print("Local UMAP trustworthiness score : {:.4f}".format(loc_umap)) + print("Spark UMAP trustworthiness score : {:.4f}".format(dist_umap)) trust_diff = loc_umap - dist_umap @@ -148,7 +220,6 @@ def _run_spark_test( @pytest.mark.parametrize("n_parts", [2, 9]) @pytest.mark.parametrize("n_rows", [100, 500]) -@pytest.mark.parametrize("sampling_ratio", [0.55, 0.9]) @pytest.mark.parametrize("supervised", [True, False]) @pytest.mark.parametrize("dataset", ["digits", "iris"]) @pytest.mark.parametrize("n_neighbors", [10]) @@ -159,7 +230,6 @@ def test_spark_umap( n_parts: int, gpu_number: int, n_rows: int, - sampling_ratio: float, supervised: bool, dataset: str, n_neighbors: int, @@ -170,7 +240,6 @@ def test_spark_umap( n_parts, gpu_number, n_rows, - sampling_ratio, supervised, dataset, n_neighbors, @@ -183,7 +252,6 @@ def test_spark_umap( n_parts, gpu_number, n_rows, - sampling_ratio, supervised, dataset, n_neighbors, @@ -196,7 +264,6 @@ def test_spark_umap( @pytest.mark.parametrize("n_parts", [5]) @pytest.mark.parametrize("n_rows", [500]) -@pytest.mark.parametrize("sampling_ratio", [0.7]) @pytest.mark.parametrize("supervised", [True]) @pytest.mark.parametrize("dataset", ["digits"]) @pytest.mark.parametrize("n_neighbors", [10]) @@ -206,7 +273,6 @@ def test_spark_umap_fast( n_parts: int, gpu_number: int, n_rows: int, - sampling_ratio: float, supervised: bool, dataset: 
str, n_neighbors: int, @@ -218,7 +284,6 @@ def test_spark_umap_fast( n_parts, gpu_number, n_rows, - sampling_ratio, supervised, dataset, n_neighbors, @@ -231,7 +296,6 @@ def test_spark_umap_fast( n_parts, gpu_number, n_rows, - sampling_ratio, supervised, dataset, n_neighbors, @@ -250,37 +314,53 @@ def test_spark_umap_fast( assert umap_float32._float32_inputs -def test_params(tmp_path: str) -> None: - # Default constructor - default_cuml_params = { - "n_neighbors": 15, - "n_components": 2, - "metric": "euclidean", - "n_epochs": None, - "learning_rate": 1.0, - "init": "spectral", - "min_dist": 0.1, - "spread": 1.0, - "set_op_mix_ratio": 1.0, - "local_connectivity": 1.0, - "repulsion_strength": 1.0, - "negative_sample_rate": 5, - "transform_queue_size": 4.0, - "a": None, - "b": None, - "precomputed_knn": None, - "random_state": None, - "verbose": False, +@pytest.mark.parametrize("default_params", [True, False]) +def test_params(tmp_path: str, default_params: bool) -> None: + from cuml import UMAP as cumlUMAP + + spark_params = { + param.name: value for param, value in UMAP().extractParamMap().items() } - default_umap = UMAP() - assert_params(default_umap, {}, default_cuml_params) + + cuml_params = get_default_cuml_parameters( + cuml_classes=[cumlUMAP], + excludes=[ + "callback", + "handle", + "hash_input", + "output_type", + "target_metric", + "target_n_neighbors", + "target_weight", + ], + ) + + # Ensure internal cuml defaults match actual cuml defaults + assert UMAP()._get_cuml_params_default() == cuml_params + + if default_params: + umap = UMAP() + else: + nondefault_params = { + "n_neighbors": 12, + "learning_rate": 0.9, + "random_state": 42, + } + umap = UMAP(**nondefault_params) # type: ignore + cuml_params.update(nondefault_params) + spark_params.update(nondefault_params) + + # Ensure both Spark API params and internal cuml_params are set correctly + assert_params(umap, spark_params, cuml_params) + assert umap.cuml_params == cuml_params # Estimator persistence path = tmp_path + "/umap_tests" estimator_path = f"{path}/umap" - default_umap.write().overwrite().save(estimator_path) + umap.write().overwrite().save(estimator_path) loaded_umap = UMAP.load(estimator_path) - assert_params(loaded_umap, {}, default_cuml_params) + assert_params(loaded_umap, spark_params, cuml_params) + assert umap.cuml_params == cuml_params assert loaded_umap._float32_inputs # setter/getter @@ -289,13 +369,141 @@ def test_params(tmp_path: str) -> None: _test_input_setter_getter(UMAP) -def test_umap_model_persistence(gpu_number: int, tmp_path: str) -> None: +@pytest.mark.parametrize("sparse_fit", [True, False]) +def test_umap_model_persistence( + sparse_fit: bool, gpu_number: int, tmp_path: str +) -> None: + import pyspark + from cuml.datasets import make_blobs + from packaging import version + + with CleanSparkSession() as spark: + + n_rows = 5000 + n_cols = 200 + + if sparse_fit: + if version.parse(pyspark.__version__) < version.parse("3.4.0"): + import logging + + err_msg = "pyspark < 3.4 is detected. Cannot import pyspark `unwrap_udt` function for SparseVector. " + "The test case will be skipped. Please install pyspark>=3.4." 
+ logging.info(err_msg) + return + + data, input_raw_data = _load_sparse_binary_data(n_rows, n_cols, 30) + df = spark.createDataFrame(data, ["features"]) + else: + X, _ = make_blobs( + n_rows, + n_cols, + centers=5, + cluster_std=0.1, + dtype=np.float32, + random_state=10, + ) + pyspark_type = "float" + feature_cols = [f"c{i}" for i in range(X.shape[1])] + schema = [f"{c} {pyspark_type}" for c in feature_cols] + df = spark.createDataFrame(X.tolist(), ",".join(schema)) + df = df.withColumn("features", array(*feature_cols)).drop(*feature_cols) + input_raw_data = X.get() + + umap = UMAP(num_workers=gpu_number).setFeaturesCol("features") + + umap_model = umap.fit(df) + _assert_umap_model(umap_model, input_raw_data) + + # Model persistence + path = tmp_path + "/umap_tests" + model_path = f"{path}/umap_model" + umap_model.write().overwrite().save(model_path) + umap_model_loaded = UMAPModel.load(model_path) + _assert_umap_model(umap_model_loaded, input_raw_data) + + +@pytest.mark.parametrize("maxRecordsPerBatch", ["2000"]) +@pytest.mark.parametrize("BROADCAST_LIMIT", [8 << 15]) +@pytest.mark.parametrize("sparse_fit", [True, False]) +def test_umap_chunking( + gpu_number: int, maxRecordsPerBatch: str, BROADCAST_LIMIT: int, sparse_fit: bool +) -> None: from cuml.datasets import make_blobs + n_rows = int(int(maxRecordsPerBatch) * 2.5) + n_cols = 3000 + + with CleanSparkSession() as spark: + spark.conf.set( + "spark.sql.execution.arrow.maxRecordsPerBatch", maxRecordsPerBatch + ) + + if sparse_fit: + import pyspark + from packaging import version + + if version.parse(pyspark.__version__) < version.parse("3.4.0"): + import logging + + err_msg = "pyspark < 3.4 is detected. Cannot import pyspark `unwrap_udt` function for SparseVector. " + "The test case will be skipped. Please install pyspark>=3.4." 
+ logging.info(err_msg) + return + + data, input_raw_data = _load_sparse_binary_data(n_rows, n_cols, 30) + df = spark.createDataFrame(data, ["features"]) + nbytes = input_raw_data.data.nbytes + else: + X, _ = make_blobs( + n_rows, + n_cols, + centers=5, + cluster_std=0.1, + dtype=np.float32, + random_state=10, + ) + pyspark_type = "float" + feature_cols = [f"c{i}" for i in range(X.shape[1])] + schema = [f"{c} {pyspark_type}" for c in feature_cols] + df = spark.createDataFrame(X.tolist(), ",".join(schema)) + df = df.withColumn("features", array(*feature_cols)).drop(*feature_cols) + input_raw_data = X.get() + nbytes = input_raw_data.nbytes + + umap = UMAP(num_workers=gpu_number).setFeaturesCol("features") + + assert umap.max_records_per_batch == int(maxRecordsPerBatch) + assert nbytes > BROADCAST_LIMIT + + umap_model = umap.fit(df) + umap_model.BROADCAST_LIMIT = BROADCAST_LIMIT + + _assert_umap_model(umap_model, input_raw_data) + + pdf = umap_model.transform(df).toPandas() + embedding = np.vstack(pdf["embedding"]).astype(np.float32) + input = np.vstack(pdf["features"]).astype(np.float32) + + dist_umap = trustworthiness(input, embedding, n_neighbors=15, batch_size=5000) + loc_umap = _local_umap_trustworthiness( + input_raw_data, np.zeros(0), 15, False, sparse_fit + ) + trust_diff = loc_umap - dist_umap + + assert trust_diff <= 0.15 + + +def test_umap_sample_fraction(gpu_number: int) -> None: + from cuml.datasets import make_blobs + + n_rows = 5000 + sample_fraction = 0.5 + random_state = 42 + X, _ = make_blobs( - 100, - 20, - centers=42, + n_rows, + 10, + centers=5, cluster_std=0.1, dtype=np.float32, random_state=10, @@ -305,39 +513,39 @@ def test_umap_model_persistence(gpu_number: int, tmp_path: str) -> None: pyspark_type = "float" feature_cols = [f"c{i}" for i in range(X.shape[1])] schema = [f"{c} {pyspark_type}" for c in feature_cols] - df = spark.createDataFrame(X.tolist(), ",".join(schema)) + df = spark.createDataFrame(X.tolist(), ",".join(schema)).coalesce(1) df = df.withColumn("features", array(*feature_cols)).drop(*feature_cols) - umap = UMAP(num_workers=gpu_number).setFeaturesCol("features") - - def assert_umap_model(model: UMAPModel) -> None: - embedding = np.array(model.embedding) - raw_data = np.array(model.raw_data) - assert embedding.shape == (100, 2) - assert raw_data.shape == (100, 20) - assert np.array_equal(raw_data, X.get()) - assert model.dtype == "float32" - assert model.n_cols == X.shape[1] + umap = ( + UMAP(num_workers=gpu_number, random_state=random_state) + .setFeaturesCol("features") + .setSampleFraction(sample_fraction) + ) + assert umap.getSampleFraction() == sample_fraction + assert umap.getRandomState() == random_state umap_model = umap.fit(df) - assert_umap_model(model=umap_model) - # Model persistence - path = tmp_path + "/umap_tests" - model_path = f"{path}/umap_model" - umap_model.write().overwrite().save(model_path) - umap_model_loaded = UMAPModel.load(model_path) - assert_umap_model(model=umap_model_loaded) + threshold = 2 * np.sqrt( + n_rows * sample_fraction * (1 - sample_fraction) + ) # 2 std devs + + embedding = umap_model.embedding + raw_data = umap_model.rawData + assert np.abs(n_rows * sample_fraction - embedding.shape[0]) <= threshold + assert np.abs(n_rows * sample_fraction - raw_data.shape[0]) <= threshold -@pytest.mark.parametrize("BROADCAST_LIMIT", [8 << 20, 8 << 18]) -def test_umap_broadcast_chunks(gpu_number: int, BROADCAST_LIMIT: int) -> None: +def test_umap_build_algo(gpu_number: int) -> None: from cuml.datasets import make_blobs + n_rows = 
10000 + random_state = 42 + X, _ = make_blobs( - 5000, - 3000, - centers=42, + n_rows, + 10, + centers=5, cluster_std=0.1, dtype=np.float32, random_state=10, @@ -347,31 +555,82 @@ def test_umap_broadcast_chunks(gpu_number: int, BROADCAST_LIMIT: int) -> None: pyspark_type = "float" feature_cols = [f"c{i}" for i in range(X.shape[1])] schema = [f"{c} {pyspark_type}" for c in feature_cols] - df = spark.createDataFrame(X.tolist(), ",".join(schema)) + df = spark.createDataFrame(X.tolist(), ",".join(schema)).coalesce(1) df = df.withColumn("features", array(*feature_cols)).drop(*feature_cols) - umap = UMAP(num_workers=gpu_number).setFeaturesCol("features") - umap.BROADCAST_LIMIT = BROADCAST_LIMIT + build_algo = "nn_descent" + build_kwds = { + "nnd_graph_degree": 64, + "nnd_intermediate_graph_degree": 128, + "nnd_max_iterations": 40, + "nnd_termination_threshold": 0.0001, + "nnd_return_distances": True, + "nnd_n_clusters": 5, + } + + umap = UMAP( + num_workers=gpu_number, + random_state=random_state, + build_algo=build_algo, + build_kwds=build_kwds, + ).setFeaturesCol("features") umap_model = umap.fit(df) - def assert_umap_model(model: UMAPModel) -> None: - embedding = np.array(model.embedding) - raw_data = np.array(model.raw_data) - assert embedding.shape == (5000, 2) - assert raw_data.shape == (5000, 3000) - assert np.array_equal(raw_data, X.get()) - assert model.dtype == "float32" - assert model.n_cols == X.shape[1] - - assert_umap_model(model=umap_model) + _assert_umap_model(umap_model, X.get()) pdf = umap_model.transform(df).toPandas() embedding = cp.asarray(pdf["embedding"].to_list()).astype(cp.float32) input = cp.asarray(pdf["features"].to_list()).astype(cp.float32) - dist_umap = trustworthiness(input, embedding, n_neighbors=15, batch_size=5000) + dist_umap = trustworthiness(input, embedding, n_neighbors=15, batch_size=10000) loc_umap = _local_umap_trustworthiness(X, np.zeros(0), 15, False) trust_diff = loc_umap - dist_umap assert trust_diff <= 0.15 + + +@pytest.mark.parametrize("n_rows", [3000]) +@pytest.mark.parametrize("n_cols", [64]) +@pytest.mark.parametrize("nnz", [12]) +@pytest.mark.parametrize("metric", ["jaccard", "hamming", "correlation", "cosine"]) +def test_umap_sparse_vector( + n_rows: int, n_cols: int, nnz: int, metric: str, gpu_number: int, tmp_path: str +) -> None: + import pyspark + from cuml.manifold import UMAP as cumlUMAP + from packaging import version + + if version.parse(pyspark.__version__) < version.parse("3.4.0"): + import logging + + err_msg = "pyspark < 3.4 is detected. Cannot import pyspark `unwrap_udt` function for SparseVector. " + "The test case will be skipped. Please install pyspark>=3.4." 
+ logging.info(err_msg) + return + + with CleanSparkSession() as spark: + data, input_raw_data = _load_sparse_binary_data(n_rows, n_cols, nnz) + df = spark.createDataFrame(data, ["features"]) + + umap_estimator = UMAP( + metric=metric, num_workers=gpu_number, random_state=42 + ).setFeaturesCol("features") + umap_model = umap_estimator.fit(df) + embedding = umap_model.embedding + + # Ensure internal and input CSR data match + _assert_umap_model(umap_model, input_raw_data) + + # Local vs dist trustworthiness check + output = umap_model.transform(df).toPandas() + embedding = cp.asarray(output["embedding"].to_list()) + dist_umap = trustworthiness(input_raw_data.toarray(), embedding, n_neighbors=15) + + local_model = cumlUMAP(n_neighbors=15, random_state=42, metric=metric) + local_model.fit(input_raw_data) + embedding = local_model.transform(input_raw_data) + loc_umap = trustworthiness(input_raw_data.toarray(), embedding, n_neighbors=15) + + trust_diff = loc_umap - dist_umap + assert trust_diff <= 0.15 diff --git a/python/tests/utils.py b/python/tests/utils.py index 379e0627..c2c61281 100644 --- a/python/tests/utils.py +++ b/python/tests/utils.py @@ -19,6 +19,7 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple, TypeVar, Union import numpy as np +import pandas as pd import pyspark from pyspark.ml.feature import VectorAssembler from pyspark.sql import SparkSession @@ -80,10 +81,17 @@ def create_pyspark_dataframe( dtype: np.dtype, data: np.ndarray, label: Optional[np.ndarray] = None, + label_dtype: Optional[np.dtype] = None, # type: ignore ) -> Tuple[pyspark.sql.DataFrame, Union[str, List[str]], Optional[str]]: """Construct a dataframe based on features and label data.""" assert feature_type in pyspark_supported_feature_types + # in case cp.ndarray get passed in + if not isinstance(data, np.ndarray): + data = data.get() + if label is not None and not isinstance(label, np.ndarray): + label = label.get() + m, n = data.shape pyspark_type = dtype_to_pyspark_type(dtype) @@ -92,17 +100,31 @@ def create_pyspark_dataframe( label_col = None if label is not None: + label_dtype = dtype if label_dtype is None else label_dtype + label = label.astype(label_dtype) + label_pyspark_type = dtype_to_pyspark_type(label_dtype) + label_col = "label_col" - schema.append(f"{label_col} {pyspark_type}") + schema.append(f"{label_col} {label_pyspark_type}") + + pdf = pd.DataFrame(data, dtype=dtype, columns=feature_cols) + pdf[label_col] = label.astype(label_dtype) df = spark.createDataFrame( - np.concatenate((data, label.reshape(m, 1)), axis=1).tolist(), + pdf, ",".join(schema), ) else: df = spark.createDataFrame(data.tolist(), ",".join(schema)) if feature_type == feature_types.array: - df = df.withColumn("features", array(*feature_cols)).drop(*feature_cols) + # avoid calling df.withColumn here because runtime slowdown is observed when df has many columns (e.g. 3000). + from pyspark.sql.functions import col + + selected_col = [array(*feature_cols).alias("features")] + if label_col: + selected_col.append(col(label_col).alias(label_col)) + df = df.select(selected_col) + feature_cols = "features" elif feature_type == feature_types.vector: df = ( @@ -113,6 +135,11 @@ def create_pyspark_dataframe( .drop(*feature_cols) ) feature_cols = "features" + else: + # When df has many columns (e.g. 3000), and was created by calling spark.createDataFrame on a pandas DataFrame, + # calling df.withColumn can lead to noticeable runtime slowdown. 
+ # Using select here can significantly reduce the runtime and improve the performance. + df = df.select("*") return df, feature_cols, label_col
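
For reference, the select-vs-withColumn note in `python/tests/utils.py` above can be reproduced standalone. The sketch below is illustrative only and is not part of the patch: the local SparkSession, the 3000-column toy DataFrame, and the timing printout are assumptions chosen to mirror the comment, and the observed gap depends on the Spark version and plan size.

```python
# Illustrative sketch only; NOT part of the patch above. It restates the
# select-vs-withColumn note from python/tests/utils.py on a toy wide DataFrame.
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import array

spark = (
    SparkSession.builder.master("local[2]").appName("wide-df-sketch").getOrCreate()
)

n_cols = 3000  # "many columns", as in the utils.py comment
feature_cols = [f"c{i}" for i in range(n_cols)]
df = spark.createDataFrame([tuple(float(i) for i in range(n_cols))], feature_cols)

# Pattern the patch moves away from: withColumn on an already-wide DataFrame,
# which re-analyzes a plan carrying thousands of columns.
start = time.time()
slow_df = df.withColumn("features", array(*feature_cols)).drop(*feature_cols)
slow_df.count()
print(f"withColumn + drop: {time.time() - start:.2f}s")

# Pattern used in the patch: build the whole projection once with select.
start = time.time()
fast_df = df.select(array(*feature_cols).alias("features"))
fast_df.count()
print(f"select:            {time.time() - start:.2f}s")

spark.stop()
```

This is the same trade-off applied in both places touched by the patch: assembling the `features` array column via a single `select` projection, and the plain `df.select("*")` fallback branch at the end of `create_pyspark_dataframe`.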