diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index 9b3aa436..00000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "image": "gquant/gquant:10.1_10.2-runtime-ubuntu18.04_0.14.1_dev", - "workspaceFolder": "/workspace/", - "workspaceMount": "source=${localEnv:GQUANT_ROOT},target=/workspace,type=bind,consistency=cached", - "extensions": [ - "ms-python.python", "dbaeumer.vscode-eslint" - ], - "settings": { - "terminal.integrated.shell.linux": "/bin/bash", - "python.pythonPath": "/home/quant/miniconda3/bin/python", - "python.linting.enabled": true, - "python.linting.pylintEnabled": true, - "python.formatting.autopep8Path": "/home/quant/miniconda3/bin/autopep8", - "python.linting.flake8Path": "/home/quant/miniconda3/bin/flake8", - "python.linting.pylintPath": "/home/quant/miniconda3/bin/pylint", - "python.testing.pytestPath": "/home/quant/miniconda3/bin/pytest" - }, - "forwardPorts": [8080, 8888], - "remoteUser": "quant", - "containerUser": "quant", - "runArgs": ["--runtime=nvidia"] -} diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index ea4aa471..00000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,183 +0,0 @@ -# Changelog - -## [v1.0.1](https://github.com/rapidsai/gQuant/tree/v1.0.1) (2021-01-20) - -[Full Changelog](https://github.com/rapidsai/gQuant/compare/v1.0.0...v1.0.1) - -**Merged pull requests:** - -- \[REVIEW\] Simple external plugin example [\#113](https://github.com/rapidsai/gQuant/pull/113) ([yidong72](https://github.com/yidong72)) - -## [v1.0.0](https://github.com/rapidsai/gQuant/tree/v1.0.0) (2020-12-30) - -[Full Changelog](https://github.com/rapidsai/gQuant/compare/0.5...v1.0.0) - -**Closed issues:** - -- \[FEA\]We need Plugin file support [\#106](https://github.com/rapidsai/gQuant/issues/106) -- \[FEA\]Migrate to the new input/output port API [\#97](https://github.com/rapidsai/gQuant/issues/97) -- \[FEA\]Add the gQuant Web UI 
[\#95](https://github.com/rapidsai/gQuant/issues/95) -- \[DOC\] Add CHANGELOG.md [\#48](https://github.com/rapidsai/gQuant/issues/48) - -**Merged pull requests:** - -- \[REVIEW\]gQuant plugin implementation [\#112](https://github.com/rapidsai/gQuant/pull/112) ([yidong72](https://github.com/yidong72)) -- Gpuciscripts clean and update [\#111](https://github.com/rapidsai/gQuant/pull/111) ([msadang](https://github.com/msadang)) -- \[REVIEW\] gQuant 1.0 [\#110](https://github.com/rapidsai/gQuant/pull/110) ([yidong72](https://github.com/yidong72)) -- Streamz gQuant example 2 [\#109](https://github.com/rapidsai/gQuant/pull/109) ([yidong72](https://github.com/yidong72)) -- Revert "Streamz gQuant example" [\#108](https://github.com/rapidsai/gQuant/pull/108) ([yidong72](https://github.com/yidong72)) -- Streamz gQuant example [\#107](https://github.com/rapidsai/gQuant/pull/107) ([yidong72](https://github.com/yidong72)) -- Bump node-fetch from 2.6.0 to 2.6.1 in /gquantlab [\#104](https://github.com/rapidsai/gQuant/pull/104) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Nemo and xgboost integration [\#103](https://github.com/rapidsai/gQuant/pull/103) ([yidong72](https://github.com/yidong72)) -- FIX Update change log check [\#102](https://github.com/rapidsai/gQuant/pull/102) ([mike-wendt](https://github.com/mike-wendt)) -- \[REVIEW\] Update CI scripts to remove references to master \[skip ci\] [\#99](https://github.com/rapidsai/gQuant/pull/99) ([dillon-cullinan](https://github.com/dillon-cullinan)) -- \[skip ci\] Update master references for main branch [\#98](https://github.com/rapidsai/gQuant/pull/98) ([ajschmidt8](https://github.com/ajschmidt8)) -- \[REVIEW\]gQuant UI, first version [\#89](https://github.com/rapidsai/gQuant/pull/89) ([yidong72](https://github.com/yidong72)) - -## [0.5](https://github.com/rapidsai/gQuant/tree/0.5) (2020-07-10) - -[Full Changelog](https://github.com/rapidsai/gQuant/compare/0.4.1...0.5) - -**Implemented enhancements:** - -- \[FEA\] 
csvStockLoader.py and stockNameLoader.py - Use cudf.read\_csv\(\) insteand of pandas.read\_csv\(\) [\#24](https://github.com/rapidsai/gQuant/issues/24) - -**Fixed bugs:** - -- \[BUG\] Using a UDF via Series.rolling.apply\(\) results in KeyError in numba [\#88](https://github.com/rapidsai/gQuant/issues/88) -- \[BUG\] download\_data.sh seems to do not be in containers anymore [\#66](https://github.com/rapidsai/gQuant/issues/66) - -**Closed issues:** - -- \[FEA\] Conda resolves too slow in the latest versions of the container [\#67](https://github.com/rapidsai/gQuant/issues/67) -- \[FEA\] Comprehensive refactoring of indicator\_demo.ipynb notebook [\#46](https://github.com/rapidsai/gQuant/issues/46) -- \[FEA\] Rename viz\_graph\(\) to viz\(\), save\_taskgraph\(\) to save\(\) [\#34](https://github.com/rapidsai/gQuant/issues/34) - -**Merged pull requests:** - -- \[REIVEW\]gQuant 0.5 release [\#94](https://github.com/rapidsai/gQuant/pull/94) ([yidong72](https://github.com/yidong72)) -- \[REVIEW\] Fix mortgage e2e example for rapids 0.14. 
[\#93](https://github.com/rapidsai/gQuant/pull/93) ([avolkov1](https://github.com/avolkov1)) -- \[REVIEW\] Update RAPIDS to version 0.14 [\#92](https://github.com/rapidsai/gQuant/pull/92) ([yidong72](https://github.com/yidong72)) -- \[REVIEW\]Multiple gpu xgboost - Dask performance fix [\#91](https://github.com/rapidsai/gQuant/pull/91) ([yidong72](https://github.com/yidong72)) -- \[REVIEW\]Mutliple GPU xgboost [\#90](https://github.com/rapidsai/gQuant/pull/90) ([yidong72](https://github.com/yidong72)) - -## [0.4.1](https://github.com/rapidsai/gQuant/tree/0.4.1) (2020-05-26) - -[Full Changelog](https://github.com/rapidsai/gQuant/compare/0.4...0.4.1) - -**Merged pull requests:** - -- \[REVIEW\] hot fix for 0.4 release [\#86](https://github.com/rapidsai/gQuant/pull/86) ([yidong72](https://github.com/yidong72)) -- \[REVIEW\] fix the cuIndicator notebook and RSI perf notebook [\#85](https://github.com/rapidsai/gQuant/pull/85) ([yidong72](https://github.com/yidong72)) -- Add cuda102 docker support and update version against development branch [\#84](https://github.com/rapidsai/gQuant/pull/84) ([jbaron](https://github.com/jbaron)) - -## [0.4](https://github.com/rapidsai/gQuant/tree/0.4) (2020-05-19) - -[Full Changelog](https://github.com/rapidsai/gQuant/compare/v0.2...0.4) - -**Implemented enhancements:** - -- \[REVIEW\]Feature adding fractional differencing computation [\#56](https://github.com/rapidsai/gQuant/pull/56) ([yidong72](https://github.com/yidong72)) - -**Fixed bugs:** - -- \[BUG\] Dask computation fails with 0.8 build script [\#28](https://github.com/rapidsai/gQuant/issues/28) - -**Closed issues:** - -- \[FEA\] Add cuda 10.1.2 support [\#64](https://github.com/rapidsai/gQuant/issues/64) -- \[FEA\] Use RAPIDS 0.9 container in build.sh [\#54](https://github.com/rapidsai/gQuant/issues/54) -- \[FEA\] Rename notebook to notebooks [\#50](https://github.com/rapidsai/gQuant/issues/50) -- \[FEA\] Add Jupyterlab extension to display GPU usage 
[\#49](https://github.com/rapidsai/gQuant/issues/49) -- \[FEA\] Merge develop branch to master [\#47](https://github.com/rapidsai/gQuant/issues/47) -- \[FEA\] implement the fractional difference operation [\#42](https://github.com/rapidsai/gQuant/issues/42) - -**Merged pull requests:** - -- \[REVIEW\] merge develop to master and release it as 0.4 [\#82](https://github.com/rapidsai/gQuant/pull/82) ([yidong72](https://github.com/yidong72)) -- \[REVIEW\]update to latest version of RAPIDS 0.13 [\#81](https://github.com/rapidsai/gQuant/pull/81) ([yidong72](https://github.com/yidong72)) -- fixed the gamma computation error [\#79](https://github.com/rapidsai/gQuant/pull/79) ([doyend](https://github.com/doyend)) -- \[REVIEW\]asian barrier option tutorial [\#77](https://github.com/rapidsai/gQuant/pull/77) ([yidong72](https://github.com/yidong72)) -- \[REVIEW\] upgrade to RAPIDS 0.11 [\#76](https://github.com/rapidsai/gQuant/pull/76) ([yidong72](https://github.com/yidong72)) -- \[skip ci\] Merge CI Scripts [\#75](https://github.com/rapidsai/gQuant/pull/75) ([avolkov1](https://github.com/avolkov1)) -- \[REVIEW\] Add CI scripts and conda recipe [\#74](https://github.com/rapidsai/gQuant/pull/74) ([raydouglass](https://github.com/raydouglass)) -- \[WIP\] CUQ-36: fix typechecking nodes multi input dataframes [\#68](https://github.com/rapidsai/gQuant/pull/68) ([avolkov1](https://github.com/avolkov1)) -- \[REVIEW\] Upgrade to RAPIDS 0.10 [\#63](https://github.com/rapidsai/gQuant/pull/63) ([yidong72](https://github.com/yidong72)) -- \[REVIEW\] stable master merge [\#62](https://github.com/rapidsai/gQuant/pull/62) ([yidong72](https://github.com/yidong72)) -- \[REVIEW\]upgrade to RAPIDS 0.9, FIX the rebase problem [\#61](https://github.com/rapidsai/gQuant/pull/61) ([yidong72](https://github.com/yidong72)) -- Revert "\[REVIEW\]upgrade to RAPIDS 0.9" [\#59](https://github.com/rapidsai/gQuant/pull/59) ([yidong72](https://github.com/yidong72)) -- Revert "\[REVIEW\]upgrade to RAPIDS 0.9" 
[\#58](https://github.com/rapidsai/gQuant/pull/58) ([avolkov1](https://github.com/avolkov1)) -- \[REVIEW\]upgrade to RAPIDS 0.9 [\#57](https://github.com/rapidsai/gQuant/pull/57) ([yidong72](https://github.com/yidong72)) -- \[REVIEW\] change the text for notebook 05 [\#55](https://github.com/rapidsai/gQuant/pull/55) ([yidong72](https://github.com/yidong72)) -- Fix \#50b - Rename notebook folder to notebooks [\#52](https://github.com/rapidsai/gQuant/pull/52) ([miguelusque](https://github.com/miguelusque)) -- Fix \#50 - Rename notebook folder to notebooks [\#51](https://github.com/rapidsai/gQuant/pull/51) ([miguelusque](https://github.com/miguelusque)) - -## [v0.2](https://github.com/rapidsai/gQuant/tree/v0.2) (2019-08-16) - -[Full Changelog](https://github.com/rapidsai/gQuant/compare/v0.1...v0.2) - -**Implemented enhancements:** - -- \[FEA\] Refactor 04\_portfolio\_trade.ipynb notebook [\#39](https://github.com/rapidsai/gQuant/issues/39) -- \[FEA\] Refactor notebook 01\_tutorial.ipynb [\#35](https://github.com/rapidsai/gQuant/issues/35) -- \[FEA\] Add error message \(or warning\) if replace node does not exist [\#32](https://github.com/rapidsai/gQuant/issues/32) -- \[FEA\] Add new issue templates [\#26](https://github.com/rapidsai/gQuant/issues/26) -- \[FEA\] cuIndicator notebook plot widget is too complicated [\#17](https://github.com/rapidsai/gQuant/issues/17) - -**Fixed bugs:** - -- \[BUG\] Remove debug info from barPlotNode.py and cumReturnNode.py [\#40](https://github.com/rapidsai/gQuant/issues/40) -- \[BUG\] 04\_portfolio\_trade.ipynb - Number of filtered stocks differs from text [\#23](https://github.com/rapidsai/gQuant/issues/23) - -**Merged pull requests:** - -- Fix \#17 - cuIndicator notebook plot widget is too complicated \(WIP\) [\#45](https://github.com/rapidsai/gQuant/pull/45) ([miguelusque](https://github.com/miguelusque)) -- Fix \#39 - Refactor 04\_portfolio\_trade.ipynb notebook [\#44](https://github.com/rapidsai/gQuant/pull/44) 
([miguelusque](https://github.com/miguelusque)) -- Merge develop to master [\#43](https://github.com/rapidsai/gQuant/pull/43) ([yidong72](https://github.com/yidong72)) -- Fix \#40 - Remove debug info [\#41](https://github.com/rapidsai/gQuant/pull/41) ([miguelusque](https://github.com/miguelusque)) -- Update mortgage example using TaskGraph API. [\#38](https://github.com/rapidsai/gQuant/pull/38) ([avolkov1](https://github.com/avolkov1)) -- fixed the issue 32 [\#37](https://github.com/rapidsai/gQuant/pull/37) ([yidong72](https://github.com/yidong72)) -- Fix \#35 - Refactor 01\_tutorial.ipynb notebook [\#36](https://github.com/rapidsai/gQuant/pull/36) ([miguelusque](https://github.com/miguelusque)) -- Fix \#26b - Add new issue templates [\#30](https://github.com/rapidsai/gQuant/pull/30) ([miguelusque](https://github.com/miguelusque)) -- Revert "fix \#26 - Add new issues template" [\#29](https://github.com/rapidsai/gQuant/pull/29) ([yidong72](https://github.com/yidong72)) -- Fix \#26 - Add new issues template [\#27](https://github.com/rapidsai/gQuant/pull/27) ([miguelusque](https://github.com/miguelusque)) -- added workflow class [\#22](https://github.com/rapidsai/gQuant/pull/22) ([yidong72](https://github.com/yidong72)) -- Fix \#19b - Combine OS/Cuda versions user input [\#21](https://github.com/rapidsai/gQuant/pull/21) ([miguelusque](https://github.com/miguelusque)) -- Fix \#19 - build.sh - Move pip dependencies to conda dependencies [\#20](https://github.com/rapidsai/gQuant/pull/20) ([miguelusque](https://github.com/miguelusque)) -- Fix \#13, \#14, \#16 in cuIndicator.ipynb notebook [\#18](https://github.com/rapidsai/gQuant/pull/18) ([miguelusque](https://github.com/miguelusque)) -- update the build.sh [\#15](https://github.com/rapidsai/gQuant/pull/15) ([yidong72](https://github.com/yidong72)) -- Feature xgb notebook [\#11](https://github.com/rapidsai/gQuant/pull/11) ([yidong72](https://github.com/yidong72)) -- CUQ-5: Mortgage example using gQuant. 
[\#10](https://github.com/rapidsai/gQuant/pull/10) ([avolkov1](https://github.com/avolkov1)) -- CUQ-5: Mortgage example using gQuant. [\#9](https://github.com/rapidsai/gQuant/pull/9) ([avolkov1](https://github.com/avolkov1)) -- Feature indicator node [\#8](https://github.com/rapidsai/gQuant/pull/8) ([yidong72](https://github.com/yidong72)) -- Feature mulit assets indicator [\#7](https://github.com/rapidsai/gQuant/pull/7) ([yidong72](https://github.com/yidong72)) -- Update build.sh [\#6](https://github.com/rapidsai/gQuant/pull/6) ([phogan-nvidia](https://github.com/phogan-nvidia)) -- Feature environment [\#5](https://github.com/rapidsai/gQuant/pull/5) ([yidong72](https://github.com/yidong72)) - -## [v0.1](https://github.com/rapidsai/gQuant/tree/v0.1) (2019-08-13) - -[Full Changelog](https://github.com/rapidsai/gQuant/compare/e4a967fc9e3289fdbfa37e7a7b84887579332b42...v0.1) - -**Implemented enhancements:** - -- \[FEA\] build.sh - Move pip dependencies to conda dependencies [\#19](https://github.com/rapidsai/gQuant/issues/19) - -**Fixed bugs:** - -- \[BUG\] Update build.sh to 0.7 until issue \#28 is fixed [\#31](https://github.com/rapidsai/gQuant/issues/31) -- \[BUG\] cuIndicator.ipyng - Wrong series names [\#16](https://github.com/rapidsai/gQuant/issues/16) -- \[BUG\] cuIndicator.ipynb - Runtime error in cell \#3 - Missing file [\#14](https://github.com/rapidsai/gQuant/issues/14) -- \[BUG\] cuIndicator.ipynb - Incorrect path to dataset [\#13](https://github.com/rapidsai/gQuant/issues/13) - -**Merged pull requests:** - -- Revert "gQuant34 - Update build.sh to make use of RAPIDS v0.8 container" [\#33](https://github.com/rapidsai/gQuant/pull/33) ([yidong72](https://github.com/yidong72)) -- gQuant34 - Update build.sh to make use of RAPIDS v0.8 container [\#12](https://github.com/rapidsai/gQuant/pull/12) ([miguelusque](https://github.com/miguelusque)) -- Synch master with develop [\#4](https://github.com/rapidsai/gQuant/pull/4) ([avolkov1](https://github.com/avolkov1)) -- 
added unit tests for the cuindicator [\#3](https://github.com/rapidsai/gQuant/pull/3) ([yidong72](https://github.com/yidong72)) -- CUQ-21: Improving tutorials for gQuant [\#2](https://github.com/rapidsai/gQuant/pull/2) ([avolkov1](https://github.com/avolkov1)) -- Add download script and instructions in the readme [\#1](https://github.com/rapidsai/gQuant/pull/1) ([yidong72](https://github.com/yidong72)) - - - -\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/README.md b/README.md index a25cc9f2..9ad180b9 100644 --- a/README.md +++ b/README.md @@ -1,125 +1,6 @@ -# gQuant - GPU Accelerated Graph Computation for Quantitative Analyst Tasks +# FSI Examples -**NOTE:** For the latest stable [README.md](https://github.com/rapidsai/gquant/blob/main/README.md) ensure you are on the `main` branch. +Repo Index -## What is gQuant? -gQuant is a collection of open-source GPU accelerated Python tools and examples for quantitative analyst tasks, built on top of the [RAPIDS AI](https://rapids.ai/) project, [Numba](https://numba.pydata.org/), and [Dask](https://dask.org/). +1. [gQuant](gQuant) - A graph computation toolkit that helps you organize workflows in graph computation. -The examples range from simple accelerated calculation of technical trading indicators through defining workflows for interactively developing trading strategies and automating many typical tasks. - -The extensibility of the system is highlighted by examples showing how to create a dataframe flow graph, which allows for easy re-use and composability of higher level workflows. - -The examples also show how to easily convert a single-threaded solution into a Dask distributed one. - -These examples can be used as-is or, as they are open source, can be extended to suit your environments. 
- -## gQuant jupyterlab extension -![Tuturial](tutorial.gif "Tutorial") -![Quick Demo](gquantlab_demo.gif "Demo") -The gQuant Juyterlab extension provides the user interface to build the dataframe flow TaskGraph easily. It takes advantage of the open sources projects like [jupyterlab](https://github.com/jupyterlab/jupyterlab), [ipywidget](https://github.com/jupyter-widgets/ipywidgets), [React](https://reactjs.org/) and [D3](https://d3js.org/). It features: -1. Takes full advantage of the JupyterLab project that the extension adds commands to Jupyterlab context menu, command palette and bind them with keyboard shortcuts to speed up the productivity. -2. Define a new TaskGraph file format `.gq.yaml` that can be edited in the Jupyterlab. -3. Visually presents the TaskGraph as a DAG graph. Users can zoom in and out, freely move the nodes around, and make connections between nodes. -4. Use the special `Ouput Collector` to gather the results and organize them in a tab widget. The IPython [rich display](https://ipython.readthedocs.io/en/stable/config/integrating.html#rich-display) is fully supported. -5. Visually shows the progress of graph evaluation and computation dependence. -6. Automatically generate the UI elements to edit and validate the Node configuration given the configuration JSON schema. It exposes the function API in a user-friendly way. User can change the configuration and re-run the computation to test out the hyperparameters easily. -7. Dynamically compute the input-output ports compatibility, dataframe columns names and types, ports types to prevent connection errors. -8. Nodes can have multiple output ports that can be used to generate different output types. E.g. some data loader Node provides both `cudf` and `dask_cudf` output ports. The multiple GPUs distributed computation computation is automatically enabled by switching to the `dask_cudf` output port. -9. Provides the standard API to extend your computation Nodes. -10. 
The composite node can encapsulate the TaskGraph into a single node for easy reuse. The composite node can be exported as a regular gQuant node without any coding. - -### Binary pip installation - -To install the gQuant graph computation library, first install the dependence libraries: -```bash -pip install dask[dataframe] distributed networkx -conda install python-graphviz ruamel.yaml numpy pandas -``` -Then install gquant lib: -```bash -pip install gquant -``` - -To install JupyterLab plugin, install the following dependence libraries: -```bash -conda install nodejs ipywidgets -``` -Then install the gquantlab lib: -```bash -pip install gquantlab==0.1.2 -``` -Build the ipywidgets Jupyterlab plugin -```bash -jupyter labextension install @jupyter-widgets/jupyterlab-manager@2.0 -``` -If you launch the JupyterLab, it will prompt to build the new plugin. You can -explicitly build it by: -```bash -jupyter lab build -``` - -Note, the gQuant node plugins are defined in the `gquantrc` file. Check the `System environment` for details - - -### Prerequisites -- NVIDIA Pascal™ GPU architecture or better. -- [CUDA 9.2](https://developer.nvidia.com/cuda-92-download-archive) with driver v396.37+ or [CUDA 10.0](https://developer.nvidia.com/cuda-10.0-download-archive) with driver v410.48+. -- Ubuntu 16.04 or 18.04. -- [NVIDIA-docker v2+](https://github.com/nvidia/nvidia-docker/wiki/Frequently-Asked-Questions#how-do-i-install-20-if-im-not-using-the-latest-docker-version). - - -### Download data files - -Run the following command at the project root diretory -```bash -bash download_data.sh - -``` - -### Install - -gQuant source code can be downloaded from [GitHub](https://github.com/rapidsai/gquant). - -- Git clone source code: - -```bash -$ git clone https://github.com/rapidsai/gQuant.git -``` - - -- Build and run the container: - -```bash -$ cd gQuant/docker && . build.sh -``` -When building the container, you can run gQuant in two modes: dev or prod. 
In the dev mode, please check the README file in `gquantlab` directory to install the plugins and Python libraries. - -In the production mode, you can launch the container by following command and start to use it -```bash -$ docker run --runtime=nvidia --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 gquant/gquant:[tag from the build] -``` - -### Example notebooks - -Example notebooks, tutorial showcasing, can be found in __notebooks__ folder. - - -### System environment - -There are a few system environment that the user can overwrite. - -The custom module files are specified in the `gquantrc` file. `GQUANT_CONFIG` enviroment variable points to the location of this file. By default, it points to -`$CWD\gquantrc`. - -In the example `gquantrc`, system environment variable `MODULEPATH` is used to point to the paths of the module files. -To start the jupyterlab, please make sure `MODULEPATH` is set properly. - -For example, if you want to start the jupyterlab in the gQuant root directory. -```bash -MODULEPATH=$PWD/modules jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' -``` - -Or, if you want to start the jupyterlab in the gquantlab directory. 
-```bash -GQUANT_CONFIG=../gquantrc MODULEPATH=$PWD/../modules jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' -``` diff --git a/.gitignore b/gQuant/.gitignore similarity index 100% rename from .gitignore rename to gQuant/.gitignore diff --git a/gQuant/CHANGELOG.md b/gQuant/CHANGELOG.md new file mode 100644 index 00000000..9c4b070d --- /dev/null +++ b/gQuant/CHANGELOG.md @@ -0,0 +1,191 @@ +# Changelog + +## [v1.0.2](https://github.com/NVIDIA/fsi-samples/tree/v1.0.2) (2021-02-11) + +[Full Changelog](https://github.com/NVIDIA/fsi-samples/compare/v1.0.1...v1.0.2) + +**Merged pull requests:** + +- \[REVIEW\]Upgrade the plugin to JupyterLab 3 [\#114](https://github.com/NVIDIA/fsi-samples/pull/114) ([yidong72](https://github.com/yidong72)) + +## [v1.0.1](https://github.com/NVIDIA/fsi-samples/tree/v1.0.1) (2021-01-20) + +[Full Changelog](https://github.com/NVIDIA/fsi-samples/compare/v1.0.0...v1.0.1) + +**Merged pull requests:** + +- \[REVIEW\]gquant 1.0.1 [\#115](https://github.com/NVIDIA/fsi-samples/pull/115) ([yidong72](https://github.com/yidong72)) + +## [v1.0.0](https://github.com/NVIDIA/fsi-samples/tree/v1.0.0) (2020-12-30) + +[Full Changelog](https://github.com/NVIDIA/fsi-samples/compare/0.5...v1.0.0) + +**Closed issues:** + +- \[FEA\]We need Plugin file support [\#106](https://github.com/NVIDIA/fsi-samples/issues/106) +- \[FEA\]Migrate to the new input/output port API [\#97](https://github.com/NVIDIA/fsi-samples/issues/97) +- \[FEA\]Add the gQuant Web UI [\#95](https://github.com/NVIDIA/fsi-samples/issues/95) +- \[DOC\] Add CHANGELOG.md [\#48](https://github.com/NVIDIA/fsi-samples/issues/48) + +**Merged pull requests:** + +- \[REVIEW\]gQuant plugin implementation [\#112](https://github.com/NVIDIA/fsi-samples/pull/112) ([yidong72](https://github.com/yidong72)) +- Gpuciscripts clean and update [\#111](https://github.com/NVIDIA/fsi-samples/pull/111) ([msadang](https://github.com/msadang)) +- \[REVIEW\] gQuant 1.0 
[\#110](https://github.com/NVIDIA/fsi-samples/pull/110) ([yidong72](https://github.com/yidong72)) +- Streamz gQuant example 2 [\#109](https://github.com/NVIDIA/fsi-samples/pull/109) ([yidong72](https://github.com/yidong72)) +- Revert "Streamz gQuant example" [\#108](https://github.com/NVIDIA/fsi-samples/pull/108) ([yidong72](https://github.com/yidong72)) +- Streamz gQuant example [\#107](https://github.com/NVIDIA/fsi-samples/pull/107) ([yidong72](https://github.com/yidong72)) +- Bump node-fetch from 2.6.0 to 2.6.1 in /gquantlab [\#104](https://github.com/NVIDIA/fsi-samples/pull/104) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Nemo and xgboost integration [\#103](https://github.com/NVIDIA/fsi-samples/pull/103) ([yidong72](https://github.com/yidong72)) +- FIX Update change log check [\#102](https://github.com/NVIDIA/fsi-samples/pull/102) ([mike-wendt](https://github.com/mike-wendt)) +- \[REVIEW\] Update CI scripts to remove references to master \[skip ci\] [\#99](https://github.com/NVIDIA/fsi-samples/pull/99) ([dillon-cullinan](https://github.com/dillon-cullinan)) +- \[skip ci\] Update master references for main branch [\#98](https://github.com/NVIDIA/fsi-samples/pull/98) ([ajschmidt8](https://github.com/ajschmidt8)) +- \[REVIEW\]gQuant UI, first version [\#89](https://github.com/NVIDIA/fsi-samples/pull/89) ([yidong72](https://github.com/yidong72)) + +## [0.5](https://github.com/NVIDIA/fsi-samples/tree/0.5) (2020-07-10) + +[Full Changelog](https://github.com/NVIDIA/fsi-samples/compare/0.4.1...0.5) + +**Implemented enhancements:** + +- \[FEA\] csvStockLoader.py and stockNameLoader.py - Use cudf.read\_csv\(\) insteand of pandas.read\_csv\(\) [\#24](https://github.com/NVIDIA/fsi-samples/issues/24) + +**Fixed bugs:** + +- \[BUG\] Using a UDF via Series.rolling.apply\(\) results in KeyError in numba [\#88](https://github.com/NVIDIA/fsi-samples/issues/88) +- \[BUG\] download\_data.sh seems to do not be in containers anymore 
[\#66](https://github.com/NVIDIA/fsi-samples/issues/66) + +**Closed issues:** + +- \[FEA\] Conda resolves too slow in the latest versions of the container [\#67](https://github.com/NVIDIA/fsi-samples/issues/67) +- \[FEA\] Comprehensive refactoring of indicator\_demo.ipynb notebook [\#46](https://github.com/NVIDIA/fsi-samples/issues/46) +- \[FEA\] Rename viz\_graph\(\) to viz\(\), save\_taskgraph\(\) to save\(\) [\#34](https://github.com/NVIDIA/fsi-samples/issues/34) + +**Merged pull requests:** + +- \[REIVEW\]gQuant 0.5 release [\#94](https://github.com/NVIDIA/fsi-samples/pull/94) ([yidong72](https://github.com/yidong72)) +- \[REVIEW\] Fix mortgage e2e example for rapids 0.14. [\#93](https://github.com/NVIDIA/fsi-samples/pull/93) ([avolkov1](https://github.com/avolkov1)) +- \[REVIEW\] Update RAPIDS to version 0.14 [\#92](https://github.com/NVIDIA/fsi-samples/pull/92) ([yidong72](https://github.com/yidong72)) +- \[REVIEW\]Multiple gpu xgboost - Dask performance fix [\#91](https://github.com/NVIDIA/fsi-samples/pull/91) ([yidong72](https://github.com/yidong72)) +- \[REVIEW\]Mutliple GPU xgboost [\#90](https://github.com/NVIDIA/fsi-samples/pull/90) ([yidong72](https://github.com/yidong72)) + +## [0.4.1](https://github.com/NVIDIA/fsi-samples/tree/0.4.1) (2020-05-26) + +[Full Changelog](https://github.com/NVIDIA/fsi-samples/compare/0.4...0.4.1) + +**Merged pull requests:** + +- \[REVIEW\] hot fix for 0.4 release [\#86](https://github.com/NVIDIA/fsi-samples/pull/86) ([yidong72](https://github.com/yidong72)) +- \[REVIEW\] fix the cuIndicator notebook and RSI perf notebook [\#85](https://github.com/NVIDIA/fsi-samples/pull/85) ([yidong72](https://github.com/yidong72)) +- Add cuda102 docker support and update version against development branch [\#84](https://github.com/NVIDIA/fsi-samples/pull/84) ([jbaron](https://github.com/jbaron)) + +## [0.4](https://github.com/NVIDIA/fsi-samples/tree/0.4) (2020-05-19) + +[Full 
Changelog](https://github.com/NVIDIA/fsi-samples/compare/v0.2...0.4) + +**Implemented enhancements:** + +- \[REVIEW\]Feature adding fractional differencing computation [\#56](https://github.com/NVIDIA/fsi-samples/pull/56) ([yidong72](https://github.com/yidong72)) + +**Fixed bugs:** + +- \[BUG\] Dask computation fails with 0.8 build script [\#28](https://github.com/NVIDIA/fsi-samples/issues/28) + +**Closed issues:** + +- \[FEA\] Add cuda 10.1.2 support [\#64](https://github.com/NVIDIA/fsi-samples/issues/64) +- \[FEA\] Use RAPIDS 0.9 container in build.sh [\#54](https://github.com/NVIDIA/fsi-samples/issues/54) +- \[FEA\] Rename notebook to notebooks [\#50](https://github.com/NVIDIA/fsi-samples/issues/50) +- \[FEA\] Add Jupyterlab extension to display GPU usage [\#49](https://github.com/NVIDIA/fsi-samples/issues/49) +- \[FEA\] Merge develop branch to master [\#47](https://github.com/NVIDIA/fsi-samples/issues/47) +- \[FEA\] implement the fractional difference operation [\#42](https://github.com/NVIDIA/fsi-samples/issues/42) + +**Merged pull requests:** + +- \[REVIEW\] merge develop to master and release it as 0.4 [\#82](https://github.com/NVIDIA/fsi-samples/pull/82) ([yidong72](https://github.com/yidong72)) +- \[REVIEW\]update to latest version of RAPIDS 0.13 [\#81](https://github.com/NVIDIA/fsi-samples/pull/81) ([yidong72](https://github.com/yidong72)) +- fixed the gamma computation error [\#79](https://github.com/NVIDIA/fsi-samples/pull/79) ([doyend](https://github.com/doyend)) +- \[REVIEW\]asian barrier option tutorial [\#77](https://github.com/NVIDIA/fsi-samples/pull/77) ([yidong72](https://github.com/yidong72)) +- \[REVIEW\] upgrade to RAPIDS 0.11 [\#76](https://github.com/NVIDIA/fsi-samples/pull/76) ([yidong72](https://github.com/yidong72)) +- \[skip ci\] Merge CI Scripts [\#75](https://github.com/NVIDIA/fsi-samples/pull/75) ([avolkov1](https://github.com/avolkov1)) +- \[REVIEW\] Add CI scripts and conda recipe 
[\#74](https://github.com/NVIDIA/fsi-samples/pull/74) ([raydouglass](https://github.com/raydouglass)) +- \[WIP\] CUQ-36: fix typechecking nodes multi input dataframes [\#68](https://github.com/NVIDIA/fsi-samples/pull/68) ([avolkov1](https://github.com/avolkov1)) +- \[REVIEW\] Upgrade to RAPIDS 0.10 [\#63](https://github.com/NVIDIA/fsi-samples/pull/63) ([yidong72](https://github.com/yidong72)) +- \[REVIEW\] stable master merge [\#62](https://github.com/NVIDIA/fsi-samples/pull/62) ([yidong72](https://github.com/yidong72)) +- \[REVIEW\]upgrade to RAPIDS 0.9, FIX the rebase problem [\#61](https://github.com/NVIDIA/fsi-samples/pull/61) ([yidong72](https://github.com/yidong72)) +- Revert "\[REVIEW\]upgrade to RAPIDS 0.9" [\#59](https://github.com/NVIDIA/fsi-samples/pull/59) ([yidong72](https://github.com/yidong72)) +- Revert "\[REVIEW\]upgrade to RAPIDS 0.9" [\#58](https://github.com/NVIDIA/fsi-samples/pull/58) ([avolkov1](https://github.com/avolkov1)) +- \[REVIEW\]upgrade to RAPIDS 0.9 [\#57](https://github.com/NVIDIA/fsi-samples/pull/57) ([yidong72](https://github.com/yidong72)) +- \[REVIEW\] change the text for notebook 05 [\#55](https://github.com/NVIDIA/fsi-samples/pull/55) ([yidong72](https://github.com/yidong72)) +- Fix \#50b - Rename notebook folder to notebooks [\#52](https://github.com/NVIDIA/fsi-samples/pull/52) ([miguelusque](https://github.com/miguelusque)) + +## [v0.2](https://github.com/NVIDIA/fsi-samples/tree/v0.2) (2019-08-16) + +[Full Changelog](https://github.com/NVIDIA/fsi-samples/compare/v0.1...v0.2) + +**Implemented enhancements:** + +- \[FEA\] Refactor 04\_portfolio\_trade.ipynb notebook [\#39](https://github.com/NVIDIA/fsi-samples/issues/39) +- \[FEA\] Refactor notebook 01\_tutorial.ipynb [\#35](https://github.com/NVIDIA/fsi-samples/issues/35) +- \[FEA\] Add error message \(or warning\) if replace node does not exist [\#32](https://github.com/NVIDIA/fsi-samples/issues/32) +- \[FEA\] Add new issue templates 
[\#26](https://github.com/NVIDIA/fsi-samples/issues/26) +- \[FEA\] cuIndicator notebook plot widget is too complicated [\#17](https://github.com/NVIDIA/fsi-samples/issues/17) + +**Fixed bugs:** + +- \[BUG\] Remove debug info from barPlotNode.py and cumReturnNode.py [\#40](https://github.com/NVIDIA/fsi-samples/issues/40) +- \[BUG\] 04\_portfolio\_trade.ipynb - Number of filtered stocks differs from text [\#23](https://github.com/NVIDIA/fsi-samples/issues/23) + +**Merged pull requests:** + +- Fix \#50 - Rename notebook folder to notebooks [\#51](https://github.com/NVIDIA/fsi-samples/pull/51) ([miguelusque](https://github.com/miguelusque)) +- Fix \#17 - cuIndicator notebook plot widget is too complicated \(WIP\) [\#45](https://github.com/NVIDIA/fsi-samples/pull/45) ([miguelusque](https://github.com/miguelusque)) +- Fix \#39 - Refactor 04\_portfolio\_trade.ipynb notebook [\#44](https://github.com/NVIDIA/fsi-samples/pull/44) ([miguelusque](https://github.com/miguelusque)) +- Merge develop to master [\#43](https://github.com/NVIDIA/fsi-samples/pull/43) ([yidong72](https://github.com/yidong72)) +- Fix \#40 - Remove debug info [\#41](https://github.com/NVIDIA/fsi-samples/pull/41) ([miguelusque](https://github.com/miguelusque)) +- Update mortgage example using TaskGraph API. 
[\#38](https://github.com/NVIDIA/fsi-samples/pull/38) ([avolkov1](https://github.com/avolkov1)) +- fixed the issue 32 [\#37](https://github.com/NVIDIA/fsi-samples/pull/37) ([yidong72](https://github.com/yidong72)) +- Fix \#35 - Refactor 01\_tutorial.ipynb notebook [\#36](https://github.com/NVIDIA/fsi-samples/pull/36) ([miguelusque](https://github.com/miguelusque)) +- Fix \#26b - Add new issue templates [\#30](https://github.com/NVIDIA/fsi-samples/pull/30) ([miguelusque](https://github.com/miguelusque)) +- Revert "fix \#26 - Add new issues template" [\#29](https://github.com/NVIDIA/fsi-samples/pull/29) ([yidong72](https://github.com/yidong72)) +- Fix \#26 - Add new issues template [\#27](https://github.com/NVIDIA/fsi-samples/pull/27) ([miguelusque](https://github.com/miguelusque)) +- added workflow class [\#22](https://github.com/NVIDIA/fsi-samples/pull/22) ([yidong72](https://github.com/yidong72)) +- Fix \#19b - Combine OS/Cuda versions user input [\#21](https://github.com/NVIDIA/fsi-samples/pull/21) ([miguelusque](https://github.com/miguelusque)) +- Fix \#19 - build.sh - Move pip dependencies to conda dependencies [\#20](https://github.com/NVIDIA/fsi-samples/pull/20) ([miguelusque](https://github.com/miguelusque)) +- Fix \#13, \#14, \#16 in cuIndicator.ipynb notebook [\#18](https://github.com/NVIDIA/fsi-samples/pull/18) ([miguelusque](https://github.com/miguelusque)) +- update the build.sh [\#15](https://github.com/NVIDIA/fsi-samples/pull/15) ([yidong72](https://github.com/yidong72)) +- Feature xgb notebook [\#11](https://github.com/NVIDIA/fsi-samples/pull/11) ([yidong72](https://github.com/yidong72)) +- CUQ-5: Mortgage example using gQuant. 
[\#10](https://github.com/NVIDIA/fsi-samples/pull/10) ([avolkov1](https://github.com/avolkov1)) +- Feature indicator node [\#8](https://github.com/NVIDIA/fsi-samples/pull/8) ([yidong72](https://github.com/yidong72)) +- Feature mulit assets indicator [\#7](https://github.com/NVIDIA/fsi-samples/pull/7) ([yidong72](https://github.com/yidong72)) +- Update build.sh [\#6](https://github.com/NVIDIA/fsi-samples/pull/6) ([phogan-nvidia](https://github.com/phogan-nvidia)) +- Feature environment [\#5](https://github.com/NVIDIA/fsi-samples/pull/5) ([yidong72](https://github.com/yidong72)) + +## [v0.1](https://github.com/NVIDIA/fsi-samples/tree/v0.1) (2019-08-13) + +[Full Changelog](https://github.com/NVIDIA/fsi-samples/compare/e4a967fc9e3289fdbfa37e7a7b84887579332b42...v0.1) + +**Implemented enhancements:** + +- \[FEA\] build.sh - Move pip dependencies to conda dependencies [\#19](https://github.com/NVIDIA/fsi-samples/issues/19) + +**Fixed bugs:** + +- \[BUG\] Update build.sh to 0.7 until issue \#28 is fixed [\#31](https://github.com/NVIDIA/fsi-samples/issues/31) +- \[BUG\] cuIndicator.ipyng - Wrong series names [\#16](https://github.com/NVIDIA/fsi-samples/issues/16) +- \[BUG\] cuIndicator.ipynb - Runtime error in cell \#3 - Missing file [\#14](https://github.com/NVIDIA/fsi-samples/issues/14) +- \[BUG\] cuIndicator.ipynb - Incorrect path to dataset [\#13](https://github.com/NVIDIA/fsi-samples/issues/13) + +**Merged pull requests:** + +- Revert "gQuant34 - Update build.sh to make use of RAPIDS v0.8 container" [\#33](https://github.com/NVIDIA/fsi-samples/pull/33) ([yidong72](https://github.com/yidong72)) +- gQuant34 - Update build.sh to make use of RAPIDS v0.8 container [\#12](https://github.com/NVIDIA/fsi-samples/pull/12) ([miguelusque](https://github.com/miguelusque)) +- CUQ-5: Mortgage example using gQuant. 
[\#9](https://github.com/NVIDIA/fsi-samples/pull/9) ([avolkov1](https://github.com/avolkov1)) +- Synch master with develop [\#4](https://github.com/NVIDIA/fsi-samples/pull/4) ([avolkov1](https://github.com/avolkov1)) +- added unit tests for the cuindicator [\#3](https://github.com/NVIDIA/fsi-samples/pull/3) ([yidong72](https://github.com/yidong72)) +- CUQ-21: Improving tutorials for gQuant [\#2](https://github.com/NVIDIA/fsi-samples/pull/2) ([avolkov1](https://github.com/avolkov1)) +- Add download script and instructions in the readme [\#1](https://github.com/NVIDIA/fsi-samples/pull/1) ([yidong72](https://github.com/yidong72)) + + + +\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/LICENSE b/gQuant/LICENSE similarity index 100% rename from LICENSE rename to gQuant/LICENSE diff --git a/gQuant/README.md b/gQuant/README.md new file mode 100644 index 00000000..6cb0cc90 --- /dev/null +++ b/gQuant/README.md @@ -0,0 +1,100 @@ +# gQuant - Graph Computation Tool + +**NOTE:** For the latest stable [README.md](https://github.com/rapidsai/gquant/blob/main/README.md) ensure you are on the `main` branch. + + +## What's Inside This Repo + +There are a few projects inside this repo: + +1. [gquant](gquant) - A graph computation toolkit that helps you to organize the workflows in graph computation. +2. [gquantlab](gquantlab) - A JupyterLab plugin that provides the UI interface for `gquant`. +3. [plugins](plugins) - A few gquant plugins with example notebooks. + 1. [simple_example](plugins/simple_example) - A simple external plugin example for gQuant. + 2. [rapids_plugin](plugins/rapids_plugin) - An external plugin with a set of nodes for quantitative analyst tasks, built on top of the [RAPIDS AI](https://rapids.ai/) project, [Numba](https://numba.pydata.org/), and [Dask](https://dask.org/). + 3. 
[nemo_plugin](plugins/nemo_plugin) - An external plugin with a set of nodes that wraps the [NeMo library](https://github.com/NVIDIA/NeMo). + +These projects are all released as independent Python projects with their own `setup.py` files. + +## Screenshots +![Tutorial](tutorial.gif "Tutorial") +![Quick Demo](gquantlab_demo.gif "Demo") + + +## Binary installation + +### Install gQuant +To install the gQuant graph computation library, run: +```bash +pip install gquant +``` +Or install `gquant` at the gQuant directory: +```bash +pip install . +``` + +### Install the gQuantLab JupyterLab plugin +To install the `gquantlab` JupyterLab plugin, make sure `nodejs` of version [12^14^15] is installed. E.g.: +```bash +conda install -c conda-forge nodejs=12.4.0 +``` +Then install the `gquantlab`: +```bash +pip install gquantlab +``` +Or install `gquantlab` at the gquantlab directory: +```bash +pip install . +``` + +### Install the gQuant plugins + +Under the plugin root directory, install the plugin as a normal Python package. +```bash +pip install . +``` + +Note, gQuant node plugins can be registered in two ways: + + 1. (Recommended) Write an external plugin using an 'entry point' to register it. Check the `plugins` directory for details. + 2. Register the plugin in the `gquantrc` file. Check the `System environment` section for details. + + +## Docker Install + +- Build and run the container: + +```bash +$ cd gQuant/docker && . build.sh +``` +When building the container, you can run gQuant in two modes: dev or prod. In the dev mode, please check the README file in the `gquantlab` directory to install the plugins and Python libraries. + +In the production mode, you can launch the container with the following command and start using it: +```bash +$ docker run --runtime=nvidia --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 gquant/gquant:[tag from the build] +``` + +## Example notebooks + +Example notebooks and tutorials can be found in the __notebooks__ folder in the plugin directory. 
+ + +## System environment + +There are a few system environment variables that the user can override. + +The custom module files are specified in the `gquantrc` file. The `GQUANT_CONFIG` environment variable points to the location of this file. By default, it points to +`$CWD/gquantrc`. + +In the example `gquantrc`, the system environment variable `MODULEPATH` is used to point to the paths of the module files. +To start the jupyterlab, please make sure `MODULEPATH` is set properly. + +For example, if you want to start the jupyterlab in the gQuant root directory: +```bash +MODULEPATH=$PWD/modules jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' +``` + +Or, if you want to start the jupyterlab in the gquantlab directory: +```bash +GQUANT_CONFIG=../gquantrc MODULEPATH=$PWD/../modules jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' +``` diff --git a/ci/checks/changelog.sh b/gQuant/ci/checks/changelog.sh similarity index 100% rename from ci/checks/changelog.sh rename to gQuant/ci/checks/changelog.sh diff --git a/ci/checks/style.sh b/gQuant/ci/checks/style.sh similarity index 100% rename from ci/checks/style.sh rename to gQuant/ci/checks/style.sh diff --git a/ci/common/prebuild.sh b/gQuant/ci/common/prebuild.sh similarity index 100% rename from ci/common/prebuild.sh rename to gQuant/ci/common/prebuild.sh diff --git a/ci/cpu/build.sh b/gQuant/ci/cpu/build.sh similarity index 100% rename from ci/cpu/build.sh rename to gQuant/ci/cpu/build.sh diff --git a/ci/cpu/prebuild.sh b/gQuant/ci/cpu/prebuild.sh similarity index 100% rename from ci/cpu/prebuild.sh rename to gQuant/ci/cpu/prebuild.sh diff --git a/ci/cpu/upload.sh b/gQuant/ci/cpu/upload.sh similarity index 100% rename from ci/cpu/upload.sh rename to gQuant/ci/cpu/upload.sh diff --git a/ci/gpu/build.sh b/gQuant/ci/gpu/build.sh similarity index 100% rename from ci/gpu/build.sh rename to gQuant/ci/gpu/build.sh diff --git a/ci/gpu/prebuild.sh b/gQuant/ci/gpu/prebuild.sh similarity index 100% 
rename from ci/gpu/prebuild.sh rename to gQuant/ci/gpu/prebuild.sh diff --git a/ci/release/update-version.sh b/gQuant/ci/release/update-version.sh similarity index 100% rename from ci/release/update-version.sh rename to gQuant/ci/release/update-version.sh diff --git a/conda/recipes/gquant/build.sh b/gQuant/conda/recipes/gquant/build.sh similarity index 100% rename from conda/recipes/gquant/build.sh rename to gQuant/conda/recipes/gquant/build.sh diff --git a/conda/recipes/gquant/meta.yaml b/gQuant/conda/recipes/gquant/meta.yaml similarity index 100% rename from conda/recipes/gquant/meta.yaml rename to gQuant/conda/recipes/gquant/meta.yaml diff --git a/docker/build.sh b/gQuant/docker/build.sh similarity index 98% rename from docker/build.sh rename to gQuant/docker/build.sh index 39980f60..b2b75ef0 100755 --- a/docker/build.sh +++ b/gQuant/docker/build.sh @@ -91,7 +91,6 @@ esac mkdir -p gQuant cp -r ../gquant ./gQuant -cp -r ../task_example ./gQuant cp -r ../modules ./gQuant cp -r ../taskgraphs ./gQuant cp ../setup.cfg ./gQuant @@ -153,12 +152,12 @@ RUN wget \ RUN conda install -y -c rapidsai -c nvidia -c conda-forge \ -c defaults rapids=$RAPIDS_VERSION cudatoolkit=$CUDA_STR python=3.7 -RUN conda install -y -c conda-forge jupyterlab'<3.0.0' +RUN conda install -y -c conda-forge jupyterlab RUN conda install -y -c conda-forge python-graphviz bqplot nodejs ipywidgets \ pytables mkl numexpr pydot flask pylint flake8 autopep8 -RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager@2.0 --no-build +RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager --no-build RUN jupyter labextension install bqplot --no-build #RUN jupyter labextension install jupyterlab-nvdashboard --no-build RUN jupyter lab build && jupyter lab clean @@ -169,7 +168,7 @@ RUN pip install jupyterlab-nvdashboard RUN jupyter labextension install jupyterlab-nvdashboard ## install the dask extension -RUN pip install "dask_labextension<5.0.0" +RUN pip install dask_labextension RUN 
jupyter labextension install dask-labextension RUN jupyter serverextension enable dask_labextension diff --git a/gquantlab/LICENSE b/gQuant/gquant/LICENSE similarity index 100% rename from gquantlab/LICENSE rename to gQuant/gquant/LICENSE diff --git a/gQuant/gquant/README.md b/gQuant/gquant/README.md new file mode 100644 index 00000000..4763e9c5 --- /dev/null +++ b/gQuant/gquant/README.md @@ -0,0 +1,40 @@ +# gQuant - Graph Computation Toolkit + +## What is gQuant? + +gQuant is a tool that helps you to organize the workflows. + +1. It defines a TaskGraph file format `.gq.yaml` that describes the workflow. It can be edited easily with the `gquantlab` JupyterLab plugin. +2. It dynamically computes the input-output port compatibility, dataframe column names and types, and port types to prevent connection errors. +3. Nodes can have multiple output ports that can be used to generate different output types. E.g. some data loader Node provides both `cudf` and `dask_cudf` output ports. Multi-GPU distributed computation is automatically enabled by switching to the `dask_cudf` output port. +4. It provides a standard API to extend your computation Nodes. +5. The composite node can encapsulate the TaskGraph into a single node for easy reuse. The composite node can be exported as a regular gQuant node without any coding. +6. gQuant can be extended by writing a plugin with a set of nodes for a particular domain. Check `plugins` for examples. + +These examples can be used as-is or, as they are open source, can be extended to suit your environments. + +## Binary pip installation + +To install the gQuant graph computation library, run: +```bash +pip install gquant +``` +Or install `gquant` at the root directory: +```bash +pip install . +``` + +gQuant node plugins can be registered in two ways: + + 1. (Recommended) Write an external plugin using an 'entry point' to register it. Check the `external` directory for details. + 2. Register the plugin in the `gquantrc` file. 
Check the `System environment` section for details. + +## System environment + +There are a few system environment variables that the user can override. + +The custom module files are specified in the `gquantrc` file. The `GQUANT_CONFIG` environment variable points to the location of this file. By default, it points to +`$CWD/gquantrc`. + +In the example `gquantrc`, the system environment variable `MODULEPATH` is used to point to the paths of the module files. +To start the jupyterlab, please make sure `MODULEPATH` is set properly. diff --git a/docs/Makefile b/gQuant/gquant/docs/Makefile similarity index 100% rename from docs/Makefile rename to gQuant/gquant/docs/Makefile diff --git a/docs/README.md b/gQuant/gquant/docs/README.md similarity index 100% rename from docs/README.md rename to gQuant/gquant/docs/README.md diff --git a/docs/make.bat b/gQuant/gquant/docs/make.bat similarity index 100% rename from docs/make.bat rename to gQuant/gquant/docs/make.bat diff --git a/docs/source/_static/.gitkeep b/gQuant/gquant/docs/source/_static/.gitkeep similarity index 100% rename from docs/source/_static/.gitkeep rename to gQuant/gquant/docs/source/_static/.gitkeep diff --git a/docs/source/_templates/.gitkeep b/gQuant/gquant/docs/source/_templates/.gitkeep similarity index 100% rename from docs/source/_templates/.gitkeep rename to gQuant/gquant/docs/source/_templates/.gitkeep diff --git a/docs/source/conf.py b/gQuant/gquant/docs/source/conf.py similarity index 100% rename from docs/source/conf.py rename to gQuant/gquant/docs/source/conf.py diff --git a/docs/source/index.rst b/gQuant/gquant/docs/source/index.rst similarity index 100% rename from docs/source/index.rst rename to gQuant/gquant/docs/source/index.rst diff --git a/gquant/__init__.py b/gQuant/gquant/gquant/__init__.py similarity index 100% rename from gquant/__init__.py rename to gQuant/gquant/gquant/__init__.py diff --git a/gquant/_common.py b/gQuant/gquant/gquant/_common.py similarity index 100% rename from gquant/_common.py rename to 
gQuant/gquant/gquant/_common.py diff --git a/gquant/dataframe_flow/__init__.py b/gQuant/gquant/gquant/dataframe_flow/__init__.py similarity index 100% rename from gquant/dataframe_flow/__init__.py rename to gQuant/gquant/gquant/dataframe_flow/__init__.py diff --git a/gquant/dataframe_flow/_node.py b/gQuant/gquant/gquant/dataframe_flow/_node.py similarity index 100% rename from gquant/dataframe_flow/_node.py rename to gQuant/gquant/gquant/dataframe_flow/_node.py diff --git a/gquant/dataframe_flow/_node_flow.py b/gQuant/gquant/gquant/dataframe_flow/_node_flow.py similarity index 98% rename from gquant/dataframe_flow/_node_flow.py rename to gQuant/gquant/gquant/dataframe_flow/_node_flow.py index 236b8738..d4586f00 100644 --- a/gquant/dataframe_flow/_node_flow.py +++ b/gQuant/gquant/gquant/dataframe_flow/_node_flow.py @@ -85,6 +85,7 @@ class NodeTaskGraphMixin(object): load save delayed_process + infer_meta METHODS ------- @@ -465,10 +466,12 @@ def get_pout(out_dict, port): # Otherwise process will run several times. 
for inputs_ in inputs_dly.values(): output_df_dly = dask.delayed(self.decorate_process())(inputs_) - output_df_dly_per = output_df_dly.persist() + # output_df_dly_per = output_df_dly.persist() + output_df_dly_per = output_df_dly for oport in self._get_output_ports(): oport_out = dask.delayed(get_pout)(output_df_dly_per, oport) - outputs_dly.setdefault(oport, []).append(oport_out.persist()) + # outputs_dly.setdefault(oport, []).append(oport_out.persist()) + outputs_dly.setdefault(oport, []).append(oport_out) # DEBUGGING # print('OUTPUTS_DLY:\n{}'.format(outputs_dly)) @@ -489,7 +492,12 @@ def get_pout(out_dict, port): # self.uid, oport, port_type)) if any([issubclass(p_type, DaskDataFrame) for p_type in port_type]): - output_df[oport] = from_delayed(outputs_dly[oport]) + if self.infer_meta: + output_df[oport] = from_delayed(outputs_dly[oport]) + else: + meta_data = self.meta_setup().outports + output_df[oport] = from_delayed(outputs_dly[oport], + meta=meta_data[oport]) else: # outputs_dly[oport] is currently a list. Run compute on each # partition, and keep the first one. 
diff --git a/gquant/dataframe_flow/cache.py b/gQuant/gquant/gquant/dataframe_flow/cache.py similarity index 100% rename from gquant/dataframe_flow/cache.py rename to gQuant/gquant/gquant/dataframe_flow/cache.py diff --git a/gquant/dataframe_flow/node.py b/gQuant/gquant/gquant/dataframe_flow/node.py similarity index 99% rename from gquant/dataframe_flow/node.py rename to gQuant/gquant/gquant/dataframe_flow/node.py index 4f147cdc..778f482c 100644 --- a/gquant/dataframe_flow/node.py +++ b/gQuant/gquant/gquant/dataframe_flow/node.py @@ -92,6 +92,8 @@ def __init__(self, task): self.save = task.get(TaskSpecSchema.save, False) self.delayed_process = False + # eargerly infer the metadata, costly + self.infer_meta = True # customized the column setup self.init() self.profile = False # by default, do not profile diff --git a/gquant/dataframe_flow/portsSpecSchema.py b/gQuant/gquant/gquant/dataframe_flow/portsSpecSchema.py similarity index 100% rename from gquant/dataframe_flow/portsSpecSchema.py rename to gQuant/gquant/gquant/dataframe_flow/portsSpecSchema.py diff --git a/gquant/dataframe_flow/task.py b/gQuant/gquant/gquant/dataframe_flow/task.py similarity index 100% rename from gquant/dataframe_flow/task.py rename to gQuant/gquant/gquant/dataframe_flow/task.py diff --git a/gquant/dataframe_flow/taskGraph.py b/gQuant/gquant/gquant/dataframe_flow/taskGraph.py similarity index 99% rename from gquant/dataframe_flow/taskGraph.py rename to gQuant/gquant/gquant/dataframe_flow/taskGraph.py index 71da064e..dfb0ef9f 100644 --- a/gquant/dataframe_flow/taskGraph.py +++ b/gQuant/gquant/gquant/dataframe_flow/taskGraph.py @@ -1,5 +1,4 @@ from collections import OrderedDict -import networkx as nx import ruamel.yaml from .node import Node from ._node_flow import OUTPUT_ID, OUTPUT_TYPE, _CLEANUP @@ -317,6 +316,7 @@ def viz_graph(self, show_ports=False): ----- nx.DiGraph """ + import networkx as nx G = nx.DiGraph() # instantiate objects for itask in self: @@ -650,6 +650,7 @@ def run(self, 
outputs=None, replace=None, profile=False, formated=False): formated=formated) def to_pydot(self, show_ports=False): + import networkx as nx nx_graph = self.viz_graph(show_ports=show_ports) to_pydot = nx.drawing.nx_pydot.to_pydot pdot = to_pydot(nx_graph) diff --git a/gquant/dataframe_flow/taskSpecSchema.py b/gQuant/gquant/gquant/dataframe_flow/taskSpecSchema.py similarity index 100% rename from gquant/dataframe_flow/taskSpecSchema.py rename to gQuant/gquant/gquant/dataframe_flow/taskSpecSchema.py diff --git a/gquant/dataframe_flow/util.py b/gQuant/gquant/gquant/dataframe_flow/util.py similarity index 52% rename from gquant/dataframe_flow/util.py rename to gQuant/gquant/gquant/dataframe_flow/util.py index 0d332551..a48932d1 100644 --- a/gquant/dataframe_flow/util.py +++ b/gQuant/gquant/gquant/dataframe_flow/util.py @@ -1,24 +1,30 @@ import os import cloudpickle import base64 +import pathlib -def get_file_path(path): +def get_file_path(path: str) -> str: + """ + @path: the relative or absolute file path + returns: absolute file path + """ if path.startswith('/'): return path if 'GQUANTROOT' in os.environ: - ROOT = os.environ['GQUANTROOT'] + ROOT = pathlib.Path(os.environ['GQUANTROOT']) else: - ROOT = os.getcwd() + ROOT = pathlib.Path(os.getcwd()) if os.path.exists(path): return path - elif os.path.exists(ROOT+'/'+path): - return ROOT+'/'+path + path = pathlib.Path(path) + if (ROOT/path).absolute().parent.exists(): + return str(ROOT/path) else: print('current path', os.getcwd()) print('input path', path) print('cannot find the file') - raise Exception("File cannnot be found") + raise FileNotFoundError("File path cannnot be found") def get_encoded_class(classObj): diff --git a/gquant/flow.py b/gQuant/gquant/gquant/flow.py similarity index 100% rename from gquant/flow.py rename to gQuant/gquant/gquant/flow.py diff --git a/gquant/plugin_nodes/__init__.py b/gQuant/gquant/gquant/plugin_nodes/__init__.py similarity index 100% rename from gquant/plugin_nodes/__init__.py 
rename to gQuant/gquant/gquant/plugin_nodes/__init__.py diff --git a/gquant/plugin_nodes/util/__init__.py b/gQuant/gquant/gquant/plugin_nodes/util/__init__.py similarity index 100% rename from gquant/plugin_nodes/util/__init__.py rename to gQuant/gquant/gquant/plugin_nodes/util/__init__.py diff --git a/gquant/plugin_nodes/util/compositeNode.py b/gQuant/gquant/gquant/plugin_nodes/util/compositeNode.py similarity index 97% rename from gquant/plugin_nodes/util/compositeNode.py rename to gQuant/gquant/gquant/plugin_nodes/util/compositeNode.py index d99f260b..401c4106 100644 --- a/gquant/plugin_nodes/util/compositeNode.py +++ b/gQuant/gquant/gquant/plugin_nodes/util/compositeNode.py @@ -77,8 +77,11 @@ def _compute_hash_key(self): input_node = "" task_graph_obj = None if 'taskgraph' in self.conf: - task_graph = get_file_path(self.conf['taskgraph']) - if os.path.exists(task_graph): + try: + task_graph = get_file_path(self.conf['taskgraph']) + except FileNotFoundError: + task_graph = None + if task_graph is not None and os.path.exists(task_graph): with open(task_graph) as f: task_graph = hashlib.md5(f.read().encode()).hexdigest() task_graph_obj = TaskGraph.load_taskgraph( @@ -159,7 +162,7 @@ def ports_setup(self): return CACHE_PORTS[cache_key] inports = {} outports = {} - if 'taskgraph' in self.conf: + if task_graph: task_graph.build(replace=replacementObj) def inputNode_fun(inputNode, in_ports): @@ -191,7 +194,7 @@ def meta_setup(self): return CACHE_META[cache_key] required = {} out_meta = {} - if 'taskgraph' in self.conf: + if task_graph: task_graph.build(replace=replacementObj) def inputNode_fun(inputNode, in_ports): @@ -267,7 +270,7 @@ def conf_schema(self): "taskgraph": {"ui:widget": "TaskgraphSelector"}, "subnodes_conf": {} } - if 'taskgraph' in self.conf: + if task_graph: task_graph.build(replace=replacementObj) def inputNode_fun(inputNode, in_ports): @@ -295,7 +298,7 @@ def outNode_fun(outNode, out_ports): json['properties']['input']['items']['enum'] = in_ports 
json['properties']['output']['items']['enum'] = out_ports json['properties']['subnode_ids']['items']['enum'] = ids_in_graph - if 'subnode_ids' in self.conf: + if 'subnode_ids' in self.conf and task_graph: for subnodeId in self.conf['subnode_ids']: if subnodeId in task_graph: nodeObj = task_graph[subnodeId] diff --git a/gquant/plugin_nodes/util/contextCompositeNode.py b/gQuant/gquant/gquant/plugin_nodes/util/contextCompositeNode.py similarity index 100% rename from gquant/plugin_nodes/util/contextCompositeNode.py rename to gQuant/gquant/gquant/plugin_nodes/util/contextCompositeNode.py diff --git a/gquant/plugin_nodes/util/data_obj.py b/gQuant/gquant/gquant/plugin_nodes/util/data_obj.py similarity index 100% rename from gquant/plugin_nodes/util/data_obj.py rename to gQuant/gquant/gquant/plugin_nodes/util/data_obj.py diff --git a/gquant/plugin_nodes/util/json_util.py b/gQuant/gquant/gquant/plugin_nodes/util/json_util.py similarity index 100% rename from gquant/plugin_nodes/util/json_util.py rename to gQuant/gquant/gquant/plugin_nodes/util/json_util.py diff --git a/setup.cfg b/gQuant/gquant/setup.cfg similarity index 100% rename from setup.cfg rename to gQuant/gquant/setup.cfg diff --git a/setup.py b/gQuant/gquant/setup.py similarity index 82% rename from setup.py rename to gQuant/gquant/setup.py index 13a95c71..d2aa5894 100644 --- a/setup.py +++ b/gQuant/gquant/setup.py @@ -8,12 +8,13 @@ # Get the long description from the README file long_description = (here / 'README.md').read_text(encoding='utf-8') -install_requires = ['dask', 'configparser', 'cloudpickle', 'PyYaml', - 'jsonpath_ng'] +install_requires = ['dask[distributed]', 'dask[dataframe]', 'configparser', + 'cloudpickle', 'PyYaml', + 'jsonpath_ng', 'ruamel.yaml', 'pandas'] setup( name='gquant', - version='1.0.1', + version='1.0.2', description='gquant - RAPIDS Financial Services Algorithms', long_description=long_description, long_description_content_type='text/markdown', diff --git 
a/notebooks/cuIndicator/viz/__init__.py b/gQuant/gquant/tests/__init__.py similarity index 100% rename from notebooks/cuIndicator/viz/__init__.py rename to gQuant/gquant/tests/__init__.py diff --git a/tests/__init__.py b/gQuant/gquant/tests/unit/__init__.py similarity index 100% rename from tests/__init__.py rename to gQuant/gquant/tests/unit/__init__.py diff --git a/gQuant/gquant/tests/unit/custom_port_nodes.py b/gQuant/gquant/tests/unit/custom_port_nodes.py new file mode 100644 index 00000000..db7e9f6d --- /dev/null +++ b/gQuant/gquant/tests/unit/custom_port_nodes.py @@ -0,0 +1,220 @@ +import math +import numpy as np +import dask +import pandas as pd +from gquant.dataframe_flow import Node, MetaData +from gquant.dataframe_flow import NodePorts, PortsSpecSchema +from gquant.dataframe_flow import ConfSchema +import os +import warnings + + +class _PortTypesMixin(object): + + def load_cache(self, filename=None) -> dict: + """ + Defines the behavior of how to load the cache file from the `filename`. + Node can override this method. Default implementation assumes cudf + dataframes. + + Arguments + ------- + filename: str + filename of the cache file. Leave as none to use default. + returns: dict + dictionary of the output from this node + """ + cache_dir = os.getenv('GQUANT_CACHE_DIR', self.cache_dir) + if filename is None: + filename = cache_dir + '/' + self.uid + '.hdf5' + + output_df = {} + with pd.HDFStore(filename, mode='r') as hf: + for oport, pspec in \ + self._get_output_ports(full_port_spec=True).items(): + ptype = pspec.get(PortsSpecSchema.port_type) + if self.outport_connected(oport): + ptype = ([ptype] if not isinstance(ptype, + list) else ptype) + key = '{}/{}'.format(self.uid, oport) + # check hdf store for the key + if key not in hf: + raise Exception( + 'The task "{}" port "{}" key "{}" not found in' + 'the hdf file "{}". Cannot load from cache.' 
+ .format(self.uid, oport, key, filename) + ) + if pd.DataFrame not in ptype: + warnings.warn( + RuntimeWarning, + 'Task "{}" port "{}" port type is not set to ' + 'cudf.DataFrame. Attempting to load port data ' + 'with cudf.read_hdf.'.format(self.uid, oport)) + output_df[oport] = pd.read_hdf(hf, key) + return output_df + + def save_cache(self, output_data: dict): + '''Defines the behavior for how to save the output of a node to + filesystem cache. Default implementation assumes cudf dataframes. + + :param output_data: The output from :meth:`process`. For saving to hdf + requires that the dataframe(s) have `to_hdf` method. + ''' + cache_dir = os.getenv('GQUANT_CACHE_DIR', self.cache_dir) + os.makedirs(cache_dir, exist_ok=True) + filename = cache_dir + '/' + self.uid + '.hdf5' + with pd.HDFStore(filename, mode='w') as hf: + for oport, odf in output_data.items(): + # check for to_hdf attribute + if not hasattr(odf, 'to_hdf'): + raise Exception( + 'Task "{}" port "{}" output object is missing ' + '"to_hdf" attribute. Cannot save to cache.' + .format(self.uid, oport)) + + dtype = '{}'.format(type(odf)).lower() + if 'dataframe' not in dtype: + warnings.warn( + RuntimeWarning, + 'Task "{}" port "{}" port type is not a dataframe.' + ' Attempting to save to hdf with "to_hdf" method.' 
+ .format(self.uid, oport)) + key = '{}/{}'.format(self.uid, oport) + odf.to_hdf(hf, key, format='table', data_columns=True) + + +class PointNode(_PortTypesMixin, Node): + + def ports_setup(self): + input_ports = {} + output_ports = { + 'points_df_out': { + PortsSpecSchema.port_type: pd.DataFrame + } + } + return NodePorts(inports=input_ports, outports=output_ports) + + def conf_schema(self): + json = { + "title": "PointNode configure", + "type": "object", + "properties": { + "npts": { + "type": "number", + "description": "number of data points", + "minimum": 10 + } + }, + "required": ["npts"], + } + + ui = { + "npts": {"ui:widget": "updown"} + } + return ConfSchema(json=json, ui=ui) + + def init(self): + pass + + def meta_setup(self): + columns_out = { + 'points_df_out': { + 'x': 'float64', + 'y': 'float64' + }, + 'points_ddf_out': { + 'x': 'float64', + 'y': 'float64' + } + } + return MetaData(inports={}, outports=columns_out) + + def process(self, inputs): + npts = self.conf['npts'] + seed = self.conf.get('nseed') + if seed is not None: + np.random.seed(seed) + df = pd.DataFrame() + df['x'] = np.random.rand(npts) + df['y'] = np.random.rand(npts) + output = {} + if self.outport_connected('points_df_out'): + output.update({'points_df_out': df}) + return output + + +class DistanceNode(_PortTypesMixin, Node): + + def ports_setup(self): + port_type = PortsSpecSchema.port_type + input_ports = { + 'points_df_in': { + port_type: [pd.DataFrame] + } + } + + output_ports = { + 'distance_df': { + port_type: [pd.DataFrame] + }, + 'distance_abs_df': { + PortsSpecSchema.port_type: [pd.DataFrame] + } + } + input_connections = self.get_connected_inports() + if 'points_df_in' in input_connections: + types = input_connections['points_df_in'] + # connected, use the types passed in from parent + return NodePorts(inports={'points_df_in': {port_type: types}}, + outports={'distance_df': {port_type: types}, + 'distance_abs_df': {port_type: types}, + }) + else: + return 
NodePorts(inports=input_ports, outports=output_ports) + + def conf_schema(self): + return ConfSchema() + + def init(self): + self.delayed_process = True + + def meta_setup(self): + req_cols = { + 'x': 'float64', + 'y': 'float64' + } + required = { + 'points_df_in': req_cols, + } + input_meta = self.get_input_meta() + output_cols = ({ + 'distance_df': { + 'distance_df': 'float64', + 'x': 'float64', + 'y': 'float64' + }, + 'distance_abs_df': { + 'distance_abs_df': 'float64', + 'x': 'float64', + 'y': 'float64' + } + }) + if 'points_df_in' in input_meta: + col_from_inport = input_meta['points_df_in'] + # additional ports + output_cols['distance_df'].update(col_from_inport) + output_cols['distance_abs_df'].update(col_from_inport) + return MetaData(inports=required, outports=output_cols) + + def process(self, inputs): + df = inputs['points_df_in'] + output = {} + if self.outport_connected('distance_df'): + copy_df = df.copy() + copy_df['distance_df'] = np.sqrt((df['x'] ** 2 + df['y'] ** 2)) + output.update({'distance_df': copy_df}) + if self.outport_connected('distance_abs_df'): + copy_df = df.copy() + copy_df['distance_abs_df'] = np.abs(df['x']) + np.abs(df['y']) + output.update({'distance_abs_df': copy_df}) + return output diff --git a/tests/unit/test_node_api.py b/gQuant/gquant/tests/unit/test_node_api.py similarity index 90% rename from tests/unit/test_node_api.py rename to gQuant/gquant/tests/unit/test_node_api.py index 18b80d1d..ae6b0053 100644 --- a/tests/unit/test_node_api.py +++ b/gQuant/gquant/tests/unit/test_node_api.py @@ -62,16 +62,6 @@ def setUp(self): self.distance_task = Task(distance_task_spec) - points_noports_task_spec = { - TaskSpecSchema.task_id: 'points_noport_task', - TaskSpecSchema.node_type: 'PointNoPortsNode', - TaskSpecSchema.filepath: custom_module, - TaskSpecSchema.conf: {'npts': 1000}, - TaskSpecSchema.inputs: {} - } - - self.points_noports_task = Task(points_noports_task_spec) - def tearDown(self): pass diff --git 
a/tests/unit/test_node_taskgraph_typechecking.py b/gQuant/gquant/tests/unit/test_node_taskgraph_typechecking.py similarity index 100% rename from tests/unit/test_node_taskgraph_typechecking.py rename to gQuant/gquant/tests/unit/test_node_taskgraph_typechecking.py diff --git a/tests/unit/test_taskgraph_api.py b/gQuant/gquant/tests/unit/test_taskgraph_api.py similarity index 88% rename from tests/unit/test_taskgraph_api.py rename to gQuant/gquant/tests/unit/test_taskgraph_api.py index 6aa5902f..4864e45c 100644 --- a/tests/unit/test_taskgraph_api.py +++ b/gQuant/gquant/tests/unit/test_taskgraph_api.py @@ -27,7 +27,7 @@ from io import StringIO import warnings import unittest - +import pandas as pd from gquant.dataframe_flow import (TaskSpecSchema, TaskGraph) from gquant.dataframe_flow.task import DEFAULT_MODULE # noqa: F401 from gquant.dataframe_flow import Node @@ -44,7 +44,7 @@ conf: npts: 1000 inputs: [] -- id: distance_by_cudf +- id: distance_by_df type: DistanceNode conf: {} inputs: @@ -55,10 +55,9 @@ class TestTaskGraphAPI(unittest.TestCase): def setUp(self): import gc # python garbage collector - import cudf # warmup - s = cudf.Series([1, 2, 3, None, 4], nan_as_null=False) + s = pd.Series([1, 2, 3, None, 4]) del(s) gc.collect() @@ -72,7 +71,7 @@ def setUp(self): } distance_task_spec = { - TaskSpecSchema.task_id: 'distance_by_cudf', + TaskSpecSchema.task_id: 'distance_by_df', TaskSpecSchema.node_type: 'DistanceNode', TaskSpecSchema.conf: {}, TaskSpecSchema.inputs: { @@ -100,14 +99,12 @@ def test_viz_graph(self): ''' nx_graph = self.tgraph.viz_graph(show_ports=True) nx_nodes = ['points_task', 'points_task.points_df_out', - 'points_task.points_ddf_out', - 'distance_by_cudf', 'distance_by_cudf.distance_df', - 'distance_by_cudf.distance_abs_df'] + 'distance_by_df', 'distance_by_df.distance_df', + 'distance_by_df.distance_abs_df'] nx_edges = [('points_task', 'points_task.points_df_out'), - ('points_task', 'points_task.points_ddf_out'), - ('points_task.points_df_out', 
'distance_by_cudf'), - ('distance_by_cudf', 'distance_by_cudf.distance_df'), - ('distance_by_cudf', 'distance_by_cudf.distance_abs_df')] + ('points_task.points_df_out', 'distance_by_df'), + ('distance_by_df', 'distance_by_df.distance_df'), + ('distance_by_df', 'distance_by_df.distance_abs_df')] self.assertEqual(list(nx_graph.nodes), nx_nodes) self.assertEqual(list(nx_graph.edges), nx_edges) @@ -119,7 +116,7 @@ def test_build(self): self.tgraph.build() points_node = self.tgraph['points_task'] - distance_node = self.tgraph['distance_by_cudf'] + distance_node = self.tgraph['distance_by_df'] onode_info = { 'to_node': distance_node, @@ -147,10 +144,10 @@ def test_build(self): } self.assertEqual(inode_in_cols, distance_node.get_input_meta()) - inode_out_cols = {'distance_df': {'distance_cudf': 'float64', + inode_out_cols = {'distance_df': {'distance_df': 'float64', 'x': 'float64', 'y': 'float64'}, - 'distance_abs_df': {'distance_abs_cudf': 'float64', + 'distance_abs_df': {'distance_abs_df': 'float64', 'x': 'float64', 'y': 'float64'}} self.assertEqual(inode_out_cols, distance_node.meta_setup().outports) @@ -158,7 +155,7 @@ def test_build(self): def test_run(self): '''Test that a taskgraph can run successfully. ''' - outlist = ['distance_by_cudf.distance_df'] + outlist = ['distance_by_df.distance_df'] # Using numpy random seed to get repeatable and deterministic results. # For seed 2335 should get something around 761.062831178. replace_spec = { @@ -169,9 +166,9 @@ def test_run(self): } } } - (dist_df_w_cudf, ) = self.tgraph.run( + (dist_df_w_df, ) = self.tgraph.run( outputs=outlist, replace=replace_spec) - dist_sum = dist_df_w_cudf['distance_cudf'].sum() + dist_sum = dist_df_w_df['distance_df'].sum() # self.assertAlmostEqual(dist_sum, 0.0, places, msg, delta) self.assertAlmostEqual(dist_sum, 761.062831178) # match to 7 places @@ -243,7 +240,7 @@ def test_save_load_cache(self): 2. Load points_task df from cache when running the taskgraph. 
''' replace_spec = {'points_task': {TaskSpecSchema.save: True}} - outlist = ['distance_by_cudf.distance_df'] + outlist = ['distance_by_df.distance_df'] with warnings.catch_warnings(): # ignore UserWarning: Using CPU via Pandas to write HDF dataset diff --git a/tests/unit/test_workflow_serialization.py b/gQuant/gquant/tests/unit/test_workflow_serialization.py similarity index 100% rename from tests/unit/test_workflow_serialization.py rename to gQuant/gquant/tests/unit/test_workflow_serialization.py diff --git a/tests/unit/utils.py b/gQuant/gquant/tests/unit/utils.py similarity index 100% rename from tests/unit/utils.py rename to gQuant/gquant/tests/unit/utils.py diff --git a/gquantlab/.eslintignore b/gQuant/gquantlab/.eslintignore similarity index 100% rename from gquantlab/.eslintignore rename to gQuant/gquantlab/.eslintignore diff --git a/gquantlab/.eslintrc.js b/gQuant/gquantlab/.eslintrc.js similarity index 100% rename from gquantlab/.eslintrc.js rename to gQuant/gquantlab/.eslintrc.js diff --git a/gquantlab/.gitignore b/gQuant/gquantlab/.gitignore similarity index 96% rename from gquantlab/.gitignore rename to gQuant/gquantlab/.gitignore index c0a8535a..232ad412 100644 --- a/gquantlab/.gitignore +++ b/gQuant/gquantlab/.gitignore @@ -4,8 +4,8 @@ node_modules/ *.egg-info/ .ipynb_checkpoints *.tsbuildinfo +gquantlab/labextension -*/labextension/*.tgz # Created by https://www.gitignore.io/api/python # Edit at https://www.gitignore.io/?templates=python @@ -107,3 +107,6 @@ dmypy.json .pyre/ # End of https://www.gitignore.io/api/python + +# OSX files +.DS_Store diff --git a/gquantlab/.prettierignore b/gQuant/gquantlab/.prettierignore similarity index 100% rename from gquantlab/.prettierignore rename to gQuant/gquantlab/.prettierignore diff --git a/gquantlab/.prettierrc b/gQuant/gquantlab/.prettierrc similarity index 100% rename from gquantlab/.prettierrc rename to gQuant/gquantlab/.prettierrc diff --git a/gQuant/gquantlab/LICENSE b/gQuant/gquantlab/LICENSE new file 
mode 100644 index 00000000..18bcb431 --- /dev/null +++ b/gQuant/gquantlab/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018 NVIDIA Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/gquantlab/MANIFEST.in b/gQuant/gquantlab/MANIFEST.in similarity index 100% rename from gquantlab/MANIFEST.in rename to gQuant/gquantlab/MANIFEST.in diff --git a/gquantlab/README.md b/gQuant/gquantlab/README.md similarity index 53% rename from gquantlab/README.md rename to gQuant/gquantlab/README.md index 45f42c0a..28c7ae32 100644 --- a/gquantlab/README.md +++ b/gQuant/gquantlab/README.md @@ -2,7 +2,18 @@ ![Github Actions Status](https://github.com/rapidsai/gQuant/gquantlab/workflows/Build/badge.svg) -gQuant Jupyterlab extension +## gQuant jupyterlab extension +The gQuant Juyterlab extension provides the user interface to build the dataframe flow TaskGraph easily. 
It takes advantage of the open sources projects like [jupyterlab](https://github.com/jupyterlab/jupyterlab), [ipywidget](https://github.com/jupyter-widgets/ipywidgets), [React](https://reactjs.org/) and [D3](https://d3js.org/). It features: +1. Takes full advantage of the JupyterLab project that the extension adds commands to Jupyterlab context menu, command palette and bind them with keyboard shortcuts to speed up the productivity. +2. Define a new TaskGraph file format `.gq.yaml` that can be edited in the Jupyterlab. +3. Visually presents the TaskGraph as a DAG graph. Users can zoom in and out, freely move the nodes around, and make connections between nodes. +4. Use the special `Ouput Collector` to gather the results and organize them in a tab widget. The IPython [rich display](https://ipython.readthedocs.io/en/stable/config/integrating.html#rich-display) is fully supported. +5. Visually shows the progress of graph evaluation and computation dependence. +6. Automatically generate the UI elements to edit and validate the Node configuration given the configuration JSON schema. It exposes the function API in a user-friendly way. User can change the configuration and re-run the computation to test out the hyperparameters easily. +7. Dynamically compute the input-output ports compatibility, dataframe columns names and types, ports types to prevent connection errors. +8. Nodes can have multiple output ports that can be used to generate different output types. E.g. some data loader Node provides both `cudf` and `dask_cudf` output ports. The multiple GPUs distributed computation computation is automatically enabled by switching to the `dask_cudf` output port. +9. Provides the standard API to extend your computation Nodes. +10. The composite node can encapsulate the TaskGraph into a single node for easy reuse. The composite node can be exported as a regular gQuant node without any coding. 
This extension is composed of a Python package named `gquantlab` @@ -29,11 +40,10 @@ Set the gQuant path as the folder to start the development or you can open the ## Install -Note: You will need NodeJS to install the extension. +Note: You will need NodeJS of version 12^14^15 to install the extension. ```bash pip install gquantlab -jupyter lab build ``` ## Troubleshoot @@ -105,24 +115,3 @@ Now every change will be built locally and bundled into JupyterLab. Be sure to r pip uninstall gquantlab jupyter labextension uninstall gquantlab ``` - -### Start the JupyterLab - -Once the gquantlab plugin is install, the jupyterlab can be started. There is -one important environment to consider before starting. The custom module files -are specified in the `gquantrc` file. You can find an example `gquantrc` file in -the gQuant root directory. `gquantrc` file is by default is read at the same location -as the jupyterlab server's root directory. However, this can be overwirtten by -setting the `GQUANT_CONFIG` environment variable. In the example `gquantrc`, system -environment variable `MODULEPATH` is used to point to the paths of the module files. -To start the jupyterlab, please make sure `MODULEPATH` is set properly. - -For example, if you want to start the jupyterlab in the gQuant root directory. -```bash -MODULEPATH=$PWD/modules jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' -``` - -Or, if you want to start the jupyterlab in the gquantlab directory. 
-```bash -GQUANT_CONFIG=../gquantrc MODULEPATH=$PWD/../modules jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' -``` diff --git a/gQuant/gquantlab/gquantlab/__init__.py b/gQuant/gquantlab/gquantlab/__init__.py new file mode 100644 index 00000000..586b11dd --- /dev/null +++ b/gQuant/gquantlab/gquantlab/__init__.py @@ -0,0 +1,39 @@ + +import json +import os.path as osp + +from ._version import __version__ + +HERE = osp.abspath(osp.dirname(__file__)) + +with open(osp.join(HERE, 'labextension', 'package.json')) as fid: + data = json.load(fid) + +def _jupyter_labextension_paths(): + return [{ + 'src': 'labextension', + 'dest': data['name'] + }] + + + +from .handlers import setup_handlers + + +def _jupyter_server_extension_points(): + return [{ + "module": "gquantlab" + }] + + +def _load_jupyter_server_extension(server_app): + """Registers the API handler to receive HTTP requests from the frontend extension. + + Parameters + ---------- + lab_app: jupyterlab.labapp.LabApp + JupyterLab application instance + """ + setup_handlers(server_app.web_app) + server_app.log.info("Registered gQuantLab extension at URL path /gquantlab") + diff --git a/gquantlab/gquantlab/_frontend.py b/gQuant/gquantlab/gquantlab/_frontend.py similarity index 100% rename from gquantlab/gquantlab/_frontend.py rename to gQuant/gquantlab/gquantlab/_frontend.py diff --git a/gQuant/gquantlab/gquantlab/_version.py b/gQuant/gquantlab/gquantlab/_version.py new file mode 100644 index 00000000..1adc2b45 --- /dev/null +++ b/gQuant/gquantlab/gquantlab/_version.py @@ -0,0 +1,19 @@ +__all__ = ['__version__'] + +def _fetchVersion(): + import json + import os + + HERE = os.path.abspath(os.path.dirname(__file__)) + + for d, _, _ in os.walk(HERE): + try: + with open(os.path.join(d, 'package.json')) as f: + return json.load(f)['version'] + except FileNotFoundError: + pass + + raise FileNotFoundError('Could not find package.json under dir {}'.format(HERE)) + +__version__ = _fetchVersion() + diff 
--git a/gquantlab/gquantlab/gquantmodel.py b/gQuant/gquantlab/gquantlab/gquantmodel.py similarity index 100% rename from gquantlab/gquantlab/gquantmodel.py rename to gQuant/gquantlab/gquantlab/gquantmodel.py diff --git a/gquantlab/gquantlab/handlers.py b/gQuant/gquantlab/gquantlab/handlers.py similarity index 92% rename from gquantlab/gquantlab/handlers.py rename to gQuant/gquantlab/gquantlab/handlers.py index 4aa73bbf..5639b874 100644 --- a/gquantlab/gquantlab/handlers.py +++ b/gQuant/gquantlab/gquantlab/handlers.py @@ -1,7 +1,6 @@ import json - -from notebook.base.handlers import APIHandler -from notebook.utils import url_path_join +from jupyter_server.base.handlers import APIHandler +from jupyter_server.utils import url_path_join import tornado from gquant.dataframe_flow import TaskGraph from .server_utils import (get_nodes, add_nodes) @@ -61,12 +60,14 @@ def get(self): val_dict = getattr(client_mod, 'validation') client_info['validation'].update(val_dict) else: - print(client_mod, 'no validation') + pass + # print(client_mod, 'no validation') if hasattr(client_mod, 'display'): val_dict = getattr(client_mod, 'display') client_info['display'].update(val_dict) else: - print(client_mod, 'no display') + pass + # print(client_mod, 'no display') # else: # print(key, mod.mod, 'no client') @@ -78,12 +79,14 @@ def get(self): val_dict = getattr(client_mod, 'validation') client_info['validation'].update(val_dict) else: - print(client_mod, 'no validation') + pass + # print(client_mod, 'no validation') if hasattr(client_mod, 'display'): val_dict = getattr(client_mod, 'display') client_info['display'].update(val_dict) else: - print(client_mod, 'no display') + pass + # print(client_mod, 'no display') self.finish(json.dumps(client_info)) diff --git a/gquantlab/gquantlab/server_utils.py b/gQuant/gquantlab/gquantlab/server_utils.py similarity index 100% rename from gquantlab/gquantlab/server_utils.py rename to gQuant/gquantlab/gquantlab/server_utils.py diff --git 
a/gQuant/gquantlab/jupyter-config/gquantlab.json b/gQuant/gquantlab/jupyter-config/gquantlab.json new file mode 100644 index 00000000..77c32881 --- /dev/null +++ b/gQuant/gquantlab/jupyter-config/gquantlab.json @@ -0,0 +1,7 @@ +{ + "ServerApp": { + "jpserver_extensions": { + "gquantlab": true + } + } +} diff --git a/gquantlab/notebooks/Empty.ipynb b/gQuant/gquantlab/notebooks/Empty.ipynb similarity index 100% rename from gquantlab/notebooks/Empty.ipynb rename to gQuant/gquantlab/notebooks/Empty.ipynb diff --git a/gquantlab/notebooks/full_example.ipynb b/gQuant/gquantlab/notebooks/full_example.ipynb similarity index 100% rename from gquantlab/notebooks/full_example.ipynb rename to gQuant/gquantlab/notebooks/full_example.ipynb diff --git a/gquantlab/notebooks/test.ipynb b/gQuant/gquantlab/notebooks/test.ipynb similarity index 100% rename from gquantlab/notebooks/test.ipynb rename to gQuant/gquantlab/notebooks/test.ipynb diff --git a/gquantlab/notebooks/test_dask.ipynb b/gQuant/gquantlab/notebooks/test_dask.ipynb similarity index 100% rename from gquantlab/notebooks/test_dask.ipynb rename to gQuant/gquantlab/notebooks/test_dask.ipynb diff --git a/gquantlab/package.json b/gQuant/gquantlab/package.json similarity index 52% rename from gquantlab/package.json rename to gQuant/gquantlab/package.json index 613a5111..8de8e324 100644 --- a/gquantlab/package.json +++ b/gQuant/gquantlab/package.json @@ -1,6 +1,6 @@ { "name": "gquantlab", - "version": "0.1.2", + "version": "1.0.0", "description": "gQuant Jupyterlab extension", "keywords": [ "jupyter", @@ -28,52 +28,60 @@ "url": "https://github.com/rapidsai/gQuant.git" }, "scripts": { - "build": "jlpm run build:lib", - "build:labextension": "cd gquantlab && rimraf labextension && mkdirp labextension && cd labextension && npm pack ../..", - "build:lib": "tsc", + "build": "jlpm run build:lib && jlpm run build:labextension:dev", "build:all": "jlpm run build:labextension", + "build:labextension": "jupyter labextension build .", + 
"build:labextension:dev": "jupyter labextension build --development True .", + "build:lib": "tsc", + "build:prod": "jlpm run build:lib && jlpm run build:labextension", "clean": "jlpm run clean:lib", - "clean:lib": "rimraf lib tsconfig.tsbuildinfo", - "clean:labextension": "rimraf gquantlab/labextension", "clean:all": "jlpm run clean:lib && jlpm run clean:labextension", + "clean:labextension": "rimraf gquantlab/labextension", + "clean:lib": "rimraf lib tsconfig.tsbuildinfo", "eslint": "eslint . --ext .ts,.tsx --fix", "eslint:check": "eslint . --ext .ts,.tsx", - "prepare": "jlpm run clean && jlpm run build", - "watch": "tsc -w" + "install:extension": "jupyter labextension develop --overwrite .", + "prepare": "jlpm run clean && jlpm run build:prod", + "watch": "run-p watch:src watch:labextension", + "watch:labextension": "jupyter labextension watch .", + "watch:src": "tsc -w" }, "dependencies": { "@emotion/core": "^10.0.28", "@emotion/styled": "^10.0.27", - "@jupyter-widgets/base": "^3.0.0", - "@jupyterlab/application": "^2.0.0", - "@jupyterlab/cells": "^2.2.0", - "@jupyterlab/coreutils": "^4.0.0", - "@jupyterlab/docregistry": "^2.1.1", - "@jupyterlab/filebrowser": "^2.1.1", - "@jupyterlab/launcher": "^2.1.1", - "@jupyterlab/mainmenu": "^2.1.1", - "@jupyterlab/notebook": "^2.2.0", - "@jupyterlab/services": "^5.0.0", - "@jupyterlab/ui-components": "^2.1.1", - "@lumino/coreutils": "^1.5.2", - "@lumino/signaling": "^1.4.2", - "@lumino/widgets": "^1.13.2", + "@jupyter-widgets/base": "^4.0.0", + "@jupyterlab/application": "^3.0.0", + "@jupyterlab/cells": "^3.0.0", + "@jupyterlab/coreutils": "^5.0.0", + "@jupyterlab/docregistry": "^3.0.0", + "@jupyterlab/filebrowser": "^3.0.0", + "@jupyterlab/launcher": "^3.0.0", + "@jupyterlab/mainmenu": "^3.0.0", + "@jupyterlab/notebook": "^3.0.0", + "@jupyterlab/services": "^6.0.0", + "@jupyterlab/ui-components": "^3.0.0", + "@lumino/coreutils": "^1.5.3", + "@lumino/signaling": "^1.4.3", + "@lumino/widgets": "^1.16.1", "@rjsf/core": 
"^2.3.0", "@types/d3": "^5.7.2", + "@types/js-yaml": "^3.12.5", "bootstrap": "^4.5.0", "d3": "^5.16.0", - "d3-dag": "^0.4.0" + "d3-dag": "^0.6.0" }, "devDependencies": { - "@typescript-eslint/eslint-plugin": "^2.25.0", - "@typescript-eslint/parser": "^2.25.0", - "eslint": "^6.8.0", + "@jupyterlab/builder": "^3.0.0-rc.13", + "@typescript-eslint/eslint-plugin": "^2.27.0", + "@typescript-eslint/parser": "^2.27.0", + "eslint": "^7.5.0", "eslint-config-prettier": "^6.10.1", "eslint-plugin-prettier": "^3.1.2", "mkdirp": "^1.0.3", - "prettier": "1.16.4", - "rimraf": "^2.6.1", - "typescript": "~3.7.0" + "npm-run-all": "^4.1.5", + "prettier": "^1.19.0", + "rimraf": "^3.0.2", + "typescript": "~4.1.3" }, "sideEffects": [ "style/*.css" @@ -89,6 +97,7 @@ } } }, - "extension": true + "extension": true, + "outputDir": "gquantlab/labextension" } } diff --git a/gQuant/gquantlab/pyproject.toml b/gQuant/gquantlab/pyproject.toml new file mode 100644 index 00000000..5f536843 --- /dev/null +++ b/gQuant/gquantlab/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["jupyter_packaging~=0.7.9", "jupyterlab>=3.0.0rc13,==3.*", "setuptools>=40.8.0", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/gquantlab/setup.py b/gQuant/gquantlab/setup.py similarity index 58% rename from gquantlab/setup.py rename to gQuant/gquantlab/setup.py index 74157e5c..63dcc384 100644 --- a/gquantlab/setup.py +++ b/gQuant/gquantlab/setup.py @@ -1,11 +1,12 @@ """ -Setup Module to setup Python Handlers for the gquantlab extension. 
+gquantlab setup """ +import json import os from jupyter_packaging import ( create_cmdclass, install_npm, ensure_targets, - combine_commands, ensure_python, get_version, + combine_commands, skip_if_exists ) import setuptools @@ -14,17 +15,15 @@ # The name of the project name="gquantlab" -# Ensure a valid python version -ensure_python(">=3.5") - # Get our version -version = get_version(os.path.join(name, "_version.py")) +with open(os.path.join(HERE, 'package.json')) as f: + version = json.load(f)['version'] lab_path = os.path.join(HERE, name, "labextension") # Representative files that should exist after a successful build jstargets = [ - os.path.join(HERE, "lib", "gquantlab.js"), + os.path.join(lab_path, "package.json"), ] package_data_spec = { @@ -33,47 +32,58 @@ ] } +labext_name = "gquantlab" + data_files_spec = [ - ("share/jupyter/lab/extensions", lab_path, "*.tgz"), - ("etc/jupyter/jupyter_notebook_config.d", + ("share/jupyter/labextensions/%s" % labext_name, lab_path, "**"), + ("share/jupyter/labextensions/%s" % labext_name, HERE, "install.json"),("etc/jupyter/jupyter_server_config.d", "jupyter-config", "gquantlab.json"), + ] -cmdclass = create_cmdclass("jsdeps", +cmdclass = create_cmdclass("jsdeps", package_data_spec=package_data_spec, data_files_spec=data_files_spec ) -cmdclass["jsdeps"] = combine_commands( - install_npm(HERE, build_cmd="build:all", npm=["jlpm"]), +js_command = combine_commands( + install_npm(HERE, build_cmd="build:prod", npm=["jlpm"]), ensure_targets(jstargets), ) +is_repo = os.path.exists(os.path.join(HERE, ".git")) +if is_repo: + cmdclass["jsdeps"] = js_command +else: + cmdclass["jsdeps"] = skip_if_exists(jstargets, js_command) + with open("README.md", "r") as fh: long_description = fh.read() setup_args = dict( name=name, version=version, - url="https://github.com/rapidsai/gQuant/gquantlab", - author="Yi", + url="https://github.com/rapidsai/gQuant.git", + author="{'name': 'Yi Dong', 'email': 'doyend@gmail.com'}", description="gQuant 
Jupyterlab extension", long_description= long_description, long_description_content_type="text/markdown", cmdclass= cmdclass, packages=setuptools.find_packages(), install_requires=[ - "jupyterlab~=2.0", + "jupyterlab>=3.0.0rc13,==3.*", + "ipywidgets", ], zip_safe=False, include_package_data=True, + python_requires=">=3.6", license="Apache", platforms="Linux, Mac OS X, Windows", - keywords=["Jupyter", "JupyterLab"], + keywords=["Jupyter", "JupyterLab", "JupyterLab3"], classifiers=[ + "License :: OSI Approved :: Apache", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", diff --git a/gquantlab/src/EditorPanel.ts b/gQuant/gquantlab/src/EditorPanel.ts similarity index 100% rename from gquantlab/src/EditorPanel.ts rename to gQuant/gquantlab/src/EditorPanel.ts diff --git a/gquantlab/src/FilePathSelector.tsx b/gQuant/gquantlab/src/FilePathSelector.tsx similarity index 100% rename from gquantlab/src/FilePathSelector.tsx rename to gQuant/gquantlab/src/FilePathSelector.tsx diff --git a/gquantlab/src/chart.tsx b/gQuant/gquantlab/src/chart.tsx similarity index 100% rename from gquantlab/src/chart.tsx rename to gQuant/gquantlab/src/chart.tsx diff --git a/gquantlab/src/chartEngine.tsx b/gQuant/gquantlab/src/chartEngine.tsx similarity index 97% rename from gquantlab/src/chartEngine.tsx rename to gQuant/gquantlab/src/chartEngine.tsx index 494c3309..aff523e8 100644 --- a/gquantlab/src/chartEngine.tsx +++ b/gQuant/gquantlab/src/chartEngine.tsx @@ -1,6 +1,7 @@ import React from 'react'; -import { dagStratify, sugiyama, layeringSimplex, decrossOpt, coordVert } from 'd3-dag'; -import YAML from 'yaml'; +import { dagStratify, sugiyama, layeringLongestPath, decrossTwoLayer, coordCenter } from 'd3-dag'; +//import YAML from 'yaml'; +import jsyaml from 'js-yaml'; import { IEdge, INode, ContentHandler, 
IChartInput } from './document'; // eslint-disable-next-line @typescript-eslint/no-unused-vars @@ -388,9 +389,9 @@ export class ChartEngine extends React.Component { const dagData = dagStratify()(data); sugiyama() .size([height ? height : DefaultHeight, width ? width : DefaultWidth]) - .layering(layeringSimplex()) - .decross(decrossOpt()) - .coord(coordVert())(dagData); + .layering(layeringLongestPath()) + .decross(decrossTwoLayer()) + .coord(coordCenter())(dagData); // set the coordinates dagData.descendants().forEach((d: any) => { if (transform) { @@ -498,7 +499,7 @@ export class ChartEngine extends React.Component { this.props.contentHandler.privateCopy.save(); console.log('edges:', state.edges.length, 'nodes:', state.nodes.length); } - const yamlText = YAML.stringify(output); + const yamlText = jsyaml.safeDump(output); this.props.contentHandler.update(yamlText); } if (update) { diff --git a/gquantlab/src/commands.ts b/gQuant/gquantlab/src/commands.ts similarity index 99% rename from gquantlab/src/commands.ts rename to gQuant/gquantlab/src/commands.ts index 67ba055f..7f6b5c3d 100644 --- a/gquantlab/src/commands.ts +++ b/gQuant/gquantlab/src/commands.ts @@ -1,6 +1,7 @@ /* eslint-disable @typescript-eslint/camelcase */ import { JupyterFrontEnd } from '@jupyterlab/application'; -import YAML from 'yaml'; +//import YAML from 'yaml'; +import jsyaml from 'js-yaml'; import { CommandRegistry } from '@lumino/commands'; import { gqIcon, @@ -137,7 +138,7 @@ export function setupCommands( }); const mainView = getMainView(); const obj = mainView.contentHandler.privateCopy.get('value'); - model.content = YAML.stringify(obj); + model.content = jsyaml.safeDump(obj); model.format = 'text'; app.serviceManager.contents.save(model.path, model); }; @@ -463,7 +464,7 @@ class ${args['nodeName']}(gquant.plugin_nodes.util.CompositeNode): if (isGquantVisible()) { mainView = app.shell.currentWidget as any; objStr = JSON.stringify( - 
YAML.parse(mainView.contentHandler.context.model.toString()), + jsyaml.safeLoad(mainView.contentHandler.context.model.toString()), null, 2 ); @@ -630,7 +631,7 @@ export function setupToolBarCommands( ext: '.gq.yaml' }); const obj = contentHandler.privateCopy.get('value'); - model.content = YAML.stringify(obj); + model.content = jsyaml.safeDump(obj); model.format = 'text'; app.serviceManager.contents.save(model.path, model); }; diff --git a/gquantlab/src/connectionHandler.ts b/gQuant/gquantlab/src/connectionHandler.ts similarity index 100% rename from gquantlab/src/connectionHandler.ts rename to gQuant/gquantlab/src/connectionHandler.ts diff --git a/gquantlab/src/document.ts b/gQuant/gquantlab/src/document.ts similarity index 98% rename from gquantlab/src/document.ts rename to gQuant/gquantlab/src/document.ts index 0dc9ced1..bedb2f2b 100644 --- a/gquantlab/src/document.ts +++ b/gQuant/gquantlab/src/document.ts @@ -5,7 +5,7 @@ import { } from '@jupyterlab/docregistry'; import { MainView } from './mainComponent'; import { requestAPI } from './gquantlab'; -import YAML from 'yaml'; +import jsyaml from 'js-yaml'; import { IEditorProp } from './nodeEditor'; import { Signal } from '@lumino/signaling'; import { MainAreaWidget } from '@jupyterlab/apputils'; @@ -196,7 +196,7 @@ export class ContentHandler { await this.context.ready; const yamlContent = this.context.model.toString(); console.log('model path', this.context.path); - const objContent = YAML.parse(yamlContent); + const objContent = jsyaml.safeLoad(yamlContent); this.renderGraph(objContent, width, height); }; refreshContent(); diff --git a/gquantlab/src/dragHandler.ts b/gQuant/gquantlab/src/dragHandler.ts similarity index 100% rename from gquantlab/src/dragHandler.ts rename to gQuant/gquantlab/src/dragHandler.ts diff --git a/gquantlab/src/editorWidget.tsx b/gQuant/gquantlab/src/editorWidget.tsx similarity index 100% rename from gquantlab/src/editorWidget.tsx rename to gQuant/gquantlab/src/editorWidget.tsx diff 
--git a/gquantlab/src/eventHandler.ts b/gQuant/gquantlab/src/eventHandler.ts similarity index 100% rename from gquantlab/src/eventHandler.ts rename to gQuant/gquantlab/src/eventHandler.ts diff --git a/gquantlab/src/gquantlab.ts b/gQuant/gquantlab/src/gquantlab.ts similarity index 100% rename from gquantlab/src/gquantlab.ts rename to gQuant/gquantlab/src/gquantlab.ts diff --git a/gquantlab/src/index.ts b/gQuant/gquantlab/src/index.ts similarity index 100% rename from gquantlab/src/index.ts rename to gQuant/gquantlab/src/index.ts diff --git a/gquantlab/src/mainComponent.tsx b/gQuant/gquantlab/src/mainComponent.tsx similarity index 100% rename from gquantlab/src/mainComponent.tsx rename to gQuant/gquantlab/src/mainComponent.tsx diff --git a/gquantlab/src/nodeEditor.tsx b/gQuant/gquantlab/src/nodeEditor.tsx similarity index 100% rename from gquantlab/src/nodeEditor.tsx rename to gQuant/gquantlab/src/nodeEditor.tsx diff --git a/gquantlab/src/showType.ts b/gQuant/gquantlab/src/showType.ts similarity index 100% rename from gquantlab/src/showType.ts rename to gQuant/gquantlab/src/showType.ts diff --git a/gquantlab/src/svg.d.ts b/gQuant/gquantlab/src/svg.d.ts similarity index 100% rename from gquantlab/src/svg.d.ts rename to gQuant/gquantlab/src/svg.d.ts diff --git a/gquantlab/src/validator.ts b/gQuant/gquantlab/src/validator.ts similarity index 100% rename from gquantlab/src/validator.ts rename to gQuant/gquantlab/src/validator.ts diff --git a/gquantlab/src/version.ts b/gQuant/gquantlab/src/version.ts similarity index 100% rename from gquantlab/src/version.ts rename to gQuant/gquantlab/src/version.ts diff --git a/gquantlab/src/widget.ts b/gQuant/gquantlab/src/widget.ts similarity index 100% rename from gquantlab/src/widget.ts rename to gQuant/gquantlab/src/widget.ts diff --git a/gquantlab/style/clean.svg b/gQuant/gquantlab/style/clean.svg similarity index 100% rename from gquantlab/style/clean.svg rename to gQuant/gquantlab/style/clean.svg diff --git 
a/gquantlab/style/editor.css b/gQuant/gquantlab/style/editor.css similarity index 100% rename from gquantlab/style/editor.css rename to gQuant/gquantlab/style/editor.css diff --git a/gquantlab/style/gq.svg b/gQuant/gquantlab/style/gq.svg similarity index 100% rename from gquantlab/style/gq.svg rename to gQuant/gquantlab/style/gq.svg diff --git a/gquantlab/style/index.css b/gQuant/gquantlab/style/index.css similarity index 100% rename from gquantlab/style/index.css rename to gQuant/gquantlab/style/index.css diff --git a/gquantlab/style/layout.svg b/gQuant/gquantlab/style/layout.svg similarity index 100% rename from gquantlab/style/layout.svg rename to gQuant/gquantlab/style/layout.svg diff --git a/gquantlab/style/run.svg b/gQuant/gquantlab/style/run.svg similarity index 100% rename from gquantlab/style/run.svg rename to gQuant/gquantlab/style/run.svg diff --git a/gquantlab/tsconfig.json b/gQuant/gquantlab/tsconfig.json similarity index 96% rename from gquantlab/tsconfig.json rename to gQuant/gquantlab/tsconfig.json index a86606a5..f82a054e 100644 --- a/gquantlab/tsconfig.json +++ b/gQuant/gquantlab/tsconfig.json @@ -19,7 +19,7 @@ "strict": true, "strictNullChecks": false, "target": "es2017", - "allowJs": true, + "allowJs": false, "types": [], "noImplicitThis": false, }, diff --git a/gquantlab_demo.gif b/gQuant/gquantlab_demo.gif similarity index 100% rename from gquantlab_demo.gif rename to gQuant/gquantlab_demo.gif diff --git a/gQuant/plugins/nemo_plugin/README.md b/gQuant/plugins/nemo_plugin/README.md new file mode 100644 index 00000000..0388a422 --- /dev/null +++ b/gQuant/plugins/nemo_plugin/README.md @@ -0,0 +1,53 @@ +## NeMo Plugin Example + +This is an example to show how to write an external gQuant plugin. gQuant takes advantage of the `entry point` inside the `setup.py` file to register the plugin. gQuant can discover all the plugins that have the entry point group name `gquant.plugin`. Check the `setup.py` file for details. 
+ +### Create a new Python environment +```bash +conda create -n test python=3.8 +``` + +### Install the gQuant +To install the gQuant graph computation library, run: +```bash +pip install gquant +``` +Or install `gquant` at the gquant directory: +```bash +pip install . +``` + +### Install the gquantlab JupyterLab plugin +To install the `gquantlab` JupyterLab plugin, make sure `nodejs` version 12, 14, or 15 is installed. For example: +```bash +conda install -c conda-forge nodejs=12.4.0 +``` +Then install the `gquantlab`: +```bash +pip install gquantlab +``` +Or install `gquantlab` at the gquantlab directory: +```bash +pip install . +``` + +### Install the external example plugin +This plugin depends on the `gquant_rapids_plugin` plugin, so install that first. Check the README file in the `gquant_rapids_plugin` directory. +Next, install the `nemo` library. Currently, the plugin is only compatible with an old version of NeMo. +``` +git clone -b v0.11.1 https://github.com/NVIDIA/NeMo.git +cd NeMo +cp ../nemo.patch . +git apply nemo.patch && bash reinstall.sh +``` +To install the external plugin, run the following command in the plugin directory: +```bash +pip install . +``` + +### Launch the Jupyter lab +Launch JupyterLab with: +```bash +jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' +``` +You can then see the `DistanceNode` and `PointNode` under the name `custom_node` in the menu. 
diff --git a/modules/nemo_gquant_modules/__init__.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/__init__.py similarity index 100% rename from modules/nemo_gquant_modules/__init__.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/__init__.py diff --git a/modules/nemo_gquant_modules/asr.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/asr.py similarity index 100% rename from modules/nemo_gquant_modules/asr.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/asr.py diff --git a/modules/nemo_gquant_modules/client.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/client.py similarity index 100% rename from modules/nemo_gquant_modules/client.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/client.py diff --git a/modules/nemo_gquant_modules/common.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/common.py similarity index 100% rename from modules/nemo_gquant_modules/common.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/common.py diff --git a/modules/nemo_gquant_modules/cv.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/cv.py similarity index 100% rename from modules/nemo_gquant_modules/cv.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/cv.py diff --git a/modules/nemo_gquant_modules/nemoBaseNode.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemoBaseNode.py similarity index 100% rename from modules/nemo_gquant_modules/nemoBaseNode.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemoBaseNode.py diff --git a/modules/nemo_gquant_modules/nemo_util/__init__.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemo_util/__init__.py similarity index 100% rename from modules/nemo_gquant_modules/nemo_util/__init__.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemo_util/__init__.py diff --git a/modules/nemo_gquant_modules/nemo_util/inferNemo.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemo_util/inferNemo.py similarity index 100% rename from 
modules/nemo_gquant_modules/nemo_util/inferNemo.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemo_util/inferNemo.py diff --git a/modules/nemo_gquant_modules/nemo_util/nemoHPO.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemo_util/nemoHPO.py similarity index 99% rename from modules/nemo_gquant_modules/nemo_util/nemoHPO.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemo_util/nemoHPO.py index 492e1ea6..1c1149fa 100644 --- a/modules/nemo_gquant_modules/nemo_util/nemoHPO.py +++ b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemo_util/nemoHPO.py @@ -1,7 +1,4 @@ -import os -from gquant.dataframe_flow.task import load_modules -load_modules(os.getenv('MODULEPATH')+'/rapids_modules/') -from rapids_modules import GridRandomSearchNode # noqa #E402 +from gquant_rapids_plugin.ml import GridRandomSearchNode from gquant.plugin_nodes.util.contextCompositeNode import ContextCompositeNode # noqa #E402 from gquant.dataframe_flow.portsSpecSchema import (ConfSchema, # noqa #E402 NodePorts) diff --git a/modules/nemo_gquant_modules/nemo_util/trainNemo.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemo_util/trainNemo.py similarity index 100% rename from modules/nemo_gquant_modules/nemo_util/trainNemo.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nemo_util/trainNemo.py diff --git a/modules/nemo_gquant_modules/nlp.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nlp.py similarity index 100% rename from modules/nemo_gquant_modules/nlp.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/nlp.py diff --git a/modules/nemo_gquant_modules/simple_gan.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/simple_gan.py similarity index 100% rename from modules/nemo_gquant_modules/simple_gan.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/simple_gan.py diff --git a/modules/nemo_gquant_modules/tts.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/tts.py similarity index 100% rename from modules/nemo_gquant_modules/tts.py rename to 
gQuant/plugins/nemo_plugin/gquant_nemo_plugin/tts.py diff --git a/modules/nemo_gquant_modules/tutorials.py b/gQuant/plugins/nemo_plugin/gquant_nemo_plugin/tutorials.py similarity index 100% rename from modules/nemo_gquant_modules/tutorials.py rename to gQuant/plugins/nemo_plugin/gquant_nemo_plugin/tutorials.py diff --git a/gQuant/plugins/nemo_plugin/nemo.patch b/gQuant/plugins/nemo_plugin/nemo.patch new file mode 100644 index 00000000..b0c3a756 --- /dev/null +++ b/gQuant/plugins/nemo_plugin/nemo.patch @@ -0,0 +1,99 @@ +diff --git a/nemo/collections/nlp/metrics/sacrebleu.py b/nemo/collections/nlp/metrics/sacrebleu.py +index 5130dd96..3b223ac6 100755 +--- a/nemo/collections/nlp/metrics/sacrebleu.py ++++ b/nemo/collections/nlp/metrics/sacrebleu.py +@@ -61,13 +61,16 @@ from nemo.collections.nlp.data.tokenizers.fairseq_tokenizer import tokenize_en + VERSION = '1.3.5' + + try: ++ import threading + # SIGPIPE is not available on Windows machines, throwing an exception. + from signal import SIGPIPE + + # If SIGPIPE is available, change behaviour to default instead of ignore. 
+ from signal import signal, SIG_DFL + +- signal(SIGPIPE, SIG_DFL) ++ ++ if threading.current_thread() == threading.main_thread(): ++ signal(SIGPIPE, SIG_DFL) + + except ImportError: + logging.warning('Could not import signal.SIGPIPE (this is expected on Windows machines)') +diff --git a/nemo/backends/pytorch/common/rnn.py b/nemo/backends/pytorch/common/rnn.py +index c1c62ac0..b9936fe3 100644 +--- a/nemo/backends/pytorch/common/rnn.py ++++ b/nemo/backends/pytorch/common/rnn.py +@@ -235,7 +235,7 @@ class EncoderRNN(TrainableNM): + embedded = self.embedding(inputs) + embedded = self.dropout(embedded) + if input_lens is not None: +- embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lens, batch_first=True) ++ embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lens.cpu(), batch_first=True) + + outputs, hidden = self.rnn(embedded) + # outputs of shape (seq_len, batch, num_directions * hidden_size) +diff --git a/nemo/backends/pytorch/tutorials/chatbot/modules.py b/nemo/backends/pytorch/tutorials/chatbot/modules.py +index 2459afa1..59b88d28 100644 +--- a/nemo/backends/pytorch/tutorials/chatbot/modules.py ++++ b/nemo/backends/pytorch/tutorials/chatbot/modules.py +@@ -122,7 +122,7 @@ class EncoderRNN(TrainableNM): + embedded = self.embedding(input_seq) + embedded = self.embedding_dropout(embedded) + # Pack padded batch of sequences for RNN module +- packed = t.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths) ++ packed = t.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu()) + # Forward pass through GRU + outputs, hidden = self.gru(packed, hidden) + # Unpack padding +diff --git a/nemo/collections/nlp/nm/trainables/common/encoder_rnn.py b/nemo/collections/nlp/nm/trainables/common/encoder_rnn.py +index 2fc2ff0a..9ec7acc4 100644 +--- a/nemo/collections/nlp/nm/trainables/common/encoder_rnn.py ++++ b/nemo/collections/nlp/nm/trainables/common/encoder_rnn.py +@@ -64,7 +64,7 @@ class EncoderRNN(TrainableNM): + embedded = 
self.embedding(inputs) + embedded = self.dropout(embedded) + if input_lens is not None: +- embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lens, batch_first=True) ++ embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lens.cpu(), batch_first=True) + + outputs, hidden = self.rnn(embedded) + # outputs of shape (seq_len, batch, num_directions * hidden_size) +diff --git a/nemo/collections/tts/parts/tacotron2.py b/nemo/collections/tts/parts/tacotron2.py +index 925251f1..5f81647e 100644 +--- a/nemo/collections/tts/parts/tacotron2.py ++++ b/nemo/collections/tts/parts/tacotron2.py +@@ -221,7 +221,7 @@ class Encoder(nn.Module): + + # pytorch tensor are not reversible, hence the conversion + input_lengths = input_lengths.cpu().numpy() +- x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False) ++ x = nn.utils.rnn.pack_padded_sequence(x, input_lengths.cpu(), batch_first=True, enforce_sorted=False) + + self.lstm.flatten_parameters() + outputs, _ = self.lstm(x) +diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt +index 901a79af..4eb76f95 100644 +--- a/requirements/requirements_asr.txt ++++ b/requirements/requirements_asr.txt +@@ -14,4 +14,4 @@ unidecode + webdataset + kaldi-python-io +-librosa<=0.7.2 ++librosa<=0.8.0 +-numba<=0.48 ++numba==0.52.0 +diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt +index 885adf3e..0e4e44e2 100644 +--- a/requirements/requirements_nlp.txt ++++ b/requirements/requirements_nlp.txt +@@ -3,7 +3,7 @@ h5py + matplotlib + sentencepiece + torchtext +-transformers>=2.11.0 ++transformers>=2.11.0,<=3.5.1 + unidecode + youtokentome + numpy + diff --git a/notebooks/10_nemo_chatbot.ipynb b/gQuant/plugins/nemo_plugin/notebooks/10_nemo_chatbot.ipynb similarity index 99% rename from notebooks/10_nemo_chatbot.ipynb rename to gQuant/plugins/nemo_plugin/notebooks/10_nemo_chatbot.ipynb index 599afec6..5e4df236 100644 --- 
a/notebooks/10_nemo_chatbot.ipynb +++ b/gQuant/plugins/nemo_plugin/notebooks/10_nemo_chatbot.ipynb @@ -174,7 +174,7 @@ } ], "source": [ - "taskGraph=TaskGraph.load_taskgraph('../taskgraphs/nemo_examples/chatbot_example.gq.yaml')\n", + "taskGraph=TaskGraph.load_taskgraph('../taskgraphs/chatbot_example.gq.yaml')\n", "taskGraph.draw()" ] }, @@ -220,7 +220,7 @@ } ], "source": [ - "taskGraph=TaskGraph.load_taskgraph('../taskgraphs/nemo_examples/chatbot_simplified.gq.yaml')\n", + "taskGraph=TaskGraph.load_taskgraph('../taskgraphs/chatbot_simplified.gq.yaml')\n", "taskGraph.draw()" ] }, @@ -1144,7 +1144,7 @@ } ], "source": [ - "taskGraph=TaskGraph.load_taskgraph('../taskgraphs/nemo_examples/chatbot_hpo.gq.yaml')\n", + "taskGraph=TaskGraph.load_taskgraph('../taskgraphs/chatbot_hpo.gq.yaml')\n", "taskGraph.draw()" ] }, @@ -1182,7 +1182,7 @@ } ], "source": [ - "taskGraph=TaskGraph.load_taskgraph('../taskgraphs/nemo_examples/chatbot_large_hpo_search.gq.yaml')\n", + "taskGraph=TaskGraph.load_taskgraph('../taskgraphs/chatbot_large_hpo_search.gq.yaml')\n", "taskGraph.draw()" ] }, diff --git a/gQuant/plugins/nemo_plugin/setup.py b/gQuant/plugins/nemo_plugin/setup.py new file mode 100644 index 00000000..291752a9 --- /dev/null +++ b/gQuant/plugins/nemo_plugin/setup.py @@ -0,0 +1,18 @@ +from setuptools import setup, find_packages + +setup( + name='gquant_nemo_plugin', + packages=find_packages(include=['gquant_nemo_plugin', + 'gquant_nemo_plugin.nemo_util']), + entry_points={ + 'gquant.plugin': + ['gquant_nemo_plugin = gquant_nemo_plugin', + 'gquant_nemo_plugin.asr = gquant_nemo_plugin.asr', + 'gquant_nemo_plugin.cv = gquant_nemo_plugin.cv', + 'gquant_nemo_plugin.nlp = gquant_nemo_plugin.nlp', + 'gquant_nemo_plugin.util = gquant_nemo_plugin.nemo_util', + 'gquant_nemo_plugin.gan = gquant_nemo_plugin.simple_gan', + 'gquant_nemo_plugin.tts = gquant_nemo_plugin.tts', + 'gquant_nemo_plugin.tutorials = gquant_nemo_plugin.tutorials'], + } +) diff --git 
a/taskgraphs/nemo_examples/chatbot_example.gq.yaml b/gQuant/plugins/nemo_plugin/taskgraphs/chatbot_example.gq.yaml similarity index 89% rename from taskgraphs/nemo_examples/chatbot_example.gq.yaml rename to gQuant/plugins/nemo_plugin/taskgraphs/chatbot_example.gq.yaml index 2723c581..e9f2ff27 100644 --- a/taskgraphs/nemo_examples/chatbot_example.gq.yaml +++ b/gQuant/plugins/nemo_plugin/taskgraphs/chatbot_example.gq.yaml @@ -6,7 +6,7 @@ datafile: notebooks/movie_data.txt min_count: 3 inputs: {} - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: encoder type: EncoderRNNNode conf: @@ -18,7 +18,7 @@ inputs: input_lengths: data.src_lengths input_seq: data.src - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: greedy_decoder type: GreedyLuongAttnDecoderRNNNode conf: @@ -31,7 +31,7 @@ inputs: encoder_outputs: encoder_eval.outputs in_nm: decoder.out_nm - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: decoder type: LuongAttnDecoderRNNNode conf: @@ -44,7 +44,7 @@ targets: data.tgt encoder_outputs: encoder.outputs max_target_len: data.max_tgt_lengths - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: loss type: MaskedXEntropyLossNode conf: {} @@ -52,7 +52,7 @@ predictions: decoder.outputs target: data.tgt mask: data.mask - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: "" type: Output_Collector conf: {} @@ -100,6 +100,7 @@ inputs: loss@loss: loss.loss eval_loss@loss: eval_loss.loss + module: gquant_nemo_plugin.util - id: inference type: NemoInferNode conf: @@ -120,6 +121,7 @@ eval_data@src: eval_data.src greedy_decoder@outputs: greedy_decoder.outputs eval_loss@loss: eval_loss.loss + module: gquant_nemo_plugin.util - id: eval_data type: DialogDataLayerNode conf: @@ -128,7 +130,7 @@ datafile: notebooks/movie_data.txt min_count: 3 inputs: {} - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: encoder_eval type: EncoderRNNNode conf: @@ -142,7 +144,7 @@ input_seq: eval_data.src 
input_lengths: eval_data.src_lengths in_nm: encoder.out_nm - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: decoder_eval type: LuongAttnDecoderRNNNode conf: @@ -157,7 +159,7 @@ encoder_outputs: encoder_eval.outputs max_target_len: eval_data.max_tgt_lengths in_nm: decoder.out_nm - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: eval_loss type: MaskedXEntropyLossNode conf: {} @@ -165,4 +167,4 @@ predictions: decoder_eval.outputs target: eval_data.tgt mask: eval_data.mask - module: nemo_modules + module: gquant_nemo_plugin.tutorials diff --git a/taskgraphs/nemo_examples/chatbot_hpo.gq.yaml b/gQuant/plugins/nemo_plugin/taskgraphs/chatbot_hpo.gq.yaml similarity index 98% rename from taskgraphs/nemo_examples/chatbot_hpo.gq.yaml rename to gQuant/plugins/nemo_plugin/taskgraphs/chatbot_hpo.gq.yaml index 9d021199..a30f0d9e 100644 --- a/taskgraphs/nemo_examples/chatbot_hpo.gq.yaml +++ b/gQuant/plugins/nemo_plugin/taskgraphs/chatbot_hpo.gq.yaml @@ -85,3 +85,4 @@ taskgraph: taskgraphs/nemo_examples/chatbot_example.gq.yaml inputs: conf_in: rnn_train.conf_out + module: gquant_nemo_plugin.util diff --git a/taskgraphs/nemo_examples/chatbot_large_hpo_search.gq.yaml b/gQuant/plugins/nemo_plugin/taskgraphs/chatbot_large_hpo_search.gq.yaml similarity index 98% rename from taskgraphs/nemo_examples/chatbot_large_hpo_search.gq.yaml rename to gQuant/plugins/nemo_plugin/taskgraphs/chatbot_large_hpo_search.gq.yaml index ee256da8..0c340378 100644 --- a/taskgraphs/nemo_examples/chatbot_large_hpo_search.gq.yaml +++ b/gQuant/plugins/nemo_plugin/taskgraphs/chatbot_large_hpo_search.gq.yaml @@ -112,3 +112,4 @@ taskgraph: taskgraphs/nemo_examples/chatbot_example.gq.yaml inputs: conf_in: rnn_train.conf_out + module: gquant_nemo_plugin.util diff --git a/taskgraphs/nemo_examples/chatbot_simplified.gq.yaml b/gQuant/plugins/nemo_plugin/taskgraphs/chatbot_simplified.gq.yaml similarity index 100% rename from taskgraphs/nemo_examples/chatbot_simplified.gq.yaml rename to 
gQuant/plugins/nemo_plugin/taskgraphs/chatbot_simplified.gq.yaml diff --git a/taskgraphs/nemo_examples/nemo_train_composite.gq.yaml b/gQuant/plugins/nemo_plugin/taskgraphs/nemo_train_composite.gq.yaml similarity index 94% rename from taskgraphs/nemo_examples/nemo_train_composite.gq.yaml rename to gQuant/plugins/nemo_plugin/taskgraphs/nemo_train_composite.gq.yaml index 8a790185..0c3df7e8 100644 --- a/taskgraphs/nemo_examples/nemo_train_composite.gq.yaml +++ b/gQuant/plugins/nemo_plugin/taskgraphs/nemo_train_composite.gq.yaml @@ -9,6 +9,7 @@ name: data inputs: {} module: nemo_modules +module: gquant_nemo_plugin.tutorials - id: train type: NemoTrainNode conf: @@ -44,6 +45,7 @@ name: sgd inputs: network@loss@loss: network.loss@loss + module: gquant_nemo_plugin.util - id: "" type: Output_Collector conf: {} @@ -78,3 +80,4 @@ inputs: net@x: data.x loss@target: data.y + module: gquant_nemo_plugin.util diff --git a/taskgraphs/nemo_examples/nemo_train_infer.gq.yaml b/gQuant/plugins/nemo_plugin/taskgraphs/nemo_train_infer.gq.yaml similarity index 89% rename from taskgraphs/nemo_examples/nemo_train_infer.gq.yaml rename to gQuant/plugins/nemo_plugin/taskgraphs/nemo_train_infer.gq.yaml index 71c5fa36..c7367a88 100644 --- a/taskgraphs/nemo_examples/nemo_train_infer.gq.yaml +++ b/gQuant/plugins/nemo_plugin/taskgraphs/nemo_train_infer.gq.yaml @@ -8,7 +8,7 @@ x_hi: 4 name: data inputs: {} - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: net type: TaylorNetNode conf: @@ -16,7 +16,7 @@ name: net inputs: x: data.x - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: loss type: MSELossNode conf: @@ -24,7 +24,7 @@ inputs: predictions: net.y_pred target: data.y - module: nemo_modules + module: gquant_nemo_plugin.tutorials - id: train type: NemoTrainNode conf: @@ -58,6 +58,7 @@ name: sgd inputs: loss@loss: loss.loss + module: gquant_nemo_plugin.util - id: "" type: Output_Collector conf: {} @@ -79,3 +80,4 @@ inputs: net@y_pred: net.y_pred log_dir: 
train.checkpoint_dir + module: gquant_nemo_plugin.util diff --git a/taskgraphs/nemo_examples/simplified.gq.yaml b/gQuant/plugins/nemo_plugin/taskgraphs/simplified.gq.yaml similarity index 100% rename from taskgraphs/nemo_examples/simplified.gq.yaml rename to gQuant/plugins/nemo_plugin/taskgraphs/simplified.gq.yaml diff --git a/gQuant/plugins/rapids_plugin/README.md b/gQuant/plugins/rapids_plugin/README.md new file mode 100644 index 00000000..6f556f54 --- /dev/null +++ b/gQuant/plugins/rapids_plugin/README.md @@ -0,0 +1,70 @@ +## gQuant RAPIDS Plugin Example +This is an example to show how to write an external gQuant RAPIDS plugin. gQuant takes advantage of the `entry point` inside the `setup.py` file to register the plugin. gQuant can discover all the plugins that have the entry point group name `gquant.plugin`. Check the `setup.py` file for details. + +The examples range from simple accelerated calculation of technical trading indicators through defining workflows for interactively developing trading strategies and automating many typical tasks. + +The extensibility of the system is highlighted by examples showing how to create a dataframe flow graph, which allows for easy re-use and composability of higher level workflows. + +The examples also show how to easily convert a single-threaded solution into a Dask distributed one. + +These examples can be used as-is or, as they are open source, can be extended to suit your environments. + +### Create a new Python environment +```bash +conda create -n test python=3.8 +``` + +### Prerequisites +- NVIDIA Pascal™ GPU architecture or better. +- [CUDA 9.2](https://developer.nvidia.com/cuda-92-download-archive) with driver v396.37+ or [CUDA 10.0](https://developer.nvidia.com/cuda-10.0-download-archive) with driver v410.48+. +- Ubuntu 16.04 or 18.04. +- [NVIDIA-docker v2+](https://github.com/nvidia/nvidia-docker/wiki/Frequently-Asked-Questions#how-do-i-install-20-if-im-not-using-the-latest-docker-version). 
+ + +### Download data files + +Run the following command at the project root directory: +```bash +bash download_data.sh + +``` + +### Install the gQuant +To install the gQuant graph computation library, run: +```bash +pip install gquant +``` +Or install `gquant` at the gquant directory: +```bash +pip install . +``` + +### Install the gquantlab JupyterLab plugin +To install the `gquantlab` JupyterLab plugin, make sure `nodejs` version 12, 14, or 15 is installed. For example: +```bash +conda install -c conda-forge nodejs=12.4.0 +``` +Then install the `gquantlab`: +```bash +pip install gquantlab +``` +Or install `gquantlab` at the gquantlab directory: +```bash +pip install . +``` + +### Install the external example plugin +Install RAPIDS: +```bash +conda install -y -c rapidsai -c nvidia -c conda-forge -c defaults rapids=0.17 +``` +To install the external plugin, run the following command in the plugin directory: +```bash +pip install . +``` + +### Launch the Jupyter lab +Launch JupyterLab with: +```bash +jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' +``` diff --git a/download_data.sh b/gQuant/plugins/rapids_plugin/download_data.sh similarity index 100% rename from download_data.sh rename to gQuant/plugins/rapids_plugin/download_data.sh diff --git a/modules/rapids_modules/__init__.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/__init__.py similarity index 90% rename from modules/rapids_modules/__init__.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/__init__.py index adba6acd..1ad7a348 100644 --- a/modules/rapids_modules/__init__.py +++ b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/__init__.py @@ -1,10 +1,3 @@ -from .dataloader import * # noqa: F403,F401 -from .analysis import * # noqa: F403,F401 -from .transform import * # noqa: F403,F401 -from .backtest import * # noqa: F403,F401 -from .strategy import * # noqa: F403,F401 -from .portofolio import * # noqa: F403,F401 -from .ml import * # noqa: F403,F401 from .client 
import validation, display # noqa: F401 from gquant.dataframe_flow._node_flow import register_validator from gquant.dataframe_flow._node_flow import register_copy_function @@ -28,8 +21,7 @@ def _validate_df(df_to_val, ref_cols, obj): number of columns. TODO: Create a ValidationError subclass. ''' - if (isinstance(df_to_val, cudf.DataFrame) or - isinstance(df_to_val, dask_cudf.DataFrame)) and \ + if isinstance(df_to_val, cudf.DataFrame) and \ len(df_to_val) == 0: err_msg = 'Node "{}" produced empty output'.format(obj.uid) raise Exception(err_msg) diff --git a/modules/rapids_modules/_port_type_node.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/_port_type_node.py similarity index 100% rename from modules/rapids_modules/_port_type_node.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/_port_type_node.py diff --git a/modules/rapids_modules/analysis/__init__.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/__init__.py similarity index 100% rename from modules/rapids_modules/analysis/__init__.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/__init__.py diff --git a/modules/rapids_modules/analysis/barPlotNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/barPlotNode.py similarity index 100% rename from modules/rapids_modules/analysis/barPlotNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/barPlotNode.py diff --git a/modules/rapids_modules/analysis/cumReturnNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/cumReturnNode.py similarity index 100% rename from modules/rapids_modules/analysis/cumReturnNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/cumReturnNode.py diff --git a/modules/rapids_modules/analysis/exportXGBoostNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/exportXGBoostNode.py similarity index 90% rename from modules/rapids_modules/analysis/exportXGBoostNode.py rename to 
gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/exportXGBoostNode.py index f93c586b..15a45609 100644 --- a/modules/rapids_modules/analysis/exportXGBoostNode.py +++ b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/exportXGBoostNode.py @@ -2,6 +2,7 @@ from gquant.dataframe_flow.portsSpecSchema import (ConfSchema, MetaData, NodePorts, PortsSpecSchema) from xgboost import Booster +from gquant.dataframe_flow.util import get_file_path class XGBoostExportNode(Node): @@ -46,7 +47,7 @@ def conf_schema(self): "properties": { "path": { "type": "string", - "description": """The output filepath for the csv""" + "description": """The output filepath for the xgboost model""" } }, "required": ["path"], @@ -69,5 +70,6 @@ def process(self, inputs): model = inputs[self.INPUT_PORT_NAME] if isinstance(model, dict): model = model['booster'] - model.save_model(self.conf['path']) - return {self.OUTPUT_PORT_NAME: self.conf['path']} + pathname = get_file_path(self.conf['path']) + model.save_model(pathname) + return {self.OUTPUT_PORT_NAME: pathname} diff --git a/modules/rapids_modules/analysis/importanceCurve.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/importanceCurve.py similarity index 100% rename from modules/rapids_modules/analysis/importanceCurve.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/importanceCurve.py diff --git a/modules/rapids_modules/analysis/linePlotNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/linePlotNode.py similarity index 100% rename from modules/rapids_modules/analysis/linePlotNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/linePlotNode.py diff --git a/modules/rapids_modules/analysis/outCsvNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/outCsvNode.py similarity index 94% rename from modules/rapids_modules/analysis/outCsvNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/outCsvNode.py index 
b5436167..76371e2d 100644 --- a/modules/rapids_modules/analysis/outCsvNode.py +++ b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/outCsvNode.py @@ -1,5 +1,6 @@ from gquant.dataframe_flow import Node import dask_cudf +from gquant.dataframe_flow.util import get_file_path from gquant.dataframe_flow.portsSpecSchema import ConfSchema from .._port_type_node import _PortTypesMixin @@ -76,5 +77,6 @@ def process(self, inputs): input_df = raw_input_df.compute() # get the computed value else: input_df = raw_input_df - input_df.to_pandas().to_csv(self.conf['path'], index=False) + input_df.to_pandas().to_csv(get_file_path(self.conf['path']), + index=False) return {self.OUTPUT_PORT_NAME: raw_input_df} diff --git a/modules/rapids_modules/analysis/rocCurveNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/rocCurveNode.py similarity index 100% rename from modules/rapids_modules/analysis/rocCurveNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/rocCurveNode.py diff --git a/modules/rapids_modules/analysis/scatterPlotNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/scatterPlotNode.py similarity index 100% rename from modules/rapids_modules/analysis/scatterPlotNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/scatterPlotNode.py diff --git a/modules/rapids_modules/analysis/sharpeRatioNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/sharpeRatioNode.py similarity index 100% rename from modules/rapids_modules/analysis/sharpeRatioNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/analysis/sharpeRatioNode.py diff --git a/modules/rapids_modules/backtest/__init__.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/backtest/__init__.py similarity index 100% rename from modules/rapids_modules/backtest/__init__.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/backtest/__init__.py diff --git 
a/modules/rapids_modules/backtest/simpleBackTest.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/backtest/simpleBackTest.py similarity index 100% rename from modules/rapids_modules/backtest/simpleBackTest.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/backtest/simpleBackTest.py diff --git a/modules/rapids_modules/cache.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cache.py similarity index 100% rename from modules/rapids_modules/cache.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cache.py diff --git a/modules/rapids_modules/client.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/client.py similarity index 100% rename from modules/rapids_modules/client.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/client.py diff --git a/modules/rapids_modules/cuindicator/__init__.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/__init__.py similarity index 100% rename from modules/rapids_modules/cuindicator/__init__.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/__init__.py diff --git a/modules/rapids_modules/cuindicator/ewm.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/ewm.py similarity index 100% rename from modules/rapids_modules/cuindicator/ewm.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/ewm.py diff --git a/modules/rapids_modules/cuindicator/frac_diff.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/frac_diff.py similarity index 100% rename from modules/rapids_modules/cuindicator/frac_diff.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/frac_diff.py diff --git a/modules/rapids_modules/cuindicator/indicator.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/indicator.py similarity index 100% rename from modules/rapids_modules/cuindicator/indicator.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/indicator.py diff --git 
a/modules/rapids_modules/cuindicator/pewm.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/pewm.py similarity index 100% rename from modules/rapids_modules/cuindicator/pewm.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/pewm.py diff --git a/modules/rapids_modules/cuindicator/rolling.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/rolling.py similarity index 100% rename from modules/rapids_modules/cuindicator/rolling.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/rolling.py diff --git a/modules/rapids_modules/cuindicator/util.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/util.py similarity index 100% rename from modules/rapids_modules/cuindicator/util.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/util.py diff --git a/modules/rapids_modules/cuindicator/windows.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/windows.py similarity index 100% rename from modules/rapids_modules/cuindicator/windows.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/cuindicator/windows.py diff --git a/modules/rapids_modules/dataloader/__init__.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/dataloader/__init__.py similarity index 100% rename from modules/rapids_modules/dataloader/__init__.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/dataloader/__init__.py diff --git a/modules/rapids_modules/dataloader/classificationGenerator.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/dataloader/classificationGenerator.py similarity index 100% rename from modules/rapids_modules/dataloader/classificationGenerator.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/dataloader/classificationGenerator.py diff --git a/modules/rapids_modules/dataloader/csvStockLoader.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/dataloader/csvStockLoader.py similarity index 100% rename from 
modules/rapids_modules/dataloader/csvStockLoader.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/dataloader/csvStockLoader.py diff --git a/modules/rapids_modules/dataloader/stockMap.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/dataloader/stockMap.py similarity index 100% rename from modules/rapids_modules/dataloader/stockMap.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/dataloader/stockMap.py diff --git a/modules/rapids_modules/dataloader/stockNameLoader.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/dataloader/stockNameLoader.py similarity index 100% rename from modules/rapids_modules/dataloader/stockNameLoader.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/dataloader/stockNameLoader.py diff --git a/modules/rapids_modules/ml/__init__.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/__init__.py similarity index 100% rename from modules/rapids_modules/ml/__init__.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/__init__.py diff --git a/modules/rapids_modules/ml/forestInference.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/forestInference.py similarity index 100% rename from modules/rapids_modules/ml/forestInference.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/forestInference.py diff --git a/modules/rapids_modules/ml/gridRandomSearchNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/gridRandomSearchNode.py similarity index 100% rename from modules/rapids_modules/ml/gridRandomSearchNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/gridRandomSearchNode.py diff --git a/modules/rapids_modules/ml/splitDataNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/splitDataNode.py similarity index 100% rename from modules/rapids_modules/ml/splitDataNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/splitDataNode.py diff --git a/modules/rapids_modules/ml/xgboostNode.py 
b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/xgboostNode.py similarity index 99% rename from modules/rapids_modules/ml/xgboostNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/xgboostNode.py index 09e18891..8fcd3f14 100644 --- a/modules/rapids_modules/ml/xgboostNode.py +++ b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/ml/xgboostNode.py @@ -469,7 +469,7 @@ def process(self, inputs): # get the client client = dask.distributed.client.default_client() dtrain = xgb.dask.DaskDMatrix(client, input_df[required_cols]) - prediction = xgb.dask.predict(client, bst_model, dtrain).persist() + prediction = xgb.dask.predict(client, bst_model, dtrain) pred_df = dask_cudf.from_dask_dataframe( prediction.to_dask_dataframe()) pred_df.index = input_df.index diff --git a/modules/rapids_modules/portofolio/__init__.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/portofolio/__init__.py similarity index 100% rename from modules/rapids_modules/portofolio/__init__.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/portofolio/__init__.py diff --git a/modules/rapids_modules/portofolio/simpleAveragePortOpt.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/portofolio/simpleAveragePortOpt.py similarity index 100% rename from modules/rapids_modules/portofolio/simpleAveragePortOpt.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/portofolio/simpleAveragePortOpt.py diff --git a/modules/rapids_modules/strategy/__init__.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/strategy/__init__.py similarity index 100% rename from modules/rapids_modules/strategy/__init__.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/strategy/__init__.py diff --git a/modules/rapids_modules/strategy/movingAverageStrategyNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/strategy/movingAverageStrategyNode.py similarity index 100% rename from modules/rapids_modules/strategy/movingAverageStrategyNode.py rename to 
gQuant/plugins/rapids_plugin/gquant_rapids_plugin/strategy/movingAverageStrategyNode.py diff --git a/modules/rapids_modules/strategy/portExpMovingAverageStrategyNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/strategy/portExpMovingAverageStrategyNode.py similarity index 100% rename from modules/rapids_modules/strategy/portExpMovingAverageStrategyNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/strategy/portExpMovingAverageStrategyNode.py diff --git a/modules/rapids_modules/strategy/xgboostStrategyNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/strategy/xgboostStrategyNode.py similarity index 99% rename from modules/rapids_modules/strategy/xgboostStrategyNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/strategy/xgboostStrategyNode.py index 94317ea1..cf91860b 100644 --- a/modules/rapids_modules/strategy/xgboostStrategyNode.py +++ b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/strategy/xgboostStrategyNode.py @@ -201,7 +201,7 @@ def process(self, inputs): num_boost_round=self.conf["num_of_rounds"]) dtrain = xgb.dask.DaskDMatrix(client, input_df[train_cols]) - prediction = xgb.dask.predict(client, bst, dtrain).persist() + prediction = xgb.dask.predict(client, bst, dtrain) pred_df = dask_cudf.from_dask_dataframe( prediction.to_dask_dataframe()) pred_df.index = input_df.index diff --git a/modules/rapids_modules/transform/__init__.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/__init__.py similarity index 100% rename from modules/rapids_modules/transform/__init__.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/__init__.py diff --git a/modules/rapids_modules/transform/addSignIndicator.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/addSignIndicator.py similarity index 100% rename from modules/rapids_modules/transform/addSignIndicator.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/addSignIndicator.py diff --git 
a/modules/rapids_modules/transform/assetFilterNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/assetFilterNode.py similarity index 100% rename from modules/rapids_modules/transform/assetFilterNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/assetFilterNode.py diff --git a/modules/rapids_modules/transform/assetIndicatorNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/assetIndicatorNode.py similarity index 100% rename from modules/rapids_modules/transform/assetIndicatorNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/assetIndicatorNode.py diff --git a/modules/rapids_modules/transform/averageNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/averageNode.py similarity index 100% rename from modules/rapids_modules/transform/averageNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/averageNode.py diff --git a/modules/rapids_modules/transform/daskComputeNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/daskComputeNode.py similarity index 100% rename from modules/rapids_modules/transform/daskComputeNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/daskComputeNode.py diff --git a/modules/rapids_modules/transform/data_obj.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/data_obj.py similarity index 100% rename from modules/rapids_modules/transform/data_obj.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/data_obj.py diff --git a/modules/rapids_modules/transform/datetimeFilterNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/datetimeFilterNode.py similarity index 100% rename from modules/rapids_modules/transform/datetimeFilterNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/datetimeFilterNode.py diff --git a/modules/rapids_modules/transform/dropNode.py 
b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/dropNode.py similarity index 100% rename from modules/rapids_modules/transform/dropNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/dropNode.py diff --git a/modules/rapids_modules/transform/indicatorNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/indicatorNode.py similarity index 100% rename from modules/rapids_modules/transform/indicatorNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/indicatorNode.py diff --git a/modules/rapids_modules/transform/leftMergeNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/leftMergeNode.py similarity index 100% rename from modules/rapids_modules/transform/leftMergeNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/leftMergeNode.py diff --git a/modules/rapids_modules/transform/linearEmbedding.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/linearEmbedding.py similarity index 100% rename from modules/rapids_modules/transform/linearEmbedding.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/linearEmbedding.py diff --git a/modules/rapids_modules/transform/maxNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/maxNode.py similarity index 100% rename from modules/rapids_modules/transform/maxNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/maxNode.py diff --git a/modules/rapids_modules/transform/minNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/minNode.py similarity index 100% rename from modules/rapids_modules/transform/minNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/minNode.py diff --git a/modules/rapids_modules/transform/normalizationNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/normalizationNode.py similarity index 100% rename from modules/rapids_modules/transform/normalizationNode.py 
rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/normalizationNode.py diff --git a/modules/rapids_modules/transform/onehotEncoding.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/onehotEncoding.py similarity index 100% rename from modules/rapids_modules/transform/onehotEncoding.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/onehotEncoding.py diff --git a/modules/rapids_modules/transform/renameNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/renameNode.py similarity index 100% rename from modules/rapids_modules/transform/renameNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/renameNode.py diff --git a/modules/rapids_modules/transform/returnFeatureNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/returnFeatureNode.py similarity index 100% rename from modules/rapids_modules/transform/returnFeatureNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/returnFeatureNode.py diff --git a/modules/rapids_modules/transform/sortNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/sortNode.py similarity index 100% rename from modules/rapids_modules/transform/sortNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/sortNode.py diff --git a/modules/rapids_modules/transform/valueFilterNode.py b/gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/valueFilterNode.py similarity index 100% rename from modules/rapids_modules/transform/valueFilterNode.py rename to gQuant/plugins/rapids_plugin/gquant_rapids_plugin/transform/valueFilterNode.py diff --git a/gQuant/plugins/rapids_plugin/gquantrc b/gQuant/plugins/rapids_plugin/gquantrc new file mode 100644 index 00000000..e7b33464 --- /dev/null +++ b/gQuant/plugins/rapids_plugin/gquantrc @@ -0,0 +1,2 @@ +[ModuleFiles] +my_node= %(MODULEPATH)s/my_node.py diff --git a/modules/my_node.py b/gQuant/plugins/rapids_plugin/modules/my_node.py 
similarity index 100% rename from modules/my_node.py rename to gQuant/plugins/rapids_plugin/modules/my_node.py diff --git a/gQuant/plugins/rapids_plugin/notebooks/01_tutorial.ipynb b/gQuant/plugins/rapids_plugin/notebooks/01_tutorial.ipynb new file mode 100644 index 00000000..ab33cc26 --- /dev/null +++ b/gQuant/plugins/rapids_plugin/notebooks/01_tutorial.ipynb @@ -0,0 +1,1357 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to gQuant\n", + "\n", + "**gQuant** is a set of open-source examples for Quantitative Analysis tasks:\n", + "- Data preparation & feat. engineering\n", + "- Alpha seeking modeling\n", + "- Technical indicators\n", + "- Backtesting\n", + "\n", + "It is GPU-accelerated by leveraging [**RAPIDS.ai**](https://rapids.ai) technology, and has Multi-GPU and Multi-Node support.\n", + "\n", + "gQuant computing components are oriented around its plugins and task graph.\n", + "\n", + "## Download example datasets\n", + "\n", + "Before getting started, let's download the example datasets if not present." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset is already present. No need to re-download it.\n" + ] + } + ], + "source": [ + "! ((test ! -f './data/stock_price_hist.csv.gz' || test ! -f './data/security_master.csv.gz') && \\\n", + " cd .. && bash download_data.sh) || echo \"Dataset is already present. No need to re-download it.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## About this notebook\n", + "\n", + "In this tutorial, we are going to use gQuant to do a simple quant job. The job tasks are listed below:\n", + " 1. load csv stock data.\n", + " 2. filter out the stocks that has average volume smaller than 50.\n", + " 3. sort the stock symbols and datetime.\n", + " 4. add rate of return as a feature into the table.\n", + " 5. 
in two branches, compute the mean volume and mean return.\n", + " 6. read the file containing the stock symbol names, and join the computed dataframes.\n", + " 7. output the result in csv files.\n", + " \n", + "## TaskGraph playground\n", + "\n", + "Run the following gquant code to start an empty TaskGraph where a computation graph can be created. You can follow the steps as listed below." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "aa99803de134434495790ac9be5c9871", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox())" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys; sys.path.insert(0, '..')\n", + "from gquant.dataframe_flow import TaskGraph\n", + "task_graph = TaskGraph()\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step by Step to build your first task graph\n", + "\n", + "### Create Task node to load the included stock csv file \n", + "\n", + "\n", + "### Explore the data and visualize it\n", + "\n", + "\n", + "### Clean up the Task nodes for next steps\n", + "\n", + "\n", + "### Filter the data and compute the rate of return feature\n", + "\n", + "\n", + "### Save current TaskGraph for a composite Task node\n", + "\n", + "\n", + "### Clean up the redundant feature computation Task nodes\n", + "\n", + "\n", + "### Compute the average volume and returns \n", + "\n", + "\n", + "### Dump the dataframe to csv files\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just in case you cannot follow along, here you can load the tutorial taskgraph from the file. The first one is the graph to calculate the return feature. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fb40274ceb44404f8221b1af3f9b94e5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/get_return_feature.gq.yaml')\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the full graph and click on the `run` button to see the result" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "24c9bc5ac191466a978d6cde8d668b9c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/tutorial_intro.gq.yaml')\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## About Task graphs, nodes and plugins\n", + "\n", + "Quant processing operators are defined as nodes that operates on **cuDF**/**dask_cuDF** dataframes.\n", + "\n", + "A **task graph** is a list of tasks composed of gQuant nodes.\n", + "\n", + "The cell below contains the task graph described before." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings; warnings.simplefilter(\"ignore\")\n", + "csv_average_return = 'average_return.csv'\n", + "csv_average_volume = 'average_volume.csv'\n", + "csv_file_path = './data/stock_price_hist.csv.gz'\n", + "csv_name_file_path = './data/security_master.csv.gz'\n", + "from gquant.dataframe_flow import TaskSpecSchema \n", + "\n", + "# load csv stock data\n", + "task_csvdata = {\n", + " TaskSpecSchema.task_id: 'stock_data',\n", + " TaskSpecSchema.node_type: 'CsvStockLoader',\n", + " TaskSpecSchema.conf: {'file': csv_file_path},\n", + " TaskSpecSchema.inputs: {},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.dataloader\"\n", + "}\n", + "\n", + "# filter out the stocks that has average volume smaller than 50\n", + "task_minVolume = {\n", + " TaskSpecSchema.task_id: 'volume_filter',\n", + " TaskSpecSchema.node_type: 'ValueFilterNode',\n", + " TaskSpecSchema.conf: [{'min': 50.0, 'column': 'volume'}],\n", + " TaskSpecSchema.inputs: {'in': 'stock_data.cudf_out'},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.transform\"\n", + "}\n", + "\n", + "# sort the stock symbols and datetime\n", + "task_sort = {\n", + " TaskSpecSchema.task_id: 'sort_node',\n", + " TaskSpecSchema.node_type: 'SortNode',\n", + " TaskSpecSchema.conf: {'keys': ['asset', 'datetime']},\n", + " TaskSpecSchema.inputs: {'in': 'volume_filter.out'},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.transform\"\n", + "}\n", + "\n", + "# add rate of return as a feature into the table\n", + "task_addReturn = {\n", + " TaskSpecSchema.task_id: 'add_return_feature',\n", + " TaskSpecSchema.node_type: 'ReturnFeatureNode',\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.inputs: {'stock_in': 'sort_node.out'},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.transform\"\n", + "}\n", + "\n", + "# read the stock symbol name file and join the computed dataframes\n", + 
"task_stockSymbol = {\n", + " TaskSpecSchema.task_id: 'stock_name',\n", + " TaskSpecSchema.node_type: 'StockNameLoader',\n", + " TaskSpecSchema.conf: {'file': csv_name_file_path },\n", + " TaskSpecSchema.inputs: {},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.dataloader\"\n", + "}\n", + "\n", + "# In two branches, compute the mean volume and mean return seperately\n", + "task_volumeMean = {\n", + " TaskSpecSchema.task_id: 'average_volume',\n", + " TaskSpecSchema.node_type: 'AverageNode',\n", + " TaskSpecSchema.conf: {'column': 'volume'},\n", + " TaskSpecSchema.inputs: {'stock_in': 'add_return_feature.stock_out'},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.transform\"\n", + "}\n", + "\n", + "task_returnMean = {\n", + " TaskSpecSchema.task_id: 'average_return',\n", + " TaskSpecSchema.node_type: 'AverageNode',\n", + " TaskSpecSchema.conf: {'column': 'returns'},\n", + " TaskSpecSchema.inputs: {'stock_in': 'add_return_feature.stock_out'},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.transform\"\n", + "}\n", + "\n", + "task_leftMerge1 = {\n", + " TaskSpecSchema.task_id: 'left_merge1',\n", + " TaskSpecSchema.node_type: 'LeftMergeNode',\n", + " TaskSpecSchema.conf: {'column': 'asset'},\n", + " TaskSpecSchema.inputs: {'left': 'average_return.stock_out', \n", + " 'right': 'stock_name.stock_name'},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.transform\"\n", + "}\n", + "\n", + "task_leftMerge2 = {\n", + " TaskSpecSchema.task_id: 'left_merge2',\n", + " TaskSpecSchema.node_type: 'LeftMergeNode',\n", + " TaskSpecSchema.conf: {'column': 'asset'},\n", + " TaskSpecSchema.inputs: {'left': 'average_volume.stock_out', \n", + " 'right': 'stock_name.stock_name'},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.transform\"\n", + "}\n", + "\n", + "# output the result in csv files\n", + "\n", + "task_outputCsv1 = {\n", + " TaskSpecSchema.task_id: 'output_csv1',\n", + " TaskSpecSchema.node_type: 'OutCsvNode',\n", + " TaskSpecSchema.conf: {'path': 
csv_average_return},\n", + " TaskSpecSchema.inputs: {'df_in': 'left_merge1.merged'},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.analysis\"\n", + "}\n", + "\n", + "task_outputCsv2 = {\n", + " TaskSpecSchema.task_id: 'output_csv2',\n", + " TaskSpecSchema.node_type: 'OutCsvNode',\n", + " TaskSpecSchema.conf: {'path': csv_average_volume },\n", + " TaskSpecSchema.inputs: {'df_in': 'left_merge2.merged'},\n", + " TaskSpecSchema.module: \"gquant_rapids_plugin.analysis\"\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Python, a gQuant task-spec is defined as a dictionary with the following fields:\n", + "- `id`\n", + "- `type`\n", + "- `conf`\n", + "- `inputs`\n", + "- `filepath`\n", + "- `module`\n", + "\n", + "As a best practice, we recommend using the `TaskSpecSchema` class for these fields, instead of strings.\n", + "\n", + "The `id` for a given task must be unique within a task graph. To use the result(s) of other task(s) as input(s) of a different task, we use the id(s) of the former task(s) in the `inputs` field of the next task.\n", + "\n", + "The `type` field contains the node type to use for the compute task. gQuant includes a collection of node classes. These can be found in `gquant.plugin_nodes`. Click [here](#node_class_example) to see a gQuant node class example.\n", + "\n", + "The `conf` field is used to parameterise a task. It lets you access user-set parameters within a plugin (such as `self.conf['min']` in the example above). Each node defines the `conf` json schema. The gQuant UI can use this schema to generate the proper form UI for the inputs. It is recommended to use the UI to configure the `conf`. \n", + "\n", + "The `filepath` field is used to specify a python module where a custom plugin is defined. It is optional if the plugin is in `plugin_nodes` directory, and mandatory when the plugin is somewhere else. 
In a different tutorial, we will learn how to create custom plugins.\n", + "\n", + "The `module` is optional to tell gQuant the name of module that the node type is from. If it is not specified, gQuant will search for it among all the customized modules. \n", + "\n", + "A custom node schema will look something like this:\n", + "```\n", + "custom_task = {\n", + " TaskSpecSchema.task_id: 'custom_calc',\n", + " TaskSpecSchema.node_type: 'CustomNode',\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.inputs: ['some_other_node'],\n", + " TaskSpecSchema.filepath: 'custom_nodes.py'\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below, we compose our task graph and visualize it as a graph." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9fe6df6a63b0413bb8ec6f1f4c159bcc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from gquant.dataframe_flow import TaskGraph\n", + "\n", + "# list of nodes composing the task graph\n", + "task_list = [\n", + " task_csvdata, task_minVolume, task_sort, task_addReturn,\n", + " task_stockSymbol, task_volumeMean, task_returnMean,\n", + " task_leftMerge1, task_leftMerge2,\n", + " task_outputCsv1, task_outputCsv2]\n", + "\n", + "task_graph = TaskGraph(task_list)\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use `save_taskgraph` method to save the task graph to a **yaml file**.\n", + "\n", + "That will allow us to re-use it in the future." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "task_graph_file_name = '01_tutorial_task_graph.gq.yaml'\n", + "\n", + "task_graph.save_taskgraph(task_graph_file_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is a snippet of the content in the resulting yaml file:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- id: stock_data\n", + " type: CsvStockLoader\n", + " conf:\n", + " file: ./data/stock_price_hist.csv.gz\n", + " inputs: {}\n", + " module: gquant_rapids_plugin.dataloader\n", + "- id: volume_filter\n", + " type: ValueFilterNode\n", + " conf:\n", + " - column: volume\n", + " min: 50\n", + " inputs:\n", + " in: stock_data.cudf_out\n", + " module: gquant_rapids_plugin.transform\n", + "- id: sort_node\n", + " type: SortNode\n", + " conf:\n", + " keys:\n", + " - asset\n" + ] + } + ], + "source": [ + "%%bash -s \"$task_graph_file_name\"\n", + "head -n 19 $1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The yaml file describes the computation tasks. We can load it and visualize it as a graph." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3c5f7fabeb3046f5a459c8f769e6e5f7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph = TaskGraph.load_taskgraph(task_graph_file_name)\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building a task graph\n", + "\n", + "Running the task graph is the next logical step. 
Nevertheless, it can optionally be built before running it.\n", + "\n", + "By calling `build` method, the graph is traversed without running the dataframe computations. This could be useful to inspect the column names and types, validate that the plugins can be instantiated, and check for errors.\n", + "\n", + "The output of `build` are instances of each task in a dictionary.\n", + "\n", + "In the example below, we inspect the column names and types for the inputs and outputs of the `left_merge1` task:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output of build task graph are instances of each task in a dictionary:\n", + "\n", + "stock_data: \n", + "volume_filter: \n", + "sort_node: \n", + "add_return_feature: \n", + "stock_name: \n", + "average_volume: \n", + "average_return: \n", + "left_merge1: \n", + "left_merge2: \n", + "output_csv1: \n", + "output_csv2: \n", + "\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "\n", + "task_graph.build()\n", + "\n", + "print('Output of build task graph are instances of each task in a dictionary:\\n')\n", + "print(str(task_graph))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "output meta in outgoing dataframe:\n", + "\n", + "MetaData(inports={'left': {}, 'right': {}}, outports={'merged': {'returns': 'float64', 'asset': 'int64', 'asset_name': 'object'}})\n" + ] + } + ], + "source": [ + "# output meta in 'left_merge_1' node\n", + "\n", + "print('output meta in outgoing dataframe:\\n')\n", + "pprint(task_graph['left_merge1'].meta_setup())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running a task graph\n", + "\n", + "To execute the graph computations, we will use the `run` method. 
If the `Output_Collector` task node is not added to the graph, an output list can be fed to the `run` method. The result can be displayed in a rich mode if the `formated` argument is turned on.\n", + "\n", + "`run` can also take an optional `replace` argument which is used and explained later on" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c64c1e5f75d84692a2b085a5d66fc286", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tab(children=(Output(), Output(), Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "outputs = ['stock_data.cudf_out', 'output_csv1.df_out', 'output_csv2.df_out']\n", + "task_graph.run(outputs=outputs, formated=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result can be used as a tuple or dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
assetvolumeasset_name
0869577154.528596CTT
1869584701.630560LPT
2869587119.878161HBP
3869589161.938559DSLV
4869590204.126667BPTH
............
4995743880.495558NQM
4996744076.090120NQP
4997744186.637237NQS
49987443569.610747NR
4999745568.407921NRT
\n", + "

5000 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " asset volume asset_name\n", + "0 869577 154.528596 CTT\n", + "1 869584 701.630560 LPT\n", + "2 869587 119.878161 HBP\n", + "3 869589 161.938559 DSLV\n", + "4 869590 204.126667 BPTH\n", + "... ... ... ...\n", + "4995 7438 80.495558 NQM\n", + "4996 7440 76.090120 NQP\n", + "4997 7441 86.637237 NQS\n", + "4998 7443 569.610747 NR\n", + "4999 7455 68.407921 NRT\n", + "\n", + "[5000 rows x 3 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = task_graph.run(outputs=outputs)\n", + "csv_data_df, csv_1_df, csv_2_df = result\n", + "result['output_csv2.df_out']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can profile each of the computation node running time by turning on the profiler." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:stock_data process time:4.007s\n", + "id:volume_filter process time:0.009s\n", + "id:sort_node process time:0.087s\n", + "id:add_return_feature process time:0.054s\n", + "id:average_volume process time:0.015s\n", + "id:average_return process time:0.014s\n", + "id:stock_name process time:0.014s\n", + "id:left_merge1 process time:0.003s\n", + "id:output_csv1 process time:0.015s\n", + "id:left_merge2 process time:0.002s\n", + "id:output_csv2 process time:0.013s\n" + ] + } + ], + "source": [ + "outputs = ['stock_data.cudf_out', 'output_csv1.df_out', 'output_csv2.df_out']\n", + "csv_data_df, csv_1_df, csv_2_df = task_graph.run(outputs=outputs, profile=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Where most of the time is spent on the csv file processing. This is because we have to convert the time string to the proper format via CPU. Let's inspect the content of `csv_1_df` and `csv_2_df`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "csv_1_df content:\n", + " asset returns asset_name\n", + "0 869577 -0.000295 CTT\n", + "1 869584 0.000387 LPT\n", + "2 869587 0.027713 HBP\n", + "3 869589 0.001337 DSLV\n", + "4 869590 0.009657 BPTH\n", + "... ... ... ...\n", + "4995 24076 -0.000566 STV\n", + "4996 24077 0.000285 INVN\n", + "4997 24078 0.045625 TCO\n", + "4998 24079 0.073684 WTFCW\n", + "4999 24088 0.001275 MCF\n", + "\n", + "[5000 rows x 3 columns]\n", + "\n", + "csv_2_df content:\n", + " asset volume asset_name\n", + "0 869577 154.528596 CTT\n", + "1 869584 701.630560 LPT\n", + "2 869587 119.878161 HBP\n", + "3 869589 161.938559 DSLV\n", + "4 869590 204.126667 BPTH\n", + "... ... ... ...\n", + "4995 24076 328.960250 STV\n", + "4996 24077 1998.246290 INVN\n", + "4997 24078 443.700394 TCO\n", + "4998 24079 221.725000 WTFCW\n", + "4999 24088 128.905316 MCF\n", + "\n", + "[5000 rows x 3 columns]\n" + ] + } + ], + "source": [ + "print('csv_1_df content:')\n", + "print(csv_1_df)\n", + "\n", + "print('\\ncsv_2_df content:')\n", + "print(csv_2_df) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Also, please notice that two resulting csv files have been created:\n", + "- average_return.csv\n", + "- average_volume.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "csv files created:\n" + ] + } + ], + "source": [ + "print('\\ncsv files created:')\n", + "!find . -iname \"*symbol*\" " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Subgraphs\n", + "\n", + "A nice feature of task graphs is that we can evaluate any **subgraph**. 
For instance, if you are only interested in the `average volume` result, you can run only the tasks which are relevant for that computation.\n", + "\n", + "If we would not want to re-run tasks, we could also use the `replace` argument of the `run` function with a `load` option.\n", + "\n", + "The `replace` argument needs to be a dictionary where each key is the task/node id. The values are a replacement task-spec dictionary (i.e. each key is a spec overload, and its value is what to overload with).\n", + "\n", + "In the example below, instead of re-running the `stock_data` node to load a csv file into a `cudf` dataframe, we will use its dataframe output to load from it." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " asset volume\n", + "0 93 86.594737\n", + "1 165 67.020000\n", + "2 239 128.835141\n", + "3 281 68.450000\n", + "4 592 110.333333\n", + "... ... ...\n", + "4995 869589 161.938559\n", + "4996 869590 204.126667\n", + "4997 869592 101.423675\n", + "4998 869597 81.298788\n", + "4999 869599 83.425988\n", + "\n", + "[5000 rows x 2 columns]\n" + ] + } + ], + "source": [ + "replace = {\n", + " 'stock_data': {\n", + " 'load': {\n", + " 'cudf_out': csv_data_df\n", + " },\n", + " 'save': True\n", + " }\n", + "}\n", + "\n", + "(volume_mean_df, ) = task_graph.run(outputs=['average_volume.stock_out'],\n", + " replace=replace)\n", + "\n", + "print(volume_mean_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a convenience, we can save on disk the checkpoints for any of the nodes, and re-load them if needed. It is only needed to set the save option to `True`. This step will take a while depends on the disk IO speed.\n", + "\n", + "In the example above, the `replace` spec directs `run` to save on disk for the `stock_data`. 
If `load` was boolean then the data would be loaded from disk presuming the data was saved to disk in a prior run.\n", + "\n", + "The default location for saved checkpoints is `./.cache/<task_id>.hdf5`.\n", + "\n", + "`replace` is also used to override parameters in the tasks. For instance, if we wanted to use the value `40.0` instead of `50.0` in the task `volume_filter`, we would do something similar to:\n", + "```\n", + "replace_spec = {\n", + " 'volume_filter': {\n", + " 'conf': {\n", + " 'min': 40.0\n", + " }\n", + " },\n", + " 'some_task': etc...\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Return mean Dataframe:\n", + "\n", + " asset returns\n", + "0 93 0.241380\n", + "1 165 0.000700\n", + "2 239 0.010021\n", + "3 281 -0.088465\n", + "4 592 0.619716\n", + "... ... ...\n", + "4995 869589 0.001337\n", + "4996 869590 0.009657\n", + "4997 869592 0.001202\n", + "4998 869597 -0.003332\n", + "4999 869599 0.003291\n", + "\n", + "[5000 rows x 2 columns]\n" + ] + } + ], + "source": [ + "replace = {'stock_data': {'load': True},\n", + " 'average_return': {'save': True}}\n", + "\n", + "\n", + "(return_mean_df, ) = task_graph.run(outputs=['average_return.stock_out'], replace=replace)\n", + "\n", + "print('Return mean Dataframe:\\n')\n", + "print(return_mean_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we might want to load the `return_mean_df` from the saved file and evaluate only tasks that we are interested in.\n", + "\n", + "In the cells below, we compare different load approaches:\n", + "- in-memory,\n", + "- from disk, \n", + "- and not loading at all.\n", + "\n", + "When working interactively, or in situations requiring iterative and explorative task graphs, a significant amount of time is saved by just re-loading the data that does not need to be recalculated." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using in-memory dataframes for load:\n", + "CPU times: user 183 ms, sys: 88.8 ms, total: 272 ms\n", + "Wall time: 281 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "print('Using in-memory dataframes for load:')\n", + "\n", + "replace = {'stock_data': {'load': {\n", + " 'cudf_out': csv_data_df\n", + " }},\n", + " 'average return': {'load': \n", + " {'stock_out': return_mean_df}}\n", + " }\n", + "\n", + "_ = task_graph.run(outputs=['output_csv2.df_out'], replace=replace)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using cached dataframes on disk for load:\n", + "CPU times: user 2.97 s, sys: 932 ms, total: 3.9 s\n", + "Wall time: 3.88 s\n" + ] + } + ], + "source": [ + "%%time\n", + "print('Using cached dataframes on disk for load:')\n", + "\n", + "replace = {'stock_data': {'load': True},\n", + " 'average return': {'load': True}}\n", + "\n", + "_ = task_graph.run(outputs=['output_csv2.df_out'], replace=replace)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Re-running dataframes calculations instead of using load:\n", + "CPU times: user 2.94 s, sys: 989 ms, total: 3.93 s\n", + "Wall time: 3.89 s\n" + ] + } + ], + "source": [ + "%%time\n", + "print('Re-running dataframes calculations instead of using load:')\n", + "\n", + "replace = {'stock_data': {'load': True}}\n", + "\n", + "_ = task_graph.run(outputs=['output_csv2.df_out'], replace=replace)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An idiomatic way to save data, if not on disk, or load data, if present on disk, is demonstrated below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3.04 s, sys: 892 ms, total: 3.93 s\n", + "Wall time: 3.92 s\n" + ] + } + ], + "source": [ + "%%time\n", + "import os\n", + "\n", + "loadsave_csv_data = 'load' if os.path.isfile('./.cache/stock_data.hdf5') else 'save'\n", + "loadsave_return_mean = 'load' if os.path.isfile('./.cache/average_return.hdf5') else 'save'\n", + "\n", + "replace = {'stock_data': {loadsave_csv_data: True},\n", + " 'average_return': {loadsave_return_mean: True}}\n", + "\n", + "_ = task_graph.run(outputs=['output_csv2.df_out'], replace=replace)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Delete temporary files\n", + "\n", + "A few cells above, we generated a .yaml file containing the example task graph, and also a couple of CSV files.\n", + "\n", + "Let's keep our directory clean, and delete them." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash -s \"$task_graph_file_name\" \"$csv_average_return\" \"$csv_average_volume\" \n", + "rm -f $1 $2 $3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "---\n", + "\n", + "## Node class example\n", + "\n", + "Implementing custom nodes in gQuant is very straightforward.\n", + "\n", + "Data scientists only need to override five methods in the parent class `Node`:\n", + "- `init`\n", + "- `meta_setup`\n", + "- `ports_setup`\n", + "- `conf_schema`\n", + "- `process`\n", + "\n", + "`init` method is usually used to define the required column names\n", + "\n", + "`ports_setup` defines the input and output ports for the node\n", + "\n", + "`meta_setup` method is used to calculate the output meta name and types.\n", + "\n", + "`conf_schema` method is used to define the JSON schema for the node conf so the client can generate the proper UI for 
it.\n", + "\n", + "`process` method takes input dataframes and computes the output dataframe. \n", + "\n", + "In this way, dataframes are strongly typed, and errors can be detected early before the time-consuming computation happens.\n", + "\n", + "Below, it can be observed `ValueFilterNode` implementation details:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "class ValueFilterNode(_PortTypesMixin, Node):\n", + "\n", + " def init(self):\n", + " _PortTypesMixin.init(self)\n", + "\n", + " def meta_setup(self):\n", + " cols_required = {\"asset\": \"int64\"}\n", + " return _PortTypesMixin.meta_setup(self, required=cols_required)\n", + "\n", + " def ports_setup(self):\n", + " return _PortTypesMixin.ports_setup(self)\n", + "\n", + " def conf_schema(self):\n", + " json = {\n", + " \"title\": \"Value Filter Node configure\",\n", + " \"type\": \"array\",\n", + " \"description\": \"\"\"Filter the dataframe based on a list of\n", + " min/max values.\"\"\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"column\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"dataframe column to be filered on\"\n", + " },\n", + " \"min\": {\n", + " \"type\": \"number\",\n", + " \"description\": \"min value, inclusive\"\n", + " },\n", + " \"max\": {\n", + " \"type\": \"number\",\n", + " \"description\": \"max value, inclusive\"\n", + " }\n", + " }\n", + " }\n", + " }\n", + " ui = {}\n", + " input_meta = self.get_input_meta()\n", + " if self.INPUT_PORT_NAME in input_meta:\n", + " col_from_inport = input_meta[self.INPUT_PORT_NAME]\n", + " enums = [col for col in col_from_inport.keys()]\n", + " json['items']['properties']['column']['enum'] = enums\n", + " return ConfSchema(json=json, ui=ui)\n", + " else:\n", + " return ConfSchema(json=json, ui=ui)\n", + "\n", + " def process(self, inputs):\n", + " \"\"\"\n", + " filter the dataframe 
based on a list of min/max values. The node's\n", + " conf is a list of column criteria. It defines the column name in\n", + " 'column`, the min value in `min` and the max value in `max`.\n", + "\n", + " Arguments\n", + " -------\n", + " inputs: list\n", + " list of input dataframes.\n", + " Returns\n", + " -------\n", + " dataframe\n", + " \"\"\"\n", + "\n", + " input_df = inputs[self.INPUT_PORT_NAME]\n", + " str_list = []\n", + " for column_item in self.conf:\n", + " column_name = column_item['column']\n", + " if 'min' in column_item:\n", + " minValue = column_item['min']\n", + " str_item = '%s >= %f' % (column_name, minValue)\n", + " str_list.append(str_item)\n", + " if 'max' in column_item:\n", + " maxValue = column_item['max']\n", + " str_item = '%s <= %f' % (column_name, maxValue)\n", + " str_list.append(str_item)\n", + " input_df = input_df.query(\" and \".join(str_list))\n", + " return {self.OUTPUT_PORT_NAME: input_df}\n", + "\n" + ] + } + ], + "source": [ + "import inspect\n", + "from gquant_rapids_plugin.transform import ValueFilterNode\n", + "\n", + "print(inspect.getsource(ValueFilterNode))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'ok', 'restart': True}" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/gQuant/plugins/rapids_plugin/notebooks/02_single_stock_trade.ipynb b/gQuant/plugins/rapids_plugin/notebooks/02_single_stock_trade.ipynb new file mode 100644 index 00000000..6b115e9f --- /dev/null +++ b/gQuant/plugins/rapids_plugin/notebooks/02_single_stock_trade.ipynb @@ -0,0 +1,383 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### gQuant Tutorial\n", + "First import all the necessary modules." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys; sys.path.insert(0, '..')\n", + "import os\n", + "import warnings\n", + "import ipywidgets as widgets\n", + "from gquant.dataframe_flow import TaskGraph\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset is already present. No need to re-download it.\n" + ] + } + ], + "source": [ + "! ((test ! -f './data/stock_price_hist.csv.gz' || test ! 
-f './data/security_master.csv.gz') && \\\n", + " cd .. && bash download_data.sh) || echo \"Dataset is already present. No need to re-download it.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we are going to use gQuant to do a simple quant job. The task is fully described in a yaml file" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- conf:\n", + " file: notebooks/data/stock_price_hist.csv.gz\n", + " id: stock_data\n", + " inputs: {}\n", + " module: gquant_rapids_plugin.dataloader\n", + " type: CsvStockLoader\n", + "- conf:\n", + " file: notebooks/data/security_master.csv.gz\n", + " id: stock_name\n", + " inputs: {}\n", + " module: gquant_rapids_plugin.dataloader\n", + " type: StockNameLoader\n", + "- conf:\n", + " asset: 4330\n", + " id: stock_selector\n", + " inputs:\n", + " name_map: stock_name.map_data\n", + " stock_in: stock_data.cudf_out\n", + " module: gquant_rapids_plugin.transform\n", + " type: AssetFilterNode\n", + "- conf: {}\n", + " id: ''\n", + " inputs:\n", + " in1: stock_selector.stock_name\n", + " in2: lineplot.lineplot\n", + " in3: barplot.barplot\n", + " in4: sharpe_ratio.sharpe_out\n", + " in5: cumulative_return.cum_return\n", + " in6: stock_data.cudf_out\n", + " module: rapids_modules\n", + " type: Output_Collector\n" + ] + } + ], + "source": [ + "!head -n 31 ../taskgraphs/simple_trade.gq.yaml" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The yaml file is describing the computation task by a graph, we can visualize it" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "205d890c5eaa49f7b8c5130b97ef8980", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), 
('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/simple_trade.gq.yaml')\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a424b40940546e2931d8f13f3f2e74c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(layout=Layout(border='1px sol…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph.run(formated=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define a method to organize the output images" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_figures(result):\n", + " # format the figures\n", + " figure_width = '1200px'\n", + " figure_height = '400px'\n", + " bar_figure = result['barplot.barplot']\n", + " sharpe_number = result['sharpe_ratio.sharpe_out']\n", + " cum_return = result['cumulative_return.cum_return']\n", + " signals = result['lineplot.lineplot']\n", + " symbol = result['stock_selector.stock_name']\n", + "\n", + " bar_figure.layout.height = figure_height\n", + " bar_figure.layout.width = figure_width\n", + " cum_return.layout.height = figure_height\n", + " cum_return.layout.width = figure_width\n", + " cum_return.title = 'P & L %.3f' % (sharpe_number)\n", + " bar_figure.marks[0].labels = [symbol]\n", + " cum_return.marks[0].labels = [symbol]\n", + " signals.layout.height = figure_height\n", + " signals.layout.width = figure_width\n", + " bar_figure.axes = [bar_figure.axes[1]]\n", + " cum_return.axes = [cum_return.axes[0]]\n", + " output = 
widgets.VBox([bar_figure, cum_return, signals])\n", + "\n", + " return output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rerun the graph and send the computation result to the `plot_figures` function" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "40e6288fcd354e9eb0870f2dbfbb8608", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Figure(axes=[Axis(label='Price', orientation='vertical', scale=LinearScale(max=38.13, min=-10.1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "result = task_graph.run()\n", + "plot_figures(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can change the TaskGraph node parameters interactively and hit the run button to get the updated result. It can also be done programmatically, e.g. change the mean reversion parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "df21e88386a2487a9f8ae6f9bbe5ac40", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Figure(axes=[Axis(label='Price', orientation='vertical', scale=LinearScale(max=38.13, min=-10.1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "o = task_graph.run(\n", + " outputs=(list(result.get_keys())[0:]),\n", + " replace={'stock_data': {\"load\": {'cudf_out': result['stock_data.cudf_out']}},\n", + " 'mean_reversion': {'conf': {'fast': 1, 'slow': 10}}})\n", + "figure_combo = plot_figures(o)\n", + "figure_combo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since computation is accelerated in the GPU, we can do hyper-parameter search interactively; try to change the parameters of the `slow` and `fast` for 
the moving average and see if you can improve the result:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3e80a4f4282f4f19bf2efce27c378ff6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HBox(children=(IntRangeSlider(value=(10, 30), continuous_update=False, description='MA:', max=6…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "para_selector = widgets.IntRangeSlider(value=[10, 30],\n", + " min=3,\n", + " max=60,\n", + " step=1,\n", + " description=\"MA:\",\n", + " disabled=False,\n", + " continuous_update=False,\n", + " orientation='horizontal',\n", + " readout=True)\n", + "\n", + "\n", + "def para_selection(*stocks):\n", + " with out:\n", + " print('run')\n", + " para1 = para_selector.value[0]\n", + " para2 = para_selector.value[1]\n", + " o = task_graph.run(\n", + " outputs=(list(result.get_keys())[0:]),\n", + " replace={'stock_data': {\"load\": {'cudf_out': result['stock_data.cudf_out']}},\n", + " 'mean_reversion': {'conf': {'fast': para1, 'slow': para2}}})\n", + " figure_combo = plot_figures(o)\n", + " if (len(w.children) < 2):\n", + " w.children = (w.children[0], figure_combo,)\n", + " else:\n", + " w.children[1].children[1].marks = figure_combo.children[1].marks\n", + " w.children[1].children[2].marks = figure_combo.children[2].marks\n", + " w.children[1].children[1].title = 'P & L %.3f' % (o['sharpe_ratio.sharpe_out'])\n", + "\n", + "\n", + "out = widgets.Output(layout={'border': '1px solid black'})\n", + "para_selector.observe(para_selection, 'value')\n", + "selectors = widgets.HBox([para_selector])\n", + "w = widgets.VBox([selectors])\n", + "w" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'ok', 'restart': True}" + ] + }, + "execution_count": 10, + "metadata": 
{}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/03_simple_dask_example.ipynb b/gQuant/plugins/rapids_plugin/notebooks/03_simple_dask_example.ipynb similarity index 84% rename from notebooks/03_simple_dask_example.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/03_simple_dask_example.ipynb index 39075e61..05b12a78 100644 --- a/notebooks/03_simple_dask_example.ipynb +++ b/gQuant/plugins/rapids_plugin/notebooks/03_simple_dask_example.ipynb @@ -23,7 +23,7 @@ "\n", "

Client

\n", "\n", "\n", @@ -39,7 +39,7 @@ "" ], "text/plain": [ - "" + "" ] }, "execution_count": 2, @@ -125,7 +125,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ee56162705ff4c4eb031c6fb5a84a2d3", + "model_id": "2d409afaf9ec46cd9d09af299b89be84", "version_major": 2, "version_minor": 0 }, @@ -158,7 +158,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d0203b59aba643e8957e62354d2eb697", + "model_id": "ac85f4fd74e748adac67769fb68c29d3", "version_major": 2, "version_minor": 0 }, @@ -191,14 +191,14 @@ { "data": { "text/plain": [ - "['/home/quant/gQuant/notebooks/many-small/0.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/1.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/2.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/3.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/4.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/5.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/6.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/7.csv']" + "['/home/yi/Projects/demo_gquant_install/plugins/rapids_plugin/notebooks/many-small/0.csv',\n", + " '/home/yi/Projects/demo_gquant_install/plugins/rapids_plugin/notebooks/many-small/1.csv',\n", + " '/home/yi/Projects/demo_gquant_install/plugins/rapids_plugin/notebooks/many-small/2.csv',\n", + " '/home/yi/Projects/demo_gquant_install/plugins/rapids_plugin/notebooks/many-small/3.csv',\n", + " '/home/yi/Projects/demo_gquant_install/plugins/rapids_plugin/notebooks/many-small/4.csv',\n", + " '/home/yi/Projects/demo_gquant_install/plugins/rapids_plugin/notebooks/many-small/5.csv',\n", + " '/home/yi/Projects/demo_gquant_install/plugins/rapids_plugin/notebooks/many-small/6.csv',\n", + " '/home/yi/Projects/demo_gquant_install/plugins/rapids_plugin/notebooks/many-small/7.csv']" ] }, "execution_count": 6, @@ -224,41 +224,41 @@ "name": "stdout", "output_type": "stream", "text": [ - "- id: stock_data\n", - " type: CsvStockLoader\n", - " conf:\n", - " file: 
/home/quant/gQuant/notebooks/data/stock_price_hist.csv.gz\n", - " path: /home/quant/gQuant/notebooks/many-small\n", + "- conf:\n", + " file: notebooks/data/stock_price_hist.csv.gz\n", + " path: notebooks/many-small\n", + " id: stock_data\n", " inputs: {}\n", - " module: rapids_modules\n", - "- id: sort_node\n", - " type: SortNode\n", - " conf:\n", + " module: gquant_rapids_plugin.dataloader\n", + " type: CsvStockLoader\n", + "- conf:\n", " keys:\n", - " - asset\n", - " - datetime\n", + " - asset\n", + " - datetime\n", + " id: sort_node\n", " inputs:\n", " in: stock_data.dask_cudf_out\n", - " module: rapids_modules\n", - "- id: \"\"\n", - " type: Output_Collector\n", - " conf: {}\n", + " module: gquant_rapids_plugin.transform\n", + " type: SortNode\n", + "- conf: {}\n", + " id: ''\n", " inputs:\n", " in1: output_csv.df_out\n", - "- id: average_volume\n", - " type: AverageNode\n", - " conf:\n", + " type: Output_Collector\n", + "- conf:\n", " column: volume\n", + " id: average_volume\n", " inputs:\n", " stock_in: sort_node.out\n", - " module: rapids_modules\n", - "- id: output_csv\n", - " type: OutCsvNode\n", - " conf:\n", - " path: /home/quant/gQuant/notebooks/dask_average_volume.csv\n", + " module: gquant_rapids_plugin.transform\n", + " type: AverageNode\n", + "- conf:\n", + " path: notebooks/dask_average_volume.csv\n", + " id: output_csv\n", " inputs:\n", " df_in: average_volume.stock_out\n", - " module: rapids_modules\n" + " module: gquant_rapids_plugin.analysis\n", + " type: OutCsvNode\n" ] } ], @@ -274,7 +274,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "745c2373d88a4faeb29affdeea9c0184", + "model_id": "a6ff3dc1c1b84e13b42dc199059e77d5", "version_major": 2, "version_minor": 0 }, @@ -300,16 +300,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "id:stock_data process time:0.018s\n", - "id:average_volume process time:0.318s\n", - "id:output_csv process time:0.344s\n", + "id:stock_data process time:0.020s\n", + 
"id:average_volume process time:0.185s\n", + "id:output_csv process time:0.276s\n", "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4fd773d9f1ef4263bcb3424f144adf3e", + "model_id": "64b0bdf3bc584f6f93fc8c21afcdb6e3", "version_major": 2, "version_minor": 0 }, @@ -480,6 +480,13 @@ "app.kernel.do_shutdown(True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -504,7 +511,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/gQuant/plugins/rapids_plugin/notebooks/04_portfolio_trade.ipynb b/gQuant/plugins/rapids_plugin/notebooks/04_portfolio_trade.ipynb new file mode 100644 index 00000000..02d430c6 --- /dev/null +++ b/gQuant/plugins/rapids_plugin/notebooks/04_portfolio_trade.ipynb @@ -0,0 +1,1070 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# gQuant - Making Quantitative Analysis Faster\n", + "\n", + "## Background\n", + "By definition, **Quantitative Finance** is the use of mathematical models and large datasets to analyze financial markets and securities, requiring massive computation to extract insight from the data. \n", + "\n", + "Many data science toolkits have been developed to help data scientists to manipulate the data. It starts with scalar number computations at the beginning. Later, the development of [Numpy](https://www.numpy.org) library helps to operate the numbers at vectors, and the popular [Pandas](https://pandas.pydata.org) library operates at a dataframe level. Manipulating data at a high level brings productivity gain for data scientists in quantitative finance.\n", + "\n", + "However, the amount of collected data is increasing exponentially over time. Also, more and more machine learning and statistical models are being developed. 
As a result, data scientists are facing new challenges that are hard to deal with using traditional data science libraries.\n", + "\n", + "It is very time-consuming for CPUs to crunch massive amounts of data and compute the complicated data science models. Large datasets require distributed computation, which is too complicated for data scientists to adopt.\n", + "\n", + "As a consequence, the quantitative workflow has become more complicated than ever. It integrates massive data from different sources, requiring multiple iterations to obtain significant results. \n", + "\n", + "**gQuant** has been developed to address all these challenges by organizing dataframes into graphs. It introduces the idea of **dataframe-flow**, which manipulates dataframes at graph level. An **acyclic directed graph** is defined, where the nodes are dataframe processors and the edges are the directions of passing resulting dataframes.\n", + "\n", + "With a graph approach, quant's workflow is described at a high level, letting quant analysts address the complicated workflow challenge.\n", + "\n", + "It is GPU-accelerated by leveraging [RAPIDS.ai](https://rapids.ai) technology and has **Multi-GPU and Multi-Node support**.\n", + "\n", + "We can get orders of magnitude performance boosts compared to CPU. gQuant dataframe-flow is **dataframe agnostic**, and can flow:\n", + "- Pandas dataframe, computed in the CPU.\n", + "- cuDF dataframe, computed in the GPU and producing the same result but much faster.\n", + "- dask_cuDF dataframe, with the computation automatically executed on multiple nodes and multiple GPUs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download example datasets\n", + "\n", + "Before getting started, let's download the example datasets if not present." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset is already present. 
No need to re-download it.\n" + ] + } + ], + "source": [ + "! ((test ! -f './data/stock_price_hist.csv.gz' || test ! -f './data/security_master.csv.gz') && \\\n", + " cd .. && bash download_data.sh) || echo \"Dataset is already present. No need to re-download it.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare for running in Dask environment\n", + "\n", + "Let's start the Dask local cluster environment for distributed computation.\n", + "\n", + "Dask provides a web-based dashboard to help to track progress, identify performance issues, and debug failures. To learn more about Dask dashboard, just follow this [link](https://distributed.dask.org/en/latest/web.html).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 2
  • \n", + "
  • Cores: 2
  • \n", + "
  • Memory: 100.00 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Start the Dask local cluster environment for distrubuted computation\n", + "from dask_cuda import LocalCUDACluster\n", + "from dask.distributed import Client\n", + "\n", + "cluster = LocalCUDACluster()\n", + "client = Client(cluster)\n", + "client\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Though our stock dataset is small enough to fit in a single 16G GPU, to show how to do distributed computation, we will split the dataframe into small pieces to be loaded by different workers in the cluster.\n", + "\n", + "Notice this step is need only if the dataset is not split in multiple files yet.\n", + "\n", + "First use this simple taskgraph to load data then sort it by the asset id and datatime:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "61924f0cdef24365b2086fdafc7b5acb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys; sys.path.insert(0, '..')\n", + "from gquant.dataframe_flow import TaskGraph\n", + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/sort_stocks.gq.yaml')\n", + "input_cached, = task_graph.run()\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "convert the sorted stock data into partitions and save it into csv files. 
Note, the data is split in a way that the same asset belongs to the same partition" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/home/yi/Projects/demo_gquant_install/notebooks/many-small/0.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/1.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/2.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/3.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/4.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/5.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/6.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/7.csv']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dask.dataframe as dd\n", + "import os\n", + "num_partitions = 8\n", + "\n", + "os.makedirs('many-small', exist_ok=True)\n", + "dd.from_pandas(input_cached.set_index('asset'), npartitions=num_partitions).reset_index().to_csv('many-small/*.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The toy example\n", + "In this notebook, we will use a simple toy example to show how easy it is to accelerate the quant workflow in the GPU.\n", + "\n", + "To mimic the end-to-end quantitative analyst task, we are going to backtest a simple mean reversion trading strategy.\n", + "\n", + "The workflow can be divided into two steps. 
You can follow along using an empty gQuant widget to build the TaskGraph:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1684faac504943b287881bfe436fd78c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox())" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "task_graph = TaskGraph()\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocess the dataset to remove bad points and add return feature\n", + "\n", + "\n", + "1. Load the 5000 end-of-day stocks CSV data into the dataframe and add rate of return feature to the dataframe.\n", + "\n", + "\n", + "2. Compute the average volume, min/max returns for each of the stocks\n", + "\n", + "\n", + "3. Merge the features into one dataframe, clean up the data by removing low volume stocks and extreme rate of returns stocks.\n", + "\n", + "\n", + "4. Create a composite node for this preprocess task\n", + "\n", + "\n", + "### Apply simple mean reversion algorithm and run backtest\n", + "\n", + "1. Clean up the nodes for the backtest\n", + "\n", + "\n", + "2. Compute the slow and fast exponential moving average and compute the trading signal based on it. Run backtesting and compute the returns from this strategy for each of the days and stock symbols. Run a simple portfolio optimization by averaging the stocks together for each of the trading days. Compute the Sharpe ratio and cumulative return results.\n", + "\n", + "\n", + "3. Change the `slow`, `fast` parameters for the trading strategies and re-run the backtest\n", + "\n", + "\n", + "4. Switch to run the backtest in a distributed environment by Dask\n", + "\n", + "\n", + "5. 
As a reference, switch to run the backtest in a CPU environment by Pandas\n", + "\n", + "\n", + "The whole workflow is organized into a TaskGraph file, which is described in a **gq.yaml** file.\n", + "\n", + "The same taskgraphs are saved in the `taskgraphs` directory. The whole workflow can be organized into a computation graph, which is described in a **yaml** file. \n", + "\n", + "Here is a snippet of the yaml file:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- conf:\n", + " file: notebooks/data/stock_price_hist.csv.gz\n", + " path: notebooks/many-small\n", + " id: stock_data\n", + " inputs: {}\n", + " module: gquant_rapids_plugin.dataloader\n", + " type: CsvStockLoader\n", + "- conf:\n", + " input:\n", + " - sort_node.in\n", + " output:\n", + " - drop_columns.out\n", + " subnode_ids:\n", + " - value_filter\n", + " subnodes_conf:\n", + " value_filter:\n", + " conf:\n", + " - column: min_return\n", + "...\n" + ] + } + ], + "source": [ + "!head -n 18 ../taskgraphs/portfolio_trade.gq.yaml\n", + "print(\"...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Load the preprocess TaskGraph with the `load_taskgraph` command" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c91e0a7be3bf4e69973965709e5eff9d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/preprocess.gq.yaml')\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the whole TaskGraph with the `load_taskgraph` command" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6ce4096ee9dd4ec78d929077650a9899", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/portfolio_trade.gq.yaml')\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this toy example in a Dask distributed environment is super easy, as gQuant operates at dataframe level.\n", + "\n", + "We just need to swap cuDF dataframes for **dask_cuDF** dataframes. Try to connect the `preprocess` node to the `Dask dataframe` output port in the `stock data` node.\n", + "\n", + "Similarly, to see how fast the GPU acceleration is, we can switch to the CPU computation environment by connecting to the `Pandas dataframe` output port.\n", + "\n", + "## Benchmarks\n", + "\n", + "While running this notebook, we have obtained the following results:\n", + "\n", + "- 181.00 seconds to run in CPU (Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz).\n", + "- 9.06 seconds to run in GPU (NVIDIA v100).\n", + "\n", + "We get ~20x speed up by using GPU and GPU dataframes, compared to CPU and CPU dataframes.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## GQuant Task Node \n", + "\n", + "Each node is composed of:\n", + "- a unique id,\n", + "- a node type, \n", + "- configuration parameters\n", + "- from zero to many input node ids.\n", + "\n", + "gQuant's `load_taskgraph` takes this yaml file, and wires it into a graph.\n", + "\n", + "gQuant implementation includes some common nodes, useful for quantitative finance. 
With the help of [Numba](https://numba.pydata.org) library, we have implemented more than 30 technical indicators used in computing trading signals. All of them computed in the GPU.\n", + "\n", + "However, gQuant's goal is not to be comprehensive for quant applications. It provides a framework that is easy for anyone to implement his own nodes in the gQuant.\n", + "\n", + "\n", + "Data scientists only need to override five methods in the parent class `Node`:\n", + "- `init`\n", + "- `meta_setup`\n", + "- `ports_setup`\n", + "- `conf_schema`\n", + "- `process`\n", + "\n", + "`init` method is usually used to define the required column names\n", + "\n", + "`ports_setup` defines the input and output ports for the node\n", + "\n", + "`meta_setup` method is used to calculate the output meta name and types.\n", + "\n", + "`conf_schema` method is used to define the JSON schema for the node conf so the client can generate the proper UI for it.\n", + "\n", + "`process` method takes input dataframes and computes the output dataframe. \n", + "\n", + "In this way, dataframes are strongly typed, and errors can be detected early before the time-consuming computation happens.\n", + "\n", + "Here is the code example for implementing `MaxNode`, which is to compute the maximum value for a specified column in the dataframe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from gquant.dataframe_flow import Node\n", + "from gquant_rapids_plugin._port_type_node import _PortTypesMixin\n", + "from gquant.dataframe_flow.portsSpecSchema import ConfSchema\n", + "\n", + "\n", + "class MaxNode(Node, _PortTypesMixin):\n", + "\n", + " def init(self):\n", + " _PortTypesMixin.init(self)\n", + " self.INPUT_PORT_NAME = 'in'\n", + " self.OUTPUT_PORT_NAME = 'out'\n", + "\n", + " def ports_setup(self):\n", + " return _PortTypesMixin.ports_setup(self)\n", + "\n", + " def conf_schema(self):\n", + " json = {\n", + " \"title\": \"Maximum Value Node configure\",\n", + " \"type\": \"object\",\n", + " \"description\": \"Compute the maximum value of the key column\",\n", + " \"properties\": {\n", + " \"column\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"column to calculate the maximum value\"\n", + " }\n", + " },\n", + " \"required\": [\"column\"],\n", + " }\n", + " input_meta = self.get_input_meta()\n", + " if self.INPUT_PORT_NAME in input_meta:\n", + " col_from_inport = input_meta[self.INPUT_PORT_NAME]\n", + " enums = [col for col in col_from_inport.keys()]\n", + " json['properties']['column']['enum'] = enums\n", + " ui = {}\n", + " return ConfSchema(json=json, ui=ui)\n", + " else:\n", + " ui = {\n", + " \"column\": {\"ui:widget\": \"text\"}\n", + " }\n", + " return ConfSchema(json=json, ui=ui)\n", + "\n", + " def process(self, inputs):\n", + " \"\"\"\n", + " Compute the maximum value of the key column which is defined in the\n", + " `column` of the node's conf\n", + "\n", + " Arguments\n", + " -------\n", + " inputs: list\n", + " list of input dataframes.\n", + " Returns\n", + " -------\n", + " dataframe\n", + " \"\"\"\n", + " input_df = inputs[self.INPUT_PORT_NAME]\n", + " max_column = self.conf['column']\n", + " volume_df = input_df[[max_column,\n", + " \"asset\"]].groupby([\"asset\"]).max().reset_index()\n", + " 
volume_df.columns = ['asset', max_column]\n", + " return {self.OUTPUT_PORT_NAME: volume_df}\n", + "\n", + " def meta_setup(self):\n", + " cols_required = {\"asset\": \"int64\"}\n", + " if 'column' in self.conf:\n", + " retention = {self.conf['column']: \"float64\",\n", + " \"asset\": \"int64\"}\n", + " return _PortTypesMixin.retention_meta_setup(self,\n", + " retention,\n", + " required=cols_required)\n", + " else:\n", + " retention = {\"asset\": \"int64\"}\n", + " return _PortTypesMixin.retention_meta_setup(self,\n", + " retention,\n", + " required=cols_required)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In case there is no direct dataframe method for a particular piece of logic, a Numba GPU kernel can be used to implement it. Some examples of customized GPU kernels in Numba can be found [here](https://github.com/rapidsai/gQuant/blob/master/notebooks/05_customize_nodes.ipynb).\n", + "\n", + "If we use customized GPU kernel functions inside the `process` method to process the dataframe instead of _normal_ dataframe API function calls, we need to add `self.delayed_process = True` in the `meta_setup` method to let gQuant handle the dask graph integration problem. If we use _normal_ dataframe API functions inside the `process` method, nothing needs to be done as `self.delayed_process = False` by default. gQuant automatically handles the complication of including a customized GPU kernel node into the Dask computation graph.\n", + "\n", + "Note, we set `self.delayed_process = True` for the `SortNode`. So the sort is performed at the Dask data partition level instead of sorting it globally. This has the benefit of guaranteeing the sorting doesn't pollute the data partition allocation, as sometimes we want to make sure the data partitions remain the same during the distributed computation. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running the TaskGraph programmatically\n", + "\n", + "\n", + "To run the TaskGraph programmatically, we can specify a list of output ports to the TaskGraph `run` method. The `profile` flag can be used to see the computation time spent on each of the nodes:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:stock_data process time:4.120s\n", + "id:preprocess process time:0.879s\n", + "id:sort_after process time:0.055s\n", + "id:exp_mean_reversion process time:0.796s\n", + "id:backtest process time:0.001s\n", + "id:portfolio_opt process time:0.012s\n", + "id:sharpe_ratio process time:0.001s\n", + "id:cumulative_return process time:0.016s\n" + ] + } + ], + "source": [ + "import warnings; warnings.simplefilter(\"ignore\")\n", + "\n", + "o_gpu = task_graph.run(\n", + " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return','stock_data.cudf_out', 'preprocess.drop_columns@out'], profile=True)\n", + "gpu_strategy_cached = o_gpu['preprocess.drop_columns@out'] \n", + "gpu_input_cached = o_gpu['stock_data.cudf_out'] " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`o_gpu` will contain the outputs of four nodes: `sharpe_ratio`, `cumulative_return`, `stock_data`, `preprocess`.\n", + "\n", + "Similarly, the output from `stock_data` and `preprocess` nodes will be cached in the `gpu_input_cached` and `gpu_strategy_cached` variables for later use. 
\n", + "\n", + "We can check how many of the stocks are filtered out by preprocessing steps:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5052 stocks in original dataset.\n", + "1558 stocks remaining after filtering.\n" + ] + } + ], + "source": [ + "print(\"{} stocks in original dataset.\".format(len(gpu_input_cached['asset'].unique())))\n", + "print(\"{} stocks remaining after filtering.\".format(len(gpu_strategy_cached['asset'].unique())))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result can be shown in IPython Rich display by turnning on the `formatted` flag:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ca90df245c254e4ba78ae8ac9bd404ed", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tab(children=(Output(), Output(), Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph.run(\n", + " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return','preprocess.drop_columns@out'], formated=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This toy strategy gets a Sharpe ratio 0.338 without considering the transaction cost. Nice! \n", + "\n", + "[bqplot](https://github.com/bloomberg/bqplot) library is used to visualize the backtesting results in the JupyterLab notebooks. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dc0336888fa940d4903d91ce87244888", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale(), side='left'), Axis(l…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# define the function to format the plots\n", + "def plot_figures(outputs):\n", + " # format the figures\n", + " figure_width = '1200px'\n", + " figure_height = '400px'\n", + " sharpe_number = outputs[0]\n", + " cum_return = outputs[1]\n", + " cum_return.layout.height = figure_height\n", + " cum_return.layout.width = figure_width\n", + " cum_return.title = 'P & L %.3f' % (sharpe_number)\n", + " return cum_return\n", + "\n", + "plot_figures(o_gpu)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "gQuant TaskGraph can be evaluated by overwritting any of the Node's parameters. E.g. 
we can change the parameters to filter out the stocks:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Define some constants for the data filters.\n", + "# If using a GPU of 32G memory, you can safely \n", + "# set the `min_volume` to 5.0\n", + "min_volume = 10.0\n", + "min_rate = -10.0\n", + "max_rate = 10.0" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:stock_data process time:4.021s\n", + "id:preprocess process time:0.702s\n", + "id:sort_after process time:0.130s\n", + "id:exp_mean_reversion process time:0.079s\n", + "id:backtest process time:0.003s\n", + "id:portfolio_opt process time:0.017s\n", + "id:sharpe_ratio process time:0.001s\n", + "id:cumulative_return process time:0.015s\n", + "5052 stocks in original dataset.\n", + "4405 stocks remaining after filtering.\n", + "CPU times: user 4.82 s, sys: 725 ms, total: 5.54 s\n", + "Wall time: 5.36 s\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4227850521c94892a4fe3c2b94183811", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale()), Axis(label='Time', …" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%time\n", + "o_gpu = task_graph.run(\n", + " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return', 'stock_data.cudf_out', 'preprocess.drop_columns@out'], \n", + " replace={'preprocess': {\"conf\": {\n", + " \"subnodes_conf\": {\n", + " \"value_filter\": {\n", + " \"conf\": [{\"column\": \"average_volume\", \"min\": min_volume},\n", + " {\"column\": \"max_return\", \"max\": max_rate},\n", + " {\"column\": \"min_return\", \"min\": min_rate}]\n", + " }\n", + " },\n", + " \"taskgraph\": \"taskgraphs/preprocess.gq.yaml\",\n", + " \"input\": 
[\"sort_node.in\"],\n", + " \"output\": [\"drop_columns.out\"]\n", + " }}\n", + " }, profile=True)\n", + "\n", + "gpu_input_cached = o_gpu['stock_data.cudf_out'] \n", + "gpu_strategy_cached = o_gpu['preprocess.drop_columns@out'] \n", + "print(\"{} stocks in original dataset.\".format(len(gpu_input_cached['asset'].unique())))\n", + "print(\"{} stocks remaining after filtering.\".format(len(gpu_strategy_cached['asset'].unique())))\n", + "plot_figures(o_gpu)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the example above, the `preprocess` node is a composite node that uses a separate TaskGraph as input and output. Any of the nodes inside the composite node TaskGraph configuration can be overridden as shown in the example. We change the `value_filter` node configuration inside the composite node to filter out the stocks that are not suitable for backtesting. It will discard stocks according to the values stored in `min_volume`, `min_rate`, and `max_rate` variables.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we are going to compare the performance difference between CPU and GPU. 
The same computation graph can be used to flow the CPU Pandas dataframe with one change that\n", + "the preprocess node need to get input from the Pandas dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:stock_data process time:64.310s\n", + "id:preprocess process time:22.999s\n", + "id:sort_after process time:1.303s\n", + "id:exp_mean_reversion process time:6.645s\n", + "id:backtest process time:0.043s\n", + "id:portfolio_opt process time:0.448s\n", + "id:sharpe_ratio process time:0.001s\n", + "id:cumulative_return process time:0.014s\n", + "CPU times: user 1min 23s, sys: 14.5 s, total: 1min 38s\n", + "Wall time: 1min 35s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "o_cpu = task_graph.run(\n", + " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return'], \n", + " replace={'preprocess': {\"inputs\": {\"sort_node@in\": \"stock_data.pandas_out\"}}}, profile=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8e2c4dadba0148d5b9294a42ec0689a3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale(), side='left'), Axis(l…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_figures(o_cpu)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It produces the same result as the single GPU version but a lot slower.\n", + "\n", + "While running this notebook, we have obtained the following results:\n", + "\n", + "- 181.00 seconds to run in CPU (Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz).\n", + "- 9.06 seconds to run in GPU (NVIDIA v100).\n", + "\n", + "We get ~20x speed up by using GPU and GPU dataframes, compared to CPU and CPU dataframes.\n", 
+ "\n", + "Note, the input nodes load the dataframes from the cache variables to save the disk IO time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The distributed computation is turned on by changing the preprocess node's input dataframe to dask dataframe: " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:stock_data process time:0.115s\n", + "id:preprocess process time:7.280s\n", + "id:backtest process time:0.013s\n", + "id:portfolio_opt process time:0.036s\n", + "id:sharpe_ratio process time:0.328s\n", + "id:cumulative_return process time:0.353s\n", + "CPU times: user 3.85 s, sys: 377 ms, total: 4.22 s\n", + "Wall time: 14.4 s\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "abf0fa6953074fad9f9bd7e7fa08872a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale()), Axis(label='Time', …" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%time\n", + "o_dask = task_graph.run(\n", + " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return'], \n", + " replace={'preprocess': {\"inputs\": {\"sort_node@in\": \"stock_data.dask_cudf_out\"}}}, profile=True)\n", + "plot_figures(o_dask)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, it produces the same results. However, the performance is not better than in the single GPU scenarios.\n", + "\n", + "Distributed computation only makes sense if we have a very large dataset that cannot be fit into one GPU.\n", + "\n", + "In this example, the dataset is small enough to be loaded into a single GPU. The between-GPU communication overhead dominates in the computation." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Strategy parameter search\n", + "Quantitative analysts often need to explore different parameters for their trading strategy.\n", + "\n", + "gQuant speeds up this iterative exploration process by using cached dataframes and sub-graphs evaluation.\n", + "\n", + "To find the optimal parameters for this toy mean reversion strategy, we only need the dataframe from `sort_2` node, which is cached in the `gpu_strategy_cached` variable.\n", + "\n", + "Because the GPU computation is so fast, we can make the parameter exploration interactive in the JupyterLab notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "30f5d401388b40648ce422e3973f4d9f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HBox(children=(IntRangeSlider(value=(10, 30), continuous_update=False, description='MA:', max=6…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import ipywidgets as widgets\n", + "\n", + "para_selector = widgets.IntRangeSlider(value=[10, 30],\n", + " min=3,\n", + " max=60,\n", + " step=1,\n", + " description=\"MA:\",\n", + " disabled=False,\n", + " continuous_update=False,\n", + " orientation='horizontal',\n", + " readout=True)\n", + "\n", + "\n", + "def para_selection(*stocks):\n", + " with out:\n", + " para1 = para_selector.value[0]\n", + " para2 = para_selector.value[1]\n", + " o = task_graph.run(\n", + " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return'],\n", + " replace={'exp_mean_reversion': {'conf': {'fast': para1,\n", + " 'slow': para2}},\n", + " 'preprocess': {\"load\": {\"drop_columns@out\": gpu_strategy_cached},\n", + " \"conf\": {\n", + " \"subnodes_conf\": {\n", + " \"value_filter\": {\n", + " \"conf\": [{\"column\": \"average_volume\", \"min\": min_volume},\n", + " {\"column\": 
\"max_return\", \"max\": max_rate},\n", + " {\"column\": \"min_return\", \"min\": min_rate}]\n", + " }\n", + " },\n", + " \"taskgraph\": \"taskgraphs/preprocess.gq.yaml\",\n", + " \"input\": [\"sort_node.in\"],\n", + " \"output\": [\"drop_columns.out\"]\n", + " }}})\n", + "\n", + " figure_combo = plot_figures(o)\n", + " w.children = (w.children[0], figure_combo,)\n", + "\n", + "\n", + "out = widgets.Output(layout={'border': '1px solid black'})\n", + "para_selector.observe(para_selection, 'value')\n", + "selectors = widgets.HBox([para_selector])\n", + "w = widgets.VBox([selectors])\n", + "w" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2b093c4f26244fdf831e4b1507dec36a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output(layout=Layout(border='1px solid black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'ok', 'restart': True}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/gQuant/plugins/rapids_plugin/notebooks/05_customize_nodes_with_ports.ipynb b/gQuant/plugins/rapids_plugin/notebooks/05_customize_nodes_with_ports.ipynb new file mode 100644 index 00000000..45578aa8 --- /dev/null +++ b/gQuant/plugins/rapids_plugin/notebooks/05_customize_nodes_with_ports.ipynb @@ -0,0 +1,1842 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Customize your own GPU Kernels in gQuant\n", + "\n", + "The gQuant is designed to accelerate quantitative finance workflows on the GPU. The acceleration on GPU is facilitated by using cuDF dataframes organized into a computation graph. The cuDF project is a continuously evolving library that provides a pandas-like API. Sometimes the data scientists are facing a few challenges that cannot be easily solved:\n", + "\n", + " 1. The quantitative work needs customized logic to manipulate the data, and there are no direct methods within cuDF to support this logic.\n", + " 2. Each cuDF dataframe method call launches the GPU kernel once. For performance-critical tasks, it is sometimes required to wrap lots of computation steps together in a single GPU kernel to reduce the kernel launch overheads.\n", + "\n", + "The solution is to build customized GPU kernels to implement them. The code and examples below illustrate a variety of approaches to implement customized GPU kernels in Python." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys; sys.path.insert(0, '..')\n", + "# Load necessary Python modules\n", + "import sys\n", + "from gquant.dataframe_flow import TaskSpecSchema, TaskGraph, MetaData\n", + "from gquant.dataframe_flow import Node, NodePorts, PortsSpecSchema\n", + "from gquant.dataframe_flow import ConfSchema\n", + "import cudf\n", + "import numpy as np\n", + "from numba import cuda\n", + "import cupy\n", + "import math\n", + "import dask\n", + "import dask_cudf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define a utility function to verify the results:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def verify(ground_truth, computed):\n", + " max_difference = (ground_truth - computed).abs().max()\n", + " # print('Max Difference: {}'.format(max_difference))\n", + " assert(max_difference < 1e-8)\n", + " return max_difference" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "task_graph = TaskGraph()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example Problem: Calculating the distance of points to the origin\n", + "\n", + "The sample problem is to take a list of points in 2-D space and compute their distance to the origin.\n", + "We start by creating a source `Node` in the graph that generates a cuDF dataframe containing some configurable number of random points. A custom node is defined by inheriting from the `Node` class and overriding methods `init`, `meta_setup`, `ports_setup`, `conf_schema`, `process`.\n", + "\n", + "The `ports_setup` must return an instance of `NodePorts` which encapsulates the ports specs. 
Ports specs are dictionaries with port attributes/options per `PortsSpecSchema`.\n", + "\n", + "In the case of the `PointNode` below the input port is an empty dictionary, since no inputs are required, and the output has two ports `points_df_out` and `points_ddf_out`. It can output two types of dataframes depending on which output port is connected.\n", + "\n", + "The `process` method receives an input dictionary where keys are input ports and values are input data. It returns a dictionary where the keys correspond to the output ports. \n", + "\n", + "The `meta_setup` is used to compute the output meta information. It returns a dictionary where keys correspond to the output ports.\n", + "\n", + "The `conf_schema` is used to define the Node configuration [JSON schema](https://json-schema.org/). gQuantlab UI uses [RJSF](https://github.com/rjsf-team/react-jsonschema-form) project to generate HTML form elements based on the JSON schema. [RJSF playground](https://rjsf-team.github.io/react-jsonschema-form/) is a good place to learn how to write JSON schema and visualize it. `conf_schema` returns `ConfSchema` which encapsulates the JSON schema and UI schema together.\n", + "\n", + "The `column` and `port_types` information sometimes are determined dynamically. gQuant provides a few utility functions to help get dynamic graph information. `self.get_connected_inports()` will return a dictionary where keys are connected inport names and values are inport types. \n", + "`self.get_input_meta()` will return a dictionary where keys are connected inport names and values are column name/type pairs from the parent node. `self.outport_connected(port_name)` method returns a boolean indicating whether the output port `port_name` is connected. The `PointNode` uses it to determine what kind of computation it needs to do depending on the connection." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "class PointNode(Node):\n", + "\n", + " def ports_setup(self):\n", + " input_ports = {}\n", + " output_ports = {\n", + " 'points_df_out': {\n", + " PortsSpecSchema.port_type: cudf.DataFrame\n", + " },\n", + " 'points_ddf_out': {\n", + " PortsSpecSchema.port_type: dask_cudf.DataFrame\n", + " },\n", + " }\n", + " return NodePorts(inports=input_ports, outports=output_ports)\n", + "\n", + " def conf_schema(self):\n", + " json = {\n", + " \"title\": \"PointNode configure\",\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"npts\": {\n", + " \"type\": \"number\",\n", + " \"description\": \"number of data points\",\n", + " \"minimum\": 10\n", + " },\n", + " \"npartitions\": {\n", + " \"type\": \"number\",\n", + " \"description\": \"num of partitions in the Dask dataframe\",\n", + " \"minimum\": 1\n", + " }\n", + "\n", + " },\n", + " \"required\": [\"npts\", \"npartitions\"],\n", + " }\n", + "\n", + " ui = {\n", + " \"npts\": {\"ui:widget\": \"updown\"},\n", + " \"npartitions\": {\"ui:widget\": \"updown\"}\n", + " }\n", + " return ConfSchema(json=json, ui=ui)\n", + "\n", + " def init(self):\n", + " pass\n", + " \n", + " def meta_setup(self):\n", + " columns_out = {\n", + " 'points_df_out': {\n", + " 'x': 'float64',\n", + " 'y': 'float64'\n", + " },\n", + " 'points_ddf_out': {\n", + " 'x': 'float64',\n", + " 'y': 'float64'\n", + " }\n", + " }\n", + " return MetaData(inports={}, outports=columns_out)\n", + "\n", + " def process(self, inputs):\n", + " npts = self.conf['npts']\n", + " df = cudf.DataFrame()\n", + " df['x'] = np.random.rand(npts)\n", + " df['y'] = np.random.rand(npts)\n", + " output = {}\n", + " if self.outport_connected('points_df_out'):\n", + " output.update({'points_df_out': df})\n", + " if self.outport_connected('points_ddf_out'):\n", + " npartitions = self.conf['npartitions']\n", + " ddf = dask_cudf.from_cudf(df, 
npartitions=npartitions)\n", + " output.update({'points_ddf_out': ddf})\n", + " return output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The distance can be computed via cuDF methods. We define the `DistanceNode` to calculate the Euclidean distance and add a `distance_cudf` column to the output dataframe. We will use that as the ground truth to compare and verify results later. Additionally, the distance node calculates absolute distance (Manhattan distance) in another output port. Each computation is performed only if its output port is connected.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "class DistanceNode(Node):\n", + "\n", + " def ports_setup(self):\n", + " port_type = PortsSpecSchema.port_type\n", + " input_ports = {\n", + " 'points_df_in': {\n", + " port_type: [cudf.DataFrame, dask_cudf.DataFrame]\n", + " }\n", + " }\n", + "\n", + " output_ports = {\n", + " 'distance_df': {\n", + " port_type: [cudf.DataFrame, dask_cudf.DataFrame]\n", + " },\n", + " 'distance_abs_df': {\n", + " PortsSpecSchema.port_type: [cudf.DataFrame, dask_cudf.DataFrame]\n", + " }\n", + " }\n", + " input_connections = self.get_connected_inports()\n", + " if 'points_df_in' in input_connections:\n", + " types = input_connections['points_df_in']\n", + " # connected, use the types passed in from parent\n", + " return NodePorts(inports={'points_df_in': {port_type: types}},\n", + " outports={'distance_df': {port_type: types},\n", + " 'distance_abs_df': {port_type: types},\n", + " })\n", + " else:\n", + " return NodePorts(inports=input_ports, outports=output_ports)\n", + "\n", + " def conf_schema(self):\n", + " return ConfSchema()\n", + "\n", + " def init(self):\n", + " self.delayed_process = True\n", + "\n", + "\n", + " def meta_setup(self):\n", + " req_cols = {\n", + " 'x': 'float64',\n", + " 'y': 'float64'\n", + " }\n", + " required = {\n", + " 'points_df_in': req_cols,\n", + " }\n", + " 
input_meta = self.get_input_meta()\n", + " output_cols = ({\n", + " 'distance_df': {\n", + " 'distance_cudf': 'float64',\n", + " 'x': 'float64',\n", + " 'y': 'float64'\n", + " },\n", + " 'distance_abs_df': {\n", + " 'distance_abs_cudf': 'float64',\n", + " 'x': 'float64',\n", + " 'y': 'float64'\n", + " }\n", + " })\n", + " if 'points_df_in' in input_meta:\n", + " col_from_inport = input_meta['points_df_in']\n", + " # additional ports\n", + " output_cols['distance_df'].update(col_from_inport)\n", + " output_cols['distance_abs_df'].update(col_from_inport)\n", + " return MetaData(inports=required, outports=output_cols)\n", + "\n", + " def process(self, inputs):\n", + " df = inputs['points_df_in']\n", + " output = {}\n", + " if self.outport_connected('distance_df'):\n", + " copy_df = df.copy()\n", + " copy_df['distance_cudf'] = (df['x'] ** 2 + df['y'] ** 2).sqrt()\n", + " output.update({'distance_df': copy_df})\n", + " if self.outport_connected('distance_abs_df'):\n", + " copy_df = df.copy()\n", + " copy_df['distance_abs_cudf'] = df['x'].abs() + df['y'].abs()\n", + " output.update({'distance_abs_df': copy_df})\n", + " return output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Having these two nodes, we can construct a simple task graph to compute the distance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Task specifications.\n", + "module_name = 'custom_nodes'\n", + "\n", + "points_tspec = {\n", + " TaskSpecSchema.task_id: 'points_task',\n", + " TaskSpecSchema.node_type: PointNode,\n", + " TaskSpecSchema.conf: {'npts': 1000},\n", + " TaskSpecSchema.module: module_name,\n", + " TaskSpecSchema.inputs: {},\n", + "}\n", + "\n", + "cudf_distance_tspec = {\n", + " TaskSpecSchema.task_id: 'distance_by_cudf',\n", + " TaskSpecSchema.node_type: DistanceNode,\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.module: module_name,\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 'points_task.points_df_out'\n", + " }\n", + "}\n", + "\n", + "out_spec = {\n", + " TaskSpecSchema.task_id: '',\n", + " TaskSpecSchema.node_type: \"Output_Collector\",\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.inputs: {\n", + " 'in0': 'points_task.points_df_out',\n", + " 'in1': 'distance_by_cudf.distance_df',\n", + " 'in2': 'distance_by_cudf.distance_abs_df'\n", + " }\n", + "}\n", + "\n", + "task_list = [points_tspec, cudf_distance_tspec, out_spec]\n", + "task_graph = TaskGraph(task_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can draw the graph in an interactive widget. First, let's register the dynamically defined gQuant nodes so the client knows about them. Note, this step is only needed if we would like to interact with gQuant by Jupyterlab UI. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3a1e2a3eb1394ce3bcab13db4daf6ec9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox())" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "TaskGraph.register_lab_node(module_name, PointNode)\n", + "TaskGraph.register_lab_node(module_name, DistanceNode)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Draw the widget:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "85fd29bfb68c4dcdabf1b5fe2aa604be", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next step is to run the task graph to obtain the distances. 
The output is identified by the `id` of the distance node:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "119fc64a511742dfb9b2db98ed120971", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tab(children=(Output(), Output(), Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "r = task_graph.run()\n", + "points_df = r['points_task.points_df_out']\n", + "task_graph.run(formated=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Customized Kernel with Numba library\n", + "\n", + "Numba is an excellent python library used for accelerating numerical computations. Numba supports CUDA GPU programming by directly compiling a restricted subset of Python code into CUDA kernels and device functions. The Numba GPU kernel is written in Python and translated (JIT just-in-time compiled) into GPU code at runtime. This is achieved by decorating a Python function with `@cuda.jit`. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just like a C/C++ CUDA GPU kernel, the `distance_kernel` function is called by thousands of threads in the GPU. The thread id is computed by `threadIdx.x`, `blockIdx.x` and `blockDim.x` built-in variables. Please check the [CUDA programming guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-hierarchy) for details." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A cuDF series can be converted to GPU arrays compatible with the Numba library via the `to_gpu_array` API. 
The next step is to define a Node that calls this Numba kernel to compute the distance and save the result into `distance_numba` column in the output dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import cupy\n", + "@cuda.jit\n", + "def distance_kernel(x, y, distance, array_len):\n", + " # ii - overall thread index\n", + " ii = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n", + " if ii < array_len:\n", + " distance[ii] = math.sqrt(x[ii] ** 2 + y[ii] ** 2)\n", + "\n", + "\n", + "class NumbaDistanceNode(Node):\n", + "\n", + " def ports_setup(self):\n", + " port_type = PortsSpecSchema.port_type\n", + " input_ports = {\n", + " 'points_df_in': {\n", + " port_type: [cudf.DataFrame,\n", + " dask_cudf.DataFrame]\n", + " }\n", + " }\n", + "\n", + " output_ports = {\n", + " 'distance_df': {\n", + " port_type: [cudf.DataFrame,\n", + " dask_cudf.DataFrame]\n", + " }\n", + " }\n", + "\n", + " input_connections = self.get_connected_inports()\n", + " if 'points_df_in' in input_connections:\n", + " types = input_connections['points_df_in']\n", + " # connected\n", + " return NodePorts(inports={'points_df_in': {port_type: types}},\n", + " outports={'distance_df': {port_type: types}})\n", + " else:\n", + " return NodePorts(inports=input_ports, outports=output_ports)\n", + " \n", + " def init(self):\n", + " self.delayed_process = True\n", + "\n", + "\n", + " def meta_setup(self,):\n", + " required_cols = {'x': 'float64',\n", + " 'y': 'float64'}\n", + " required = {\n", + " 'points_df_in': required_cols,\n", + " 'distance_df': required_cols\n", + " }\n", + " input_meta = self.get_input_meta()\n", + " output_cols = ({\n", + " 'distance_df': {\n", + " 'distance_numba': 'float64',\n", + " 'x': 'float64',\n", + " 'y': 'float64'\n", + " }\n", + " })\n", + " if 'points_df_in' in input_meta:\n", + " col_from_inport = input_meta['points_df_in']\n", + " # additional ports\n", + " 
output_cols['distance_df'].update(col_from_inport)\n", + " return MetaData(inports=required, outports=output_cols)\n", + "\n", + " def conf_schema(self):\n", + " return ConfSchema()\n", + "\n", + " def process(self, inputs):\n", + " df = inputs['points_df_in']\n", + " number_of_threads = 16\n", + " number_of_blocks = ((len(df) - 1) // number_of_threads) + 1\n", + " # Inits device array by setting 0 for each index.\n", + " darr = cuda.device_array(len(df))\n", + " distance_kernel[(number_of_blocks,), (number_of_threads,)](\n", + " df['x'],\n", + " df['y'],\n", + " darr,\n", + " len(df))\n", + " df['distance_numba'] = darr\n", + " return {'distance_df': df}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `self.delayed_process = True` flag set in the `init` method is necessary to enable the logic in the `Node` class for handling `dask_cudf` dataframes in order to use Dask (for distributed computation i.e. multi-gpu in examples later on). The `dask_cudf` dataframe does not support GPU customized kernels directly. The `to_delayed` and `from_delayed` low level interfaces of `dask_cudf` enable this support. The gQuant framework handles `dask_cudf` dataframes automatically under the hood when we set this flag." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Customized Kernel by CuPy library\n", + "\n", + "CuPy is an alternative to Numba. Numba JIT compiles Python code into GPU device code at runtime. There are some limitations in how Numba can be used as well as JIT compilation latency overhead. When a Python process calls a Numba GPU kernel for the first time Numba has to compile the Python code, and each time a new Python process is started the GPU kernel has to be recompiled. If advanced features of CUDA are needed and latency is important, CuPy is an alternative library that can be used to compile C/C++ CUDA code. 
CuPy caches the GPU device code on disk (default location `$(HOME)/.cupy/kernel_cache` which can be changed via `CUPY_CACHE_DIR` environment variable) thus eliminating compilation latency for subsequent Python processes.\n", + "\n", + "A `CuPy` GPU kernel is essentially a C/C++ GPU kernel. Below we define the `compute_distance` kernel using `CuPy`:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using gQuant we can now define a Node that calls this CuPy kernel to compute the distance and save the results into `distance_cupy` column of a `cudf` dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "kernel_string = r'''\n", + " extern \"C\" __global__\n", + " void compute_distance(const double* x, const double* y,\n", + " double* distance, int arr_len) {\n", + " int tid = blockDim.x * blockIdx.x + threadIdx.x;\n", + " if (tid < arr_len){\n", + " distance[tid] = sqrt(x[tid]*x[tid] + y[tid]*y[tid]);\n", + " }\n", + " }\n", + "'''\n", + "\n", + "\n", + "class CupyDistanceNode(Node):\n", + "\n", + " def ports_setup(self):\n", + " port_type = PortsSpecSchema.port_type\n", + " input_ports = {\n", + " 'points_df_in': {\n", + " port_type: [cudf.DataFrame,\n", + " dask_cudf.DataFrame]\n", + " }\n", + " }\n", + "\n", + " output_ports = {\n", + " 'distance_df': {\n", + " port_type: [cudf.DataFrame,\n", + " dask_cudf.DataFrame]\n", + " }\n", + " }\n", + "\n", + " input_connections = self.get_connected_inports()\n", + " if 'points_df_in' in input_connections:\n", + " types = input_connections['points_df_in']\n", + " # connected\n", + " return NodePorts(inports={'points_df_in': {port_type: types}},\n", + " outports={'distance_df': {port_type: types}})\n", + " else:\n", + " return NodePorts(inports=input_ports, outports=output_ports)\n", + "\n", + " def init(self):\n", + " self.delayed_process = True\n", + "\n", + "\n", + " def meta_setup(self,):\n", + " cols_required = {'x': 
'float64',\n", + " 'y': 'float64'}\n", + " required = {\n", + " 'points_df_in': cols_required,\n", + " 'distance_df': cols_required\n", + " }\n", + " input_meta = self.get_input_meta()\n", + " output_cols = ({\n", + " 'distance_df': {\n", + " 'distance_cupy': 'float64',\n", + " 'x': 'float64',\n", + " 'y': 'float64'\n", + " }\n", + " })\n", + " if 'points_df_in' in input_meta:\n", + " col_from_inport = input_meta['points_df_in']\n", + " # additional ports\n", + " output_cols['distance_df'].update(col_from_inport)\n", + " return MetaData(inports=required, outports=output_cols)\n", + "\n", + " def conf_schema(self):\n", + " return ConfSchema()\n", + "\n", + " def get_kernel(self):\n", + " raw_kernel = cupy.RawKernel(kernel_string, 'compute_distance')\n", + " return raw_kernel\n", + "\n", + " def process(self, inputs):\n", + " df = inputs['points_df_in']\n", + " cupy_x = cupy.asarray(df['x'])\n", + " cupy_y = cupy.asarray(df['y'])\n", + " number_of_threads = 16\n", + " number_of_blocks = (len(df) - 1) // number_of_threads + 1\n", + " dis = cupy.ndarray(len(df), dtype=cupy.float64)\n", + " self.get_kernel()((number_of_blocks,), (number_of_threads,),\n", + " (cupy_x, cupy_y, dis, len(df)))\n", + " df['distance_cupy'] = dis\n", + "\n", + " return {'distance_df': df}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `self.delayed_process = True` flag is added for the same reason as with `NumbaDistanceNode`, i.e. to support `dask_cudf` data frames.\n", + "\n", + "Let's register these two newly added nodes:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "TaskGraph.register_lab_node(module_name, NumbaDistanceNode)\n", + "TaskGraph.register_lab_node(module_name, CupyDistanceNode)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Computing using the Nodes with customized GPU kernels\n", + "\n", + "First we construct the computation graph for gQuant." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0b49a2e2b8d9437cabe925588839a29a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# For comparison to above re-use points dataframe instead\n", + "# of rand generating each time when running the task-graph.\n", + "\n", + "numba_distance_tspec = {\n", + " TaskSpecSchema.task_id: 'distance_by_numba',\n", + " TaskSpecSchema.node_type: NumbaDistanceNode,\n", + " TaskSpecSchema.conf: {}, \n", + " TaskSpecSchema.module: module_name,\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 'points_task.points_df_out'\n", + " },\n", + "}\n", + "\n", + "cupy_distance_tspec = {\n", + " TaskSpecSchema.task_id: 'distance_by_cupy',\n", + " TaskSpecSchema.node_type: CupyDistanceNode,\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.module: module_name,\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 'points_task.points_df_out'\n", + " },\n", + "}\n", + "\n", + "out_spec = {\n", + " TaskSpecSchema.task_id: '',\n", + " TaskSpecSchema.node_type: \"Output_Collector\",\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.inputs: {\n", + " 'in0': 'distance_by_cudf.distance_df',\n", + " 'in1': 'distance_by_numba.distance_df',\n", + " 'in2': 'distance_by_cupy.distance_df'\n", + " }\n", + "}\n", + "\n", + "task_list = [\n", + " points_tspec,\n", + " cudf_distance_tspec,\n", + " numba_distance_tspec,\n", + " cupy_distance_tspec,\n", + " out_spec\n", + "]\n", + "task_graph = TaskGraph(task_list)\n", + "\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or, run it programmatically" + ] + }, + { + "cell_type": "code", + 
"execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d4014aaf33944b86a6618a2d6ecc127e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tab(children=(Output(), Output(), Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out_list = [\n", + " 'distance_by_cudf.distance_df',\n", + " 'distance_by_numba.distance_df',\n", + " 'distance_by_cupy.distance_df'\n", + "]\n", + "cache_load = {\"points_task\": {\"load\": {'points_df_out': points_df}}}\n", + "(df_w_cudf, df_w_numba, df_w_cupy) = task_graph.run(out_list, replace=cache_load)\n", + "task_graph.run(out_list, replace=cache_load, formated=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use `verify` function defined above to verify the results:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Max Difference cudf to numba: 2.220446049250313e-16\n", + "Max Difference cudf to cupy: 2.220446049250313e-16\n" + ] + } + ], + "source": [ + "mdiff = verify(df_w_cudf['distance_cudf'], df_w_numba['distance_numba'])\n", + "print('Max Difference cudf to numba: {}'.format(mdiff))\n", + "mdiff = verify(df_w_cudf['distance_cudf'], df_w_cupy['distance_cupy'])\n", + "print('Max Difference cudf to cupy: {}'.format(mdiff))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To illustrate multi-input nodes let's create a verify node." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "class VerifyNode(Node):\n", + "\n", + " def ports_setup(self):\n", + " input_ports = {\n", + " 'df1': {\n", + " PortsSpecSchema.port_type: [cudf.DataFrame,\n", + " dask_cudf.DataFrame]\n", + " },\n", + " 'df2': {\n", + " PortsSpecSchema.port_type: [cudf.DataFrame,\n", + " dask_cudf.DataFrame]\n", + " }\n", + " }\n", + " output_ports = {\n", + " 'max_diff': {\n", + " PortsSpecSchema.port_type: float\n", + " }\n", + " }\n", + "\n", + " connections = self.get_connected_inports() \n", + " for key in input_ports:\n", + " if key in connections:\n", + " # connected\n", + " types = connections[key]\n", + " input_ports[key].update({PortsSpecSchema.port_type: types})\n", + " return NodePorts(inports=input_ports, outports=output_ports)\n", + "\n", + " def meta_setup(self):\n", + " required ={\n", + " \"df1\": {},\n", + " \"df2\": {}\n", + " }\n", + " return MetaData(inports=required, outports={'max_diff': {}})\n", + "\n", + " def conf_schema(self):\n", + " json = {\n", + " \"title\": \"VerifyNode configure\",\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"df1_col\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"dataframe1 column name\"\n", + " },\n", + " \"df2_col\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"dataframe2 column name\"\n", + " }\n", + " },\n", + " \"required\": [\"df1_col\", \"df2_col\"],\n", + " }\n", + "\n", + " ui = {\n", + " \"df1_col\": {\"ui:widget\": \"text\"},\n", + " \"df2_col\": {\"ui:widget\": \"text\"}\n", + " }\n", + " return ConfSchema(json=json, ui=ui)\n", + "\n", + " def process(self, inputs):\n", + " df1 = inputs['df1']\n", + " df2 = inputs['df2']\n", + " col_df1 = self.conf['df1_col']\n", + " col_df2 = self.conf['df2_col']\n", + "\n", + " df1_col = df1[col_df1]\n", + " if isinstance(df1, dask_cudf.DataFrame):\n", + " # df1_col = df1_col.compute()\n", + " pass\n", + "\n", + " df2_col = 
df2[col_df2]\n", + " if isinstance(df2, dask_cudf.DataFrame):\n", + " # df2_col = df2_col.compute()\n", + " pass\n", + "\n", + " max_difference = (df1_col - df2_col).abs().max()\n", + "\n", + " if isinstance(max_difference, dask.dataframe.core.Scalar):\n", + " max_difference = float(max_difference.compute())\n", + " max_difference = float(max_difference)\n", + " return {'max_diff': max_difference}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Register the `VerifyNode`:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "TaskGraph.register_lab_node(module_name, VerifyNode)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define the full Taskgraph:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ecb7023af0084c43b953f33c03a12cf0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "verify_tspec = {\n", + " TaskSpecSchema.task_id: 'verify_cudf_to_numba',\n", + " TaskSpecSchema.node_type: VerifyNode,\n", + " TaskSpecSchema.conf: {\n", + " 'df1_col': 'distance_cudf',\n", + " 'df2_col': 'distance_numba'\n", + " }, \n", + " TaskSpecSchema.module: module_name,\n", + " TaskSpecSchema.inputs: {\n", + " 'df1': 'distance_by_cudf.distance_df',\n", + " 'df2': 'distance_by_numba.distance_df'\n", + " }\n", + "}\n", + "\n", + "verify_tspec2 = {\n", + " TaskSpecSchema.task_id: 'verify_cudf_to_cupy',\n", + " TaskSpecSchema.node_type: VerifyNode,\n", + " TaskSpecSchema.conf: {\n", + " 'df1_col': 'distance_cudf',\n", + " 'df2_col': 'distance_cupy'\n", + " },\n", + " TaskSpecSchema.module: module_name,\n", + " TaskSpecSchema.inputs: 
{\n", + " 'df1': 'distance_by_cudf.distance_df',\n", + " 'df2': 'distance_by_cupy.distance_df'\n", + " }\n", + "}\n", + "out_spec = {\n", + " TaskSpecSchema.task_id: '',\n", + " TaskSpecSchema.node_type: \"Output_Collector\",\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.inputs: {\n", + " 'in0': 'verify_cudf_to_numba.max_diff',\n", + " 'in1': 'verify_cudf_to_cupy.max_diff'\n", + " }\n", + "}\n", + "\n", + "task_list = [\n", + " points_tspec,\n", + " cudf_distance_tspec,\n", + " numba_distance_tspec,\n", + " cupy_distance_tspec,\n", + " out_spec,\n", + " verify_tspec, \n", + " verify_tspec2\n", + "]\n", + "task_graph = TaskGraph(task_list)\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Max Difference cudf to numba: 2.220446049250313e-16\n", + "Max Difference cudf to cupy: 2.220446049250313e-16\n" + ] + } + ], + "source": [ + "(max_cudf_to_numba_diff, max_cudf_to_cupy_diff) = task_graph.run([\n", + " 'verify_cudf_to_numba.max_diff',\n", + " 'verify_cudf_to_cupy.max_diff'\n", + "])\n", + "print('Max Difference cudf to numba: {}'.format(max_cudf_to_numba_diff))\n", + "print('Max Difference cudf to cupy: {}'.format(max_cudf_to_cupy_diff))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dask distributed computation\n", + "\n", + "Using Dask and `dask-cudf` we can run the Nodes with customized GPU kernels on distributed dataframes. Under the hood of the `Node` class the Dask delayed processing API is handled for cudf dataframes when the `self.delayed_process = True` flag is set.\n", + "\n", + "We first start a distributed Dask environment. When a dask client is instantiated it registers itself as the default Dask scheduler. Therefore all subsequent Dask distributed dataframe operations will run in distributed fashion." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 2
  • \n", + "
  • Cores: 2
  • \n", + "
  • Memory: 100.00 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dask_cuda import LocalCUDACluster\n", + "from dask.distributed import Client\n", + "\n", + "cluster = LocalCUDACluster()\n", + "client = Client(cluster)\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Dask status page can be displayed in a web browser at `:8787`. The ip-address corresponds to the machine where the dask cluster (scheduler) was launched. Most likely same ip-address as where this jupyter notebook is running. Using the Dask status page is convenient for monitoring dask distributed processing. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next step is to partition the `cudf` dataframe into a `dask_cudf` dataframe. Here we make the number of partitions corresponding to the number of workers:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "class DistributedNode(Node):\n", + "\n", + " def ports_setup(self):\n", + " input_ports = {\n", + " 'points_df_in': {\n", + " PortsSpecSchema.port_type: cudf.DataFrame\n", + " }\n", + " }\n", + "\n", + " output_ports = {\n", + " 'points_ddf_out': {\n", + " PortsSpecSchema.port_type: dask_cudf.DataFrame\n", + " }\n", + " }\n", + "\n", + " return NodePorts(inports=input_ports, outports=output_ports)\n", + "\n", + " def init(self):\n", + " pass\n", + "\n", + " def meta_setup(self,):\n", + " cols_required = {\n", + " 'x': 'float64',\n", + " 'y': 'float64'\n", + " }\n", + " required = {\n", + " 'points_df_in': cols_required,\n", + " 'points_ddf_out': cols_required\n", + " }\n", + " input_meta = self.get_input_meta()\n", + " output_cols = ({\n", + " 'points_ddf_out': {\n", + " 'x': 'float64',\n", + " 'y': 'float64'\n", + " }\n", + " })\n", + " if 'points_df_in' in input_meta:\n", + " col_from_inport = input_meta['points_df_in']\n", + " # 
additional ports\n", + " output_cols['points_ddf_out'].update(col_from_inport)\n", + " return MetaData(inports=required, outports=output_cols)\n", + "\n", + " def conf_schema(self):\n", + " json = {\n", + " \"title\": \"DistributedNode configure\",\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"npartitions\": {\n", + " \"type\": \"number\",\n", + " \"description\": \"num of partitions in the Dask dataframe\",\n", + " \"minimum\": 1\n", + " }\n", + " },\n", + " \"required\": [\"npartitions\"],\n", + " }\n", + "\n", + " ui = {\n", + " \"npartitions\": {\"ui:widget\": \"updown\"}\n", + " }\n", + " return ConfSchema(json=json, ui=ui)\n", + "\n", + " def process(self, inputs):\n", + " npartitions = self.conf['npartitions']\n", + " df = inputs['points_df_in']\n", + " ddf = dask_cudf.from_cudf(df, npartitions=npartitions)\n", + " return {'points_ddf_out': ddf}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Register it:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "TaskGraph.register_lab_node(module_name, DistributedNode)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We add this distribution node to the computation graph to convert `cudf` dataframes into `dask-cudf` dataframes. The `dask-cudf` dataframes are handled automatically in gQuant when `self.delayed_process=True` within a `Node` implementation (setup in `meta_setup`). When using nodes with ports with `self.delayed_process=True` setting, it is required that all input and output ports be of type `cudf.DataFrame`. Otherwise don't set `self.delayed_process` and one can write custom logic to handle distributed dataframes (refer to `VerifyNode` above for an example where `dask_cudf` dataframes are handled directly within the process method)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "261bd245de244b109984bc369777f922", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "npartitions = len(client.scheduler_info()['workers'])\n", + "\n", + "\n", + "distribute_tspec = {\n", + " TaskSpecSchema.task_id: 'distributed_points',\n", + " TaskSpecSchema.node_type: DistributedNode,\n", + " TaskSpecSchema.conf: {'npartitions': npartitions},\n", + " TaskSpecSchema.module: module_name,\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 'points_task.points_df_out'\n", + " }\n", + "}\n", + "\n", + "dask_cudf_distance_tspec = {\n", + " TaskSpecSchema.task_id: 'distance_by_cudf',\n", + " TaskSpecSchema.node_type: DistanceNode,\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.module: module_name,\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 'distributed_points.points_ddf_out'\n", + " }\n", + "}\n", + "\n", + "dask_numba_distance_tspec = {\n", + " TaskSpecSchema.task_id: 'distance_by_numba',\n", + " TaskSpecSchema.node_type: NumbaDistanceNode,\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.module: module_name,\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 'distributed_points.points_ddf_out'\n", + " }\n", + "}\n", + "\n", + "dask_cupy_distance_tspec = {\n", + " TaskSpecSchema.task_id: 'distance_by_cupy',\n", + " TaskSpecSchema.node_type: CupyDistanceNode,\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.module: module_name,\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 'distributed_points.points_ddf_out'\n", + " }\n", + "}\n", + "\n", + "out_spec = {\n", + " TaskSpecSchema.task_id: '',\n", + " TaskSpecSchema.node_type: 
\"Output_Collector\",\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.inputs: {\n", + " 'in0': 'distributed_points.points_ddf_out',\n", + " 'in1': 'distance_by_cudf.distance_df',\n", + " 'in2': 'distance_by_numba.distance_df',\n", + " 'in3': 'distance_by_cupy.distance_df'\n", + " }\n", + "}\n", + "\n", + "\n", + "task_list = [\n", + " points_tspec,\n", + " distribute_tspec,\n", + " dask_cudf_distance_tspec,\n", + " dask_numba_distance_tspec,\n", + " dask_cupy_distance_tspec,\n", + " out_spec\n", + "]\n", + "\n", + "task_graph = TaskGraph(task_list)\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the distributed computation programmatically:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "out_list = [\n", + " 'distributed_points.points_ddf_out',\n", + " 'distance_by_cudf.distance_df',\n", + " 'distance_by_numba.distance_df',\n", + " 'distance_by_cupy.distance_df'\n", + "]\n", + "\n", + "(points_ddf, ddf_w_cudf, ddf_w_numba, ddf_w_cupy) = task_graph.run(out_list)\n", + "df_w_cudf = ddf_w_cudf.compute()\n", + "df_w_numba = ddf_w_numba.compute()\n", + "df_w_cupy = ddf_w_cupy.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "261bd245de244b109984bc369777f922", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(cache={'nodes': [{'width': 110, 'id': 'points_task', 'type': 'PointNode', 'schema': {'title': 'Po…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "verify_cudf_numba_tspec = verify_tspec.copy()\n", + "verify_cudf_cupy_tspec = verify_tspec2.copy()\n", + "\n", + "task_graph.extend(\n", + " [verify_cudf_numba_tspec,\n", + " verify_cudf_cupy_tspec],\n", + " replace=True)\n", + "task_graph.draw()\n" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "Verify the results:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HEAD points_ddf:\n", + " x y\n", + "0 0.793239 0.794183\n", + "1 0.405215 0.753843\n", + "2 0.039310 0.076363\n", + "3 0.456579 0.369289\n", + "4 0.165980 0.383398\n", + "\n", + "HEAD df_w_cudf:\n", + " x y distance_cudf\n", + "0 0.793239 0.794183 1.122477\n", + "1 0.405215 0.753843 0.855849\n", + "2 0.039310 0.076363 0.085887\n", + "3 0.456579 0.369289 0.587230\n", + "4 0.165980 0.383398 0.417784\n", + "\n", + "HEAD df_w_numba:\n", + " x y distance_numba\n", + "0 0.793239 0.794183 1.122477\n", + "1 0.405215 0.753843 0.855849\n", + "2 0.039310 0.076363 0.085887\n", + "3 0.456579 0.369289 0.587230\n", + "4 0.165980 0.383398 0.417784\n", + "\n", + "HEAD df_w_cupy:\n", + " x y distance_cupy\n", + "0 0.793239 0.794183 1.122477\n", + "1 0.405215 0.753843 0.855849\n", + "2 0.039310 0.076363 0.085887\n", + "3 0.456579 0.369289 0.587230\n", + "4 0.165980 0.383398 0.417784\n", + "\n", + "Max Difference cudf to numba: 2.220446049250313e-16\n", + "Max Difference cudf to cupy: 2.220446049250313e-16\n" + ] + } + ], + "source": [ + "# Use results above and avoid re-running dask\n", + "replace_spec = {\n", + " 'distance_by_cudf': {\n", + " TaskSpecSchema.load: {\n", + " 'distance_df': ddf_w_cudf\n", + " }\n", + " },\n", + " 'distance_by_numba': {\n", + " TaskSpecSchema.load: {\n", + " 'distance_df': ddf_w_numba\n", + " }\n", + " },\n", + " 'distance_by_cupy': {\n", + " TaskSpecSchema.load: {\n", + " 'distance_df': ddf_w_cupy\n", + " }\n", + " }\n", + "}\n", + "\n", + "(max_cudf_to_numba_diff, max_cudf_to_cupy_diff) = task_graph.run(\n", + " ['verify_cudf_to_numba.max_diff',\n", + " 'verify_cudf_to_cupy.max_diff'],\n", + " replace=replace_spec\n", + ")\n", + "\n", + "print('HEAD points_ddf:\\n{}\\n'.format(points_ddf.head()))\n", + "print('HEAD 
df_w_cudf:\\n{}\\n'.format(ddf_w_cudf.head()))\n", + "print('HEAD df_w_numba:\\n{}\\n'.format(ddf_w_numba.head()))\n", + "print('HEAD df_w_cupy:\\n{}\\n'.format(ddf_w_cupy.head()))\n", + "print('Max Difference cudf to numba: {}'.format(max_cudf_to_numba_diff))\n", + "print('Max Difference cudf to cupy: {}'.format(max_cudf_to_cupy_diff))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One limitation to be aware of when using customized kernels within Nodes in the Dask environment, is that each GPU kernel works on one partition of the dataframe. Therefore if the computation depends on other partitions of the dataframe the approach above does not work." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Saving Custom Nodes and Kernels\n", + "\n", + "The gQuant examples already implement a number of `Nodes`. These can be found in `gquant.plugin_nodes` submodules.\n", + "\n", + "The customized kernels and nodes can be saved to your own python modules for future re-use instead of having to re-define them at runtime. The nodes we defined above were written to a python module \"custom_port_nodes.py\" (the `DistanceNode` was simplified to omit the absolute distance calculation). We will re-run our workflow importing the Nodes from the custom module we wrote out.\n", + "\n", + "When defining the tasks we specify `filepath` for the path to the python module that has the Node definition. Notice that the `node_type` is specified as a string instead of a class. The string is the class name of the node that will be imported for running a task." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "340070cd3cf9451fbf61f19052b3d66c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from gquant.dataframe_flow.util import get_file_path\n", + "npartitions = len(client.scheduler_info()['workers'])\n", + "points_tspec = {\n", + " TaskSpecSchema.task_id: 'points_task',\n", + " TaskSpecSchema.node_type: 'PointNode',\n", + " TaskSpecSchema.filepath: get_file_path('notebooks/custom_port_nodes.py'),\n", + " TaskSpecSchema.conf: {'npts': 1000},\n", + " TaskSpecSchema.inputs: {},\n", + "}\n", + "\n", + "distribute_tspec = {\n", + " TaskSpecSchema.task_id: 'distributed_points',\n", + " TaskSpecSchema.node_type: 'DistributedNode',\n", + " TaskSpecSchema.filepath: get_file_path('notebooks/custom_port_nodes.py'),\n", + " TaskSpecSchema.conf: {'npartitions': npartitions},\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 'points_task.points_df_out'\n", + " }\n", + "}\n", + "\n", + "dask_cudf_distance_tspec = {\n", + " TaskSpecSchema.task_id: 'distance_by_cudf',\n", + " TaskSpecSchema.node_type: 'DistanceNode',\n", + " TaskSpecSchema.filepath: get_file_path('notebooks/custom_port_nodes.py'),\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 'distributed_points.points_ddf_out'\n", + " }\n", + "}\n", + "\n", + "dask_numba_distance_tspec = {\n", + " TaskSpecSchema.task_id: 'distance_by_numba',\n", + " TaskSpecSchema.node_type: 'NumbaDistanceNode',\n", + " TaskSpecSchema.filepath: get_file_path('notebooks/custom_port_nodes.py'),\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 
'distributed_points.points_ddf_out'\n", + " }\n", + "}\n", + "\n", + "dask_cupy_distance_tspec = {\n", + " TaskSpecSchema.task_id: 'distance_by_cupy',\n", + " TaskSpecSchema.node_type: 'CupyDistanceNode',\n", + " TaskSpecSchema.filepath: get_file_path('notebooks/custom_port_nodes.py'),\n", + " TaskSpecSchema.conf: {},\n", + " TaskSpecSchema.inputs: {\n", + " 'points_df_in': 'distributed_points.points_ddf_out'\n", + " }\n", + "}\n", + "\n", + "verify_cudf_to_numba_tspec = {\n", + " TaskSpecSchema.task_id: 'verify_cudf_to_numba',\n", + " TaskSpecSchema.node_type: 'VerifyNode',\n", + " TaskSpecSchema.filepath: get_file_path('notebooks/custom_port_nodes.py'),\n", + " TaskSpecSchema.conf: {\n", + " 'df1_col': 'distance_cudf',\n", + " 'df2_col': 'distance_numba'\n", + " }, \n", + " TaskSpecSchema.inputs: {\n", + " 'df1': 'distance_by_cudf.distance_df',\n", + " 'df2': 'distance_by_numba.distance_df'\n", + " }\n", + "}\n", + "\n", + "verify_cudf_to_cupy_tspec = {\n", + " TaskSpecSchema.task_id: 'verify_cudf_to_cupy',\n", + " TaskSpecSchema.node_type: 'VerifyNode',\n", + " TaskSpecSchema.filepath: get_file_path('notebooks/custom_port_nodes.py'),\n", + " TaskSpecSchema.conf: {\n", + " 'df1_col': 'distance_cudf',\n", + " 'df2_col': 'distance_cupy'\n", + " }, \n", + " TaskSpecSchema.inputs: {\n", + " 'df1': 'distance_by_cudf.distance_df',\n", + " 'df2': 'distance_by_cupy.distance_df'\n", + " }\n", + "}\n", + "\n", + "task_list = [\n", + " points_tspec,\n", + " distribute_tspec,\n", + " dask_cudf_distance_tspec,\n", + " dask_numba_distance_tspec,\n", + " dask_cupy_distance_tspec,\n", + " verify_cudf_to_numba_tspec,\n", + " verify_cudf_to_cupy_tspec\n", + "]\n", + "\n", + "task_graph = TaskGraph(task_list)\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HEAD df_w_cudf:\n", + " x y distance_cudf\n", + "0 0.729991 0.418487 0.841438\n", + "1 
0.887041 0.875456 1.246301\n", + "2 0.763563 0.220629 0.794799\n", + "3 0.454909 0.976449 1.077217\n", + "4 0.108439 0.707618 0.715878\n", + "\n", + "HEAD df_w_numba:\n", + " x y distance_numba\n", + "0 0.729991 0.418487 0.841438\n", + "1 0.887041 0.875456 1.246301\n", + "2 0.763563 0.220629 0.794799\n", + "3 0.454909 0.976449 1.077217\n", + "4 0.108439 0.707618 0.715878\n", + "\n", + "HEAD df_w_cupy:\n", + " x y distance_cupy\n", + "0 0.729991 0.418487 0.841438\n", + "1 0.887041 0.875456 1.246301\n", + "2 0.763563 0.220629 0.794799\n", + "3 0.454909 0.976449 1.077217\n", + "4 0.108439 0.707618 0.715878\n", + "\n", + "Max Difference cudf to numba: 2.220446049250313e-16\n", + "Max Difference cudf to cupy: 2.220446049250313e-16\n" + ] + } + ], + "source": [ + "out_list = [\n", + " 'distance_by_cudf.distance_df',\n", + " 'distance_by_numba.distance_df',\n", + " 'distance_by_cupy.distance_df',\n", + " 'verify_cudf_to_numba.max_diff',\n", + " 'verify_cudf_to_cupy.max_diff'\n", + "]\n", + "\n", + "(ddf_w_cudf, ddf_w_numba, ddf_w_cupy,\n", + " mdiff_cudf_to_numba, mdiff_cudf_to_cupy) = task_graph.run(out_list)\n", + "\n", + "print('HEAD df_w_cudf:\\n{}\\n'.format(ddf_w_cudf.head()))\n", + "print('HEAD df_w_numba:\\n{}\\n'.format(ddf_w_numba.head()))\n", + "print('HEAD df_w_cupy:\\n{}\\n'.format(ddf_w_cupy.head()))\n", + "print('Max Difference cudf to numba: {}'.format(mdiff_cudf_to_numba))\n", + "print('Max Difference cudf to cupy: {}'.format(mdiff_cudf_to_cupy))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The final illustration is how to save and load a task graph to a file for re-use." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "task_graph.save_taskgraph('custom_wflow.gq.yaml')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The gQuant TaskGraph file is created and saved. 
You can double click on it to open it up in the JupyterLab to edit it.\n", + "\n", + "Or you can display it with the gQuant widget and play with it interactively." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ab7d2484abe64998bcb17e69802f9a67", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph = TaskGraph.load_taskgraph('custom_wflow.gq.yaml')\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Of course you can run it by calling the `run` method." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "242b0b34d6b245f0a1b807d290bdc436", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(layout=Layout(border='1px solid black')…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# update npartitions in case the scheduler is running with\n", + "# different number of workers than what was saved.\n", + "npartitions = len(client.scheduler_info()['workers'])\n", + "replace_spec = {\n", + " 'distributed_points': {\n", + " TaskSpecSchema.conf: {'npartitions': npartitions},\n", + " }\n", + "}\n", + "\n", + "out_list = [\n", + " 'distance_by_cudf.distance_df',\n", + " 'distance_by_numba.distance_df',\n", + " 'distance_by_cupy.distance_df',\n", + " 'verify_cudf_to_numba.max_diff',\n", + " 'verify_cudf_to_cupy.max_diff'\n", + "]\n", + "\n", + "task_graph.run(out_list, 
replace=replace_spec, formated=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HEAD df_w_cudf:\n", + " x y distance_cudf\n", + "0 0.729991 0.418487 0.841438\n", + "1 0.887041 0.875456 1.246301\n", + "2 0.763563 0.220629 0.794799\n", + "3 0.454909 0.976449 1.077217\n", + "4 0.108439 0.707618 0.715878\n", + "\n", + "HEAD df_w_numba:\n", + " x y distance_numba\n", + "0 0.729991 0.418487 0.841438\n", + "1 0.887041 0.875456 1.246301\n", + "2 0.763563 0.220629 0.794799\n", + "3 0.454909 0.976449 1.077217\n", + "4 0.108439 0.707618 0.715878\n", + "\n", + "HEAD df_w_cupy:\n", + " x y distance_cupy\n", + "0 0.729991 0.418487 0.841438\n", + "1 0.887041 0.875456 1.246301\n", + "2 0.763563 0.220629 0.794799\n", + "3 0.454909 0.976449 1.077217\n", + "4 0.108439 0.707618 0.715878\n", + "\n" + ] + } + ], + "source": [ + "\n", + "print('HEAD df_w_cudf:\\n{}\\n'.format(ddf_w_cudf.head()))\n", + "print('HEAD df_w_numba:\\n{}\\n'.format(ddf_w_numba.head()))\n", + "print('HEAD df_w_cupy:\\n{}\\n'.format(ddf_w_cupy.head()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conclusion\n", + "\n", + "Using customized GPU kernels allows data scientists to implement and incorporate advanced algorithms. We demonstrated implementations using Numba and CuPy.\n", + "\n", + "The Numba approach enables data scientists to write GPU kernels directly in the Python language. Numba is easy to use for implementing and accelerating computations. However there is some overhead incurred for compiling the kernels whenever the Numba GPU kernels are used for the first time in a Python process. Currently Numba library only supports primitive data types. Some advanced CUDA programming features, such as function pointers and function recursions are not supported. 
\n", + "\n", + "The Cupy method is very flexible, because data scientists are writing C/C++ GPU kernels with CUDA directly. All the CUDA programming features are supported. CuPy compiles the kernel and caches the device code to the filesystem. The launch overhead is low. Also, the GPU kernel is built statically resulting in runtime efficiency. However it might be harder for data scientists to use, because C/C++ programming is more complicated. \n", + "\n", + "Below is a brief summary comparison table:\n", + "\n", + "| Methods | Development Difficulty | Flexibility | Efficiency | Latency |\n", + "|---|---|---|---|---|\n", + "| Numba method | medium | medium | low | high |\n", + "| CuPy method | hard | high | high | low |\n", + "\n", + "We recommend that the data scientists select the approach appropriate for their task taking into consideration the efficiency, latency, difficulty and flexibility of their workflow. \n", + "\n", + "In this blog, we showed how to wrap the customized GPU kernels in gQuant nodes. Also, by taking advantage of having the gQuant handle the low-level Dask interfaces for the developer, we demonstrated how to use the gQuant workflow with Dask distributed computations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# Clean up\n", + "\n", + "# Shutdown the Dask cluster\n", + "client.close()\n", + "cluster.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'ok', 'restart': True}" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/gQuant/plugins/rapids_plugin/notebooks/06_xgboost_trade.ipynb b/gQuant/plugins/rapids_plugin/notebooks/06_xgboost_trade.ipynb new file mode 100644 index 00000000..0cf9083b --- /dev/null +++ b/gQuant/plugins/rapids_plugin/notebooks/06_xgboost_trade.ipynb @@ -0,0 +1,1193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trade with XGBoost algorithm\n", + "## Background\n", + "In the [portfolio trade example](https://github.com/rapidsai/gQuant/blob/master/notebooks/04_portfolio_trade.ipynb), we use gQuant to backtest a simple mean reversion trading strategy on 5000 stocks.\n", + "It shows decent performance by tweaking the moving average window size. Searching for alpha signal is the ultimate goal for the trading companies. A lot of different methods are used to do so. 
Machine learning approach\n", + "is one of those. It has the benefits of extracting important information in the data automatically given enough computation. There are a few popular machine learning algorithms, including SVM, Random forest tree etc. Among those, XGBoost is known to be a very powerful machine \n", + "learning method that is winning a lot of [ML competitions](https://medium.com/syncedreview/tree-boosting-with-xgboost-why-does-xgboost-win-every-machine-learning-competition-ca8034c0b283). Luckily, the [RAPIDS library](https://github.com/rapidsai) accelerates the XGBoost ML algorithm in the GPU so that we can easily take advantage of it in gQuant. \n", + "\n", + "In this notebook, we are going to demo how to use gQuant to backtest an XGBoost based trading strategy.\n", + "\n", + "\n", + "## Environment Preparation\n", + "\n", + "### Download the example Datasets\n", + "Before getting started, let's download the example datasets if not present." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset is already present. No need to re-download it.\n" + ] + } + ], + "source": [ + "! ((test ! -f './data/stock_price_hist.csv.gz' || test ! -f './data/security_master.csv.gz') && \\\n", + " cd .. && bash download_data.sh) || echo \"Dataset is already present. No need to re-download it.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare for running in Dask environment\n", + "\n", + "Let's start the Dask local cluster environment for distributed computation.\n", + "\n", + "Dask provides a web-based dashboard to help to track progress, identify performance issues, and debug failures. 
To learn more about Dask dashboard, just follow this [link](https://distributed.dask.org/en/latest/web.html).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 2
  • \n", + "
  • Cores: 2
  • \n", + "
  • Memory: 100.00 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Start the Dask local cluster environment for distrubuted computation\n", + "from dask_cuda import LocalCUDACluster\n", + "from dask.distributed import Client\n", + "\n", + "cluster = LocalCUDACluster()\n", + "client = Client(cluster)\n", + "client\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Though our stock dataset is small enough to fit in a single 16G GPU, to show how to do distributed computation, we will split the dataframe into small pieces to be loaded by different workers in the cluster.\n", + "\n", + "Notice this step is need only if the dataset is not split in multiple files yet.\n", + "\n", + "First use this simple taskgraph to load data then sort it by the asset id and datatime:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "970f1e4cbc3043bb990e0dcefaa6a01f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import sys; sys.path.insert(0, '..')\n", + "import warnings\n", + "from gquant.dataframe_flow import TaskGraph\n", + "import ipywidgets as widgets\n", + "import os\n", + "warnings.simplefilter(\"ignore\")\n", + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/sort_stocks.gq.yaml')\n", + "input_cached, = task_graph.run()\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "convert the sorted stock data into partitions and save it into csv files. 
Note, the data is slited in a way that the same asset belongs to the same partition" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/home/yi/Projects/demo_gquant_install/notebooks/many-small/0.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/1.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/2.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/3.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/4.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/5.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/6.csv',\n", + " '/home/yi/Projects/demo_gquant_install/notebooks/many-small/7.csv']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dask.dataframe as dd\n", + "import os\n", + "num_partitions = 8\n", + "\n", + "os.makedirs('many-small', exist_ok=True)\n", + "dd.from_pandas(input_cached.set_index('asset'), npartitions=num_partitions).reset_index().to_csv('many-small/*.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note, this notebook requires `cudf` of version >=0.8.0. It can be checked by following command" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.17.0\n" + ] + } + ], + "source": [ + "import cudf\n", + "print(cudf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The toy example\n", + "To mimic the end-to-end quantitative analyst task, we are going to backtest a XGBoost trading strategy. \n", + "\n", + "We will reuse the preprocessing steps as shown in the portfolio trade notebook example. \n", + "\n", + "The workflow includes following steps:\n", + "\n", + "1. 
Preprocess the datasets.\n", + "\n", + "2. Compute the features based on different technical indicators \n", + "\n", + "3. Split the data into training and testing and build a XGBoost model based on the training data. From the XGBoost model, compute the trading signals for all the data points.\n", + "\n", + "4. Run backtesting and compute the returns from this strategy for each of the days and stock symbols \n", + "\n", + "5. Run a simple portfolio optimization by averaging the stocks together for each of the trading days.\n", + "\n", + "6. Compute the sharpe ratio and cumulative return results for both training and testing datasets\n", + "\n", + "The whole workflow can be organized into a TaskGraph, which is fully described in a `.gq.yaml` file.\n", + "\n", + "Each node has a unique id, a node type, configuration parameters and input node ids. gQuant takes this yaml file, wires it into a graph to visualize it.\n", + "\n", + "First let's load the preprocess TaskGraph:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6eb535c0bcee4a3d8b92cdd41c991967", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/preprocess.gq.yaml')\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the whole TaskGraph by `load_taskgraph` command. Note the preprocess TaskGraph is included inside the `preprocess` `Composite Node`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "49e4668f4fdf4a5cb7656b4fe42b7fd0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/xgboost_trade.gq.yaml')\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The features used for XGBoost algorithm are prepared in the `technical_indicator` Task node, where `cuIndicator` module is used to compute the technical indicators in the GPU for all the stock symbols. `xgboost` is the Task node that is used to compute the trading signals from the stock technical indicators. Each gQuant Task node is implemented by overriding the `meta_setup`, `process`, `ports_setup`, `conf_schema` methods of the Node base class. Please refer to [customize nodes notebook](https://github.com/rapidsai/gQuant/blob/master/notebooks/05_customize_nodes.ipynb) for details. Following is the source code for \"XGBoostStrategyNode\":" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "class XGBoostStrategyNode(_PortTypesMixin, Node):\n", + " \"\"\"\n", + " This is the Node used to compute trading signal from XGBoost Strategy.\n", + " It requires the following conf fields:\n", + " \"train_date\": a date string of \"Y-m-d\" format. All the data points\n", + " before this date is considered as training, otherwise as testing. 
If\n", + " not provided, all the data points are considered as training.\n", + " \"xgboost_parameters\": a dictionary of any legal parameters for XGBoost\n", + " models. It overwrites the default parameters used in the process method\n", + " \"no_feature\": specifying a list of columns in the input dataframe that\n", + " should NOT be considered as training features.\n", + " \"target\": the column that is considered as \"target\" in machine learning\n", + " algorithm\n", + " It requires the \"datetime\" column for spliting the data points and adds a\n", + " new column \"signal\" to be used for backtesting.\n", + " The detailed computation steps are listed in the process method's docstring\n", + " \"\"\"\n", + "\n", + " def init(self):\n", + " _PortTypesMixin.init(self)\n", + " self.INPUT_PORT_NAME = 'stock_in'\n", + " self.OUTPUT_PORT_NAME = 'stock_out'\n", + "\n", + " def meta_setup(self):\n", + " # if 'no_feature' in self.conf:\n", + " # retention = self.conf['no_feature']\n", + " # else:\n", + " cols_required = {'datetime': 'date',\n", + " \"asset\": \"int64\"}\n", + " # self.delayed_process = True\n", + " required = {\n", + " self.INPUT_PORT_NAME: cols_required\n", + " }\n", + " retention = {}\n", + " retention['signal'] = 'float64'\n", + " # _PortTypesMixin.retention_meta_setup(self, retention)\n", + "\n", + " input_meta = self.get_input_meta()\n", + " if self.INPUT_PORT_NAME not in input_meta:\n", + " col_from_inport = required[self.INPUT_PORT_NAME]\n", + " else:\n", + " col_from_inport = input_meta[self.INPUT_PORT_NAME]\n", + " # delete the columns from the inputs\n", + " if 'no_feature' in self.conf:\n", + " for key in self.conf['no_feature']:\n", + " if key in col_from_inport:\n", + " retention[key] = col_from_inport[key]\n", + " metadata = MetaData(inports=required,\n", + " outports={self.OUTPUT_PORT_NAME: retention})\n", + " return metadata\n", + "\n", + " def ports_setup(self):\n", + " types = [cudf.DataFrame,\n", + " dask_cudf.DataFrame]\n", + " return 
_PortTypesMixin.ports_setup_from_types(self, types)\n", + "\n", + " def conf_schema(self):\n", + " json = {\n", + " \"title\": \"XGBoost Node configure\",\n", + " \"type\": \"object\",\n", + " \"description\": \"\"\"Split the data into training and testing based on\n", + " 'train_data', train a XGBoost model based on the training data,\n", + " make predictions for all the data points, compute the trading.\n", + " \"\"\",\n", + " \"properties\": {\n", + " \"num_of_rounds\": {\n", + " \"type\": \"number\",\n", + " \"description\": \"\"\"The number of rounds for boosting\"\"\",\n", + " \"default\": 100\n", + " },\n", + " \"train_date\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"\"\"the date to splite train and validation\n", + " dataset\"\"\"\n", + " },\n", + " \"target\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"the column used as dependent variable\"\n", + " },\n", + " \"no_feature\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\n", + " \"type\": \"string\",\n", + " },\n", + " \"description\": \"\"\"columns in the input dataframe that\n", + " should NOT be considered as training features.\"\"\"\n", + " },\n", + " \"xgboost_parameters\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"xgoobst parameters\",\n", + " \"properties\": {\n", + " 'max_depth': {\n", + " \"type\": \"number\",\n", + " \"description\": \"Maximum depth of a tree.\",\n", + " \"default\": 8\n", + " },\n", + " \"max_leaves\": {\n", + " \"type\": \"number\",\n", + " \"description\": \"maximum number of tree leaves\",\n", + " \"default\": 2**8\n", + " },\n", + " \"gamma\": {\n", + " \"type\": \"number\",\n", + " \"description\": \"\"\"Minimum loss reduction required\n", + " to make a further partition on a leaf node of the\n", + " tree.\"\"\",\n", + " \"default\": 0\n", + " },\n", + " \"objective\": {\n", + " \"type\": \"string\",\n", + " \"enum\": [\"reg:squarederror\", \"reg:squaredlogerror\",\n", + " \"reg:logistic\", 
\"reg:pseudohubererror\"],\n", + " \"description\": \"\"\"Specify the learning task and\n", + " the corresponding learning objective.\"\"\",\n", + " \"default\": \"reg:squarederror\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"required\": [\"target\", \"num_of_rounds\"],\n", + " }\n", + " ui = {\n", + " \"train_date\": {\n", + " \"ui:widget\": \"alt-date\",\n", + " \"ui:options\": {\n", + " \"yearsRange\": [1985, 2025],\n", + " \"hideNowButton\": True,\n", + " \"hideClearButton\": True,\n", + " },\n", + " },\n", + " }\n", + " input_meta = self.get_input_meta()\n", + " if self.INPUT_PORT_NAME in input_meta:\n", + " col_from_inport = input_meta[self.INPUT_PORT_NAME]\n", + " enums = [col for col in col_from_inport.keys()]\n", + " json['properties']['no_feature']['items']['enum'] = enums\n", + " json['properties']['target']['enum'] = enums\n", + " return ConfSchema(json=json, ui=ui)\n", + " else:\n", + " return ConfSchema(json=json, ui=ui)\n", + "\n", + " def process(self, inputs):\n", + " \"\"\"\n", + " The process is doing following things:\n", + " 1. split the data into training and testing based on provided\n", + " conf['train_date']. If it is not provided, all the data is\n", + " treated as training data.\n", + " 2. train a XGBoost model based on the training data\n", + " 3. Make predictions for all the data points including training and\n", + " testing.\n", + " 4. 
From the prediction of returns, compute the trading signals that\n", + " can be used in the backtesting.\n", + " Arguments\n", + " -------\n", + " inputs: list\n", + " list of input dataframes.\n", + " Returns\n", + " -------\n", + " dataframe\n", + " \"\"\"\n", + " dxgb_params = {\n", + " 'max_depth': 8,\n", + " 'max_leaves': 2 ** 8,\n", + " 'tree_method': 'gpu_hist',\n", + " 'objective': 'reg:squarederror',\n", + " 'grow_policy': 'lossguide',\n", + " }\n", + " # num_of_rounds = 100\n", + " if 'xgboost_parameters' in self.conf:\n", + " dxgb_params.update(self.conf['xgboost_parameters'])\n", + " input_df = inputs[self.INPUT_PORT_NAME]\n", + " model_df = input_df\n", + " train_cols = set(model_df.columns) - set(\n", + " self.conf['no_feature'])\n", + " train_cols = list(train_cols - set([self.conf['target']]))\n", + "\n", + " if isinstance(input_df, dask_cudf.DataFrame):\n", + " # get the client\n", + " client = dask.distributed.client.default_client()\n", + " if 'train_date' in self.conf:\n", + " train_date = datetime.datetime.strptime(self.conf['train_date'], # noqa: F841, E501\n", + " '%Y-%m-%d')\n", + " model_df = model_df[model_df.datetime < train_date]\n", + " train = model_df[train_cols]\n", + " target = model_df[self.conf['target']]\n", + " dmatrix = xgb.dask.DaskDMatrix(client, train, label=target)\n", + " bst = xgb.dask.train(client, dxgb_params, dmatrix,\n", + " num_boost_round=self.conf[\"num_of_rounds\"])\n", + "\n", + " dtrain = xgb.dask.DaskDMatrix(client, input_df[train_cols])\n", + " prediction = xgb.dask.predict(client, bst, dtrain).persist()\n", + " pred_df = dask_cudf.from_dask_dataframe(\n", + " prediction.to_dask_dataframe())\n", + " pred_df.index = input_df.index\n", + " input_df['signal'] = pred_df\n", + " elif isinstance(input_df, cudf.DataFrame):\n", + " if 'train_date' in self.conf:\n", + " train_date = datetime.datetime.strptime(self.conf['train_date'], # noqa: F841, E501\n", + " '%Y-%m-%d')\n", + " model_df = 
model_df.query('datetime<@train_date')\n", + " train = model_df[train_cols]\n", + " target = model_df[self.conf['target']]\n", + " dmatrix = xgb.DMatrix(train, label=target)\n", + " bst = xgb.train(dxgb_params, dmatrix,\n", + " num_boost_round=self.conf[\"num_of_rounds\"])\n", + " infer_dmatrix = xgb.DMatrix(input_df[train_cols])\n", + " prediction = cudf.Series(bst.predict(infer_dmatrix),\n", + " nan_as_null=False,\n", + " index=input_df.index\n", + " ).astype('float64')\n", + " input_df['signal'] = prediction\n", + "\n", + " input_df['tmp'] = (input_df['asset'] -\n", + " input_df['asset'].shift(1)).fillna(1)\n", + " input_df['tmp'] = (input_df['tmp'] != 0).astype('int32')\n", + " tmp = input_df['tmp']\n", + " input_df['tmp'] = tmp.where(tmp != 1, None)\n", + " input_df = input_df.dropna(subset=['tmp'])\n", + " input_df = input_df.drop('tmp', axis=1)\n", + "\n", + " # convert the signal to trading action\n", + " # 1 is buy and -1 is sell\n", + " # It predicts the tomorrow's return (shift -1)\n", + " # We shift 1 for trading actions so that it acts on the second day\n", + " input_df['signal'] = ((\n", + " input_df['signal'] >= 0).astype('float') * 2 - 1).shift(1)\n", + "\n", + " # remove the bad datapints\n", + " input_df = input_df.dropna()\n", + " remaining = list(self.conf['no_feature']) + ['signal']\n", + " return {self.OUTPUT_PORT_NAME: input_df[remaining]}\n", + "\n" + ] + } + ], + "source": [ + "import inspect\n", + "from gquant_rapids_plugin.strategy import XGBoostStrategyNode\n", + "\n", + "print(inspect.getsource(XGBoostStrategyNode))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### XGBoost Trading Strategy Performance\n", + "Similar to tensorflow, gQuant graph is evaluated by specifying the output nodes and input nodes replacement. We first look at the column result from data preparation node." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "output meta of node node_technical_indicator:\n", + "MetaData(inports={'stock_in': {'indicator': 'int32', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'float64', 'returns': 'float64'}}, outports={'stock_out': {'CH_OS_10_20': 'float64', 'BO_BA_b1_10': 'float64', 'BO_BA_b2_10': 'float64', 'SHIFT_-1': 'float64', 'indicator': 'int32', 'returns': 'float64', 'datetime': 'date', 'asset': 'int64', 'volume': 'float64', 'close': 'float64', 'open': 'float64', 'high': 'float64', 'low': 'float64'}})\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "print('output meta of node node_technical_indicator:')\n", + "task_graph.build()\n", + "pprint(task_graph['technical_indicator'].meta_setup())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It adds the columns \"BO_BA_b1_10\", \"BO_BA_b2_10\", \"CH_OS_10_20\" as features and \"SHIFT_-1\" as the target, which is the return of next day. A good feature should be the one that provides highest information about the next day return. In the case we have no prior information about it,\n", + "we can compute as many features as we like and leave it to the XGBoost to find the right combination of those features. \n", + "\n", + "Evaluate the leaf nodes of the backtesting graph by gQuant `run` method." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:stock_data process time:4.270s\n", + "id:preprocess process time:0.761s\n", + "id:sort_after process time:0.056s\n", + "id:technical_indicator process time:2.571s\n", + "id:xgboost process time:2.495s\n", + "id:backtest process time:0.001s\n", + "id:train_df process time:0.125s\n", + "id:portfolio_opt_train process time:0.012s\n", + "id:sharpe_ratio_trn process time:0.001s\n", + "id:cumulative_return_trn process time:0.016s\n", + "id:validation_df process time:0.006s\n", + "id:portfolio_opt_validation process time:0.009s\n", + "id:sharpe_ratio_val process time:0.001s\n", + "id:cumulative_return_val process time:0.015s\n", + "CPU times: user 10 s, sys: 1.43 s, total: 11.4 s\n", + "Wall time: 11.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "output_list = ['sharpe_ratio_trn.sharpe_out',\n", + " 'cumulative_return_trn.cum_return',\n", + " 'sharpe_ratio_val.sharpe_out',\n", + " 'cumulative_return_val.cum_return',\n", + " 'sort_after.out']\n", + "o_gpu = task_graph.run(output_list, profile=True)\n", + "cached_sort = o_gpu['sort_after.out']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define a function to organized the plot results. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "99ac1d90b02b407b9a9df08658f437b8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale(), side=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# define the function to format the plots\n", + "def plot_figures(o):\n", + " # format the figures\n", + " figure_width = '1200px'\n", + " figure_height = '400px'\n", + " sharpe_number = o['sharpe_ratio_trn.sharpe_out']\n", + " cum_return_train = o['cumulative_return_trn.cum_return']\n", + " cum_return_train.layout.height = figure_height\n", + " cum_return_train.layout.width = figure_width\n", + " cum_return_train.title = 'Training P & L %.3f' % (sharpe_number)\n", + " sharpe_number = o['sharpe_ratio_val.sharpe_out']\n", + " cum_return_test = o['cumulative_return_val.cum_return']\n", + " cum_return_test.layout.height = figure_height\n", + " cum_return_test.layout.width = figure_width\n", + " cum_return_test.title = 'Testing P & L %.3f' % (sharpe_number)\n", + "\n", + " return widgets.VBox([cum_return_train, cum_return_test])\n", + "plot_figures(o_gpu)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The XGBoost model does a good job to predict the next day of return. It overfits in the training dataset and gets Sharpe Ratio of 5 as shown in the figure above. In the testing period, it gets Sharpe Ratio of 1.\n", + "\n", + "The example model runs in a single GPU because of the small dataset. But in real world, the dataset usually is so large that it doesn't fit in a single GPU. Luckily, the XGBoost library natively supports multiple nodes and multiple GPU training by using Dask. 
You can scale out the computation using Dask dataframe.\n", + "\n", + "To show how easy it is to do distributed computation, let's run the above example in the Dask environment for educational purposes. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To run the whole workflow, simply change the `preprocess` node to get Dask Dataframe and run the graph again. Here we look at the testing results:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:stock_data process time:0.010s\n", + "id:preprocess process time:7.655s\n", + "id:xgboost process time:3.325s\n", + "id:backtest process time:0.010s\n", + "id:train_df process time:0.143s\n", + "id:portfolio_opt_train process time:0.033s\n", + "id:sharpe_ratio_trn process time:1.407s\n", + "id:cumulative_return_trn process time:1.193s\n", + "id:validation_df process time:0.007s\n", + "id:portfolio_opt_validation process time:0.032s\n", + "id:sharpe_ratio_val process time:1.172s\n", + "id:cumulative_return_val process time:1.268s\n", + "CPU times: user 8.24 s, sys: 801 ms, total: 9.04 s\n", + "Wall time: 29.7 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "replace_spec = {'preprocess': {\"inputs\": {\"sort_node@in\": \"stock_data.dask_cudf_out\"}}}\n", + "o_gpu = task_graph.run(replace=replace_spec, profile=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "98b03eaa90714b2381a5563914964296", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale(), side=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_figures(o_gpu)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"\n", + "Clearly, 3 feautres is way too little here. gQuant implmented 36 technical indicators. We can change the configuration of node_technical_indicator node to include more features." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "chaikin_para0 = 10\n", + "chaikin_para1 = 20\n", + "bollinger_para = 10\n", + "macd_para0 = 2\n", + "macd_para1 = 3\n", + "rsi_para0 = 5\n", + "atr_para0 = 10\n", + "sod_para = 2\n", + "mflow_para = 3\n", + "findex_para = 5\n", + "adis_para = 5\n", + "ccindex_para = 5\n", + "bvol_para = 3\n", + "vindex_para = 3\n", + "mindex_para0 = 10\n", + "mindex_para1 = 15\n", + "tindex_para0 = 5\n", + "tindex_para1 = 10\n", + "emove_para = 5\n", + "cc_para = 15\n", + "kchannel_para = 10\n", + "indicator_conf = {\n", + " \"indicators\": [\n", + " {\"function\": \"port_chaikin_oscillator\",\n", + " \"columns\": [\"high\", \"low\", \"close\", \"volume\"],\n", + " \"args\": [chaikin_para0, chaikin_para1]\n", + " },\n", + " {\"function\": \"port_bollinger_bands\",\n", + " \"columns\": [\"close\"],\n", + " \"args\": [bollinger_para],\n", + " \"outputs\": [\"b1\", \"b2\"]\n", + " },\n", + " {\"function\": \"port_macd\",\n", + " \"columns\": [\"close\"],\n", + " \"args\": [macd_para0, macd_para1],\n", + " \"outputs\": [\"MACDsign\", \"MACDdiff\"]\n", + " },\n", + " {\"function\": \"port_relative_strength_index\",\n", + " \"columns\": [\"high\", \"low\"],\n", + " \"args\": [rsi_para0],\n", + " },\n", + " {\"function\": \"port_average_true_range\",\n", + " \"columns\": [\"high\", \"low\", \"close\"],\n", + " \"args\": [atr_para0],\n", + " },\n", + " {\"function\": \"port_stochastic_oscillator_k\",\n", + " \"columns\": [\"high\", \"low\", \"close\"],\n", + " \"args\": [],\n", + " },\n", + " {\"function\": \"port_stochastic_oscillator_d\",\n", + " \"columns\": [\"high\", \"low\", \"close\"],\n", + " \"args\": [sod_para],\n", + " },\n", + " {\"function\": \"port_money_flow_index\",\n", + " 
\"columns\": [\"high\", \"low\", \"close\", \"volume\"],\n", + " \"args\": [mflow_para],\n", + " },\n", + " {\"function\": \"port_force_index\",\n", + " \"columns\": [\"close\", \"volume\"],\n", + " \"args\": [findex_para],\n", + " },\n", + " {\"function\": \"port_ultimate_oscillator\",\n", + " \"columns\": [\"high\",\"low\",\"close\"],\n", + " \"args\": [],\n", + " },\n", + " {\"function\": \"port_accumulation_distribution\",\n", + " \"columns\": [\"high\",\"low\",\"close\",\"volume\"],\n", + " \"args\": [adis_para],\n", + " },\n", + " {\"function\": \"port_commodity_channel_index\",\n", + " \"columns\": [\"high\",\"low\",\"close\"],\n", + " \"args\": [ccindex_para],\n", + " },\n", + " {\"function\": \"port_on_balance_volume\",\n", + " \"columns\": [\"close\", \"volume\"],\n", + " \"args\": [bvol_para],\n", + " },\n", + " {\"function\": \"port_vortex_indicator\",\n", + " \"columns\": [\"high\", \"low\", \"close\"],\n", + " \"args\": [vindex_para],\n", + " },\n", + " {\"function\": \"port_kst_oscillator\",\n", + " \"columns\": [\"close\"],\n", + " \"args\": [3, 4, 5, 6, 7, 8, 9, 10],\n", + " },\n", + " {\"function\": \"port_mass_index\",\n", + " \"columns\": [\"high\", \"low\"],\n", + " \"args\": [mindex_para0, mindex_para1],\n", + " },\n", + " {\"function\": \"port_true_strength_index\",\n", + " \"columns\": [\"close\"],\n", + " \"args\": [tindex_para0, tindex_para1],\n", + " },\n", + " {\"function\": \"port_ease_of_movement\",\n", + " \"columns\": [\"high\", \"low\", \"volume\"],\n", + " \"args\": [emove_para],\n", + " },\n", + " {\"function\": \"port_coppock_curve\",\n", + " \"columns\": [\"close\"],\n", + " \"args\": [cc_para],\n", + " },\n", + " {\"function\": \"port_keltner_channel\",\n", + " \"columns\": [\"high\", \"low\", \"close\"],\n", + " \"args\": [kchannel_para],\n", + " \"outputs\": [\"KelChD\", \"KelChM\", \"KelChU\"]\n", + " },\n", + " {\"function\": \"port_ppsr\",\n", + " \"columns\": [\"high\", \"low\", \"close\"],\n", + " \"args\": [],\n", + " 
\"outputs\": [\"PP\", \"R1\", \"S1\", \"R2\", \"S2\", \"R3\", \"S3\"]\n", + " },\n", + " {\"function\": \"port_shift\",\n", + " \"columns\": [\"returns\"],\n", + " \"args\": [-1]\n", + " } \n", + " ],\n", + " \"remove_na\": True\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "49e4668f4fdf4a5cb7656b4fe42b7fd0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(cache={'nodes': [{'width': 140, 'id': 'stock_data', 'type': 'CsvStockLoader', 'schema': {'title':…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the backtesting again" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:technical_indicator process time:2.659s\n", + "id:xgboost process time:5.527s\n", + "id:backtest process time:0.002s\n", + "id:train_df process time:0.007s\n", + "id:portfolio_opt_train process time:0.011s\n", + "id:sharpe_ratio_trn process time:0.001s\n", + "id:cumulative_return_trn process time:0.015s\n", + "id:validation_df process time:0.005s\n", + "id:portfolio_opt_validation process time:0.009s\n", + "id:sharpe_ratio_val process time:0.001s\n", + "id:cumulative_return_val process time:0.015s\n", + "CPU times: user 7.86 s, sys: 1.37 s, total: 9.22 s\n", + "Wall time: 8.89 s\n" + ] + } + ], + "source": [ + "%%time\n", + "replace_spec = {}\n", + "replace_spec['technical_indicator'] = {\"conf\": indicator_conf}\n", + "\n", + "replace_spec['sort_after'] = {\"load\": {'out': cached_sort}}\n", + "\n", + "o_gpu = task_graph.run(replace=replace_spec, profile=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "c37ca8d7dc8a487fbe99d187748ecb48", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale(), side=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_figures(o_gpu)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We get Sharpe Ratio of `1.93` in the testing dataset, not bad!\n", + "\n", + "Using `min_volume=400.0`, it selects 1558 stocks. Setting a lower threshhold, it can include more stocks for the backtesting and hence increase the Sharpe Ratio. But it runs out of memory of single GPU. We have shown Dask can help to break down the large task into small tasks and schedule them a distributed environment. So we can handle dataset of any sizes in this way:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:stock_data process time:0.007s\n", + "id:preprocess process time:5.938s\n", + "id:xgboost process time:9.109s\n", + "id:backtest process time:0.010s\n", + "id:train_df process time:0.007s\n", + "id:portfolio_opt_train process time:0.032s\n", + "id:sharpe_ratio_trn process time:2.444s\n", + "id:cumulative_return_trn process time:2.464s\n", + "id:validation_df process time:0.006s\n", + "id:portfolio_opt_validation process time:0.032s\n", + "id:sharpe_ratio_val process time:2.344s\n", + "id:cumulative_return_val process time:2.389s\n", + "CPU times: user 10 s, sys: 2.84 s, total: 12.9 s\n", + "Wall time: 43.4 s\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4f37ade2840b4a6db394f4c07f5bca47", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale()), Axis…" + 
] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%time\n", + "min_volume = 4.0\n", + "min_rate = -10.0\n", + "max_rate = 10.0\n", + "replace_spec={}\n", + "replace_spec['technical_indicator'] = {\"conf\": indicator_conf}\n", + "\n", + "replace_spec['node_filterValue']={\"conf\": [{\"column\": \"volume_mean\", \"min\": min_volume},\n", + " {\"column\": \"returns_max\", \"max\": max_rate},\n", + " {\"column\": \"returns_min\", \"min\": min_rate}]}\n", + "replace_spec['preprocess'] = {\"conf\": {\"subnodes_conf\": {\n", + " \"value_filter\": {\n", + " \"conf\": [{\"column\": \"average_volume\", \"min\": min_volume},\n", + " {\"column\": \"max_return\", \"max\": max_rate},\n", + " {\"column\": \"min_return\", \"min\": min_rate}]\n", + " },\n", + " \"drop_columns\": {\n", + " \"conf\": {\n", + " \"columns\": [\"average_volume\", \"min_return\", \"max_return\"]\n", + " }\n", + " }\n", + " },\n", + " \"taskgraph\": \"taskgraphs/preprocess.gq.yaml\",\n", + " \"input\": [\"sort_node.in\"],\n", + " \"output\": [\"drop_columns.out\"]\n", + " },\n", + " \"inputs\": {\"sort_node@in\": \"stock_data.dask_cudf_out\"}}\n", + "\n", + "o_gpu = task_graph.run(replace=replace_spec, profile=True)\n", + "plot_figures(o_gpu)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dd477080878b49c28174a3c3b1218369", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale()), Axis…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_figures(o_gpu)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We get Sharpe Ratio of `4.7` in the testing dataset. This is a great improvement!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Strategy parameter search\n", + "Quantitative analysts usually need to explore different parameters for their trading strategies. The exploration is an iterative process. gQuant helps to speed it up by allowing the use of cached dataframes and the evaluation of sub-graphs.\n", + "\n", + "To find the optimal technical indicator parameters for this XGBoost strategy, we build a widget to search the parameters interactively. " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4907f1afde0445f6bea3c99150c57d5a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(VBox(children=(IntRangeSlider(value=(10, 20), continuous_update=False, description='Chaikin', m…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import plotutils\n", + "replace_spec={}\n", + "replace_spec['technical_indicator'] = {\"conf\": indicator_conf}\n", + "replace_spec['sort_after'] = {\"load\": {'out': cached_sort}}\n", + "plotutils.getXGBoostWidget(replace_spec, task_graph, plot_figures)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conclusions\n", + "In this notebook, we demoed how to use gQuant to backtest an XGBoost trading strategy. It is convenient and efficient to use the indicator node from gQuant to compute features for all the stocks in the dataset in the GPU. The XGBoost training is computed in the GPU, so we can get the results quickly. This example shows the XGBoost algorithm's power in finding trading signals. We can achieve a raw Sharpe ratio close to 2 in the testing time period." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'ok', 'restart': True}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/gQuant/plugins/rapids_plugin/notebooks/07_fractional_differencing.ipynb b/gQuant/plugins/rapids_plugin/notebooks/07_fractional_differencing.ipynb new file mode 100644 index 00000000..4743f05e --- /dev/null +++ b/gQuant/plugins/rapids_plugin/notebooks/07_fractional_differencing.ipynb @@ -0,0 +1,806 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fractional Differencing\n", + "\n", + "### Background\n", + "Fractional Differencing is a signal processing technique that is used to remove the non-stationarity from the time series while maintaining as much memory as possible. It is widely used in FSI to prepare training data for machine learning algorithms. In this [open-source project](https://github.com/ritchieng/fractional_differencing_gpu/blob/master/notebooks/gpu_fractional_differencing.ipynb) done by Ensemble Capital, fractional differencing computation is accelerated via `cudf.apply_chunks` method in the GPU. 
It achieves hundreds of times acceleration compared with the CPU implementation in their [report](https://www.researchgate.net/publication/335159299_GFD_GPU_Fractional_Differencing_for_Rapid_Large-scale_Stationarizing_of_Time_Series_Data_while_Minimizing_Memory_Loss). \n", + "Using the `apply_rows` and `apply_chunks` methods from the cudf library is the easiest way of customizing GPU computations, as covered in this [blog](https://medium.com/rapids-ai/user-defined-functions-in-rapids-cudf-2d7c3fc2728d). However, it is not the most efficient way.\n", + "\n", + "In this notebook, we are going to show how to use Numba to do fractional differencing computation efficiently. Because gQuant wraps the fractional differencing function in a computation node, we are also going to show that it is easy for data scientists to compute fractional differencing signals and use them to generate alpha signals. \n", + "\n", + "### Environment Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys; sys.path.insert(0, '..')\n", + "\n", + "import warnings\n", + "import gquant\n", + "import ipywidgets as widgets\n", + "import os\n", + "import time\n", + "import numpy as np\n", + "from numba import cuda\n", + "import cudf\n", + "import inspect\n", + "from numba import njit\n", + "from numba import prange\n", + "from gquant.dataframe_flow.task import load_modules\n", + "from gquant_rapids_plugin.cuindicator import get_weights_floored, fractional_diff\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy the fractional differencing code from the [open-source project](https://github.com/ritchieng/fractional_differencing_gpu/blob/master/notebooks/gpu_fractional_differencing.ipynb). 
We will use this as our benchmark reference" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def moving_dot_product_kernel(in_data, out, window_size, weights):\n", + " # Set the first window_size-1 rows in each chunk to np.nan due \n", + " # insufficient history\n", + " for i in range(cuda.threadIdx.x, window_size - 1, cuda.blockDim.x):\n", + " out[i] = np.nan\n", + " \n", + " # Compute dot product of preceding window_size rows\n", + " for i in range(cuda.threadIdx.x + window_size - 1, in_data.size, cuda.blockDim.x):\n", + " rolling_dot_product = 0.0\n", + " \n", + " k = 0\n", + " for j in range(i - window_size + 1, i + 1):\n", + " rolling_dot_product += in_data[j] * weights[k][0]\n", + " k += 1\n", + " \n", + " out[i] = rolling_dot_product \n", + " \n", + "def frac_diff_gpu(df, d, floor=1e-3):\n", + " r\"\"\"Fractionally difference time series via GPU.\n", + " \n", + " Args:\n", + " df (pd.DataFrame): dataframe of raw time series values.\n", + " d (float): differencing value from 0 to 1 where > 1 has no FD.\n", + " floor (float): minimum value of weights, ignoring anything smaller.\n", + " \"\"\"\n", + " \n", + " # Bring dataframe to GPU, reset index for GPU dot product kernel\n", + " # gdf_raw = cudf.from_pandas(df).reset_index(drop=True)\n", + " gdf_raw = df\n", + " gdf_raw.columns = ['in_data']\n", + "\n", + " # Get weights window\n", + " weights = get_weights_floored(d=d, num_k=len(gdf_raw), floor=floor)\n", + " weights_window_size = len(weights)\n", + " \n", + " # Reverse weights and as contiguous\n", + " weights = np.ascontiguousarray(weights[::-1])\n", + " \n", + " # Bring weights to GPU\n", + " gdf_weights = cudf.DataFrame()\n", + " gdf_weights[gdf_raw.columns[0]] = weights.reshape(-1)\n", + "\n", + " # Length of data\n", + " data_length = len(gdf_raw)\n", + "\n", + " # T4: max of 518 threads per block.\n", + " # V100: max 1024 threads per block\n", + " threads_per_block = 518\n", + "\n", + " 
# Chunk size split\n", + " # This has to be improved, but as a v0.1, it's sufficient to show speed-up\n", + " # Up to easily 100 million data points\n", + " trunk_size = data_length\n", + "\n", + " # Get fractionally differenced time series through GPU function\n", + " gdf_raw_fd = gdf_raw.apply_chunks(moving_dot_product_kernel,\n", + " incols=['in_data'],\n", + " outcols=dict(out=np.float64),\n", + " kwargs=dict(window_size=weights_window_size, weights=weights),\n", + " chunks=list(range(0, data_length, trunk_size)) + [data_length],\n", + " tpb=threads_per_block)\n", + " \n", + " # Bring to CPU for normal manipulation\n", + " # df_raw_fd = gdf_raw_fd.to_pandas().dropna().iloc[:-1, 1]\n", + " \n", + " return gdf_raw_fd, weights" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Following is the gQuant's fractional differencing implementation via Numba library" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def fractional_diff(input_arr, d=0.5, floor=1e-3, min_periods=None,\n", + " thread_tile=2, number_of_threads=512):\n", + " \"\"\"\n", + " The fractional difference computation method.\n", + "\n", + " Arguments:\n", + " -------\n", + " input_arr: numba.cuda.DeviceNDArray or cudf.Series\n", + " the input array to compute the fractional difference\n", + " d: float\n", + " the differencing value. range from 0 to 1\n", + " floor: float\n", + " minimum value for the weights for computational efficiency.\n", + " min_periods: int\n", + " default the lengths of the weights. 
Need at least min_periods of\n", + " non-na elements to get fractional difference value\n", + " thread_tile: int\n", + " each thread will be responsible for `thread_tile` number of\n", + " elements in window computation\n", + " number_of_threads: int\n", + " number of threads in a block for CUDA computation\n", + "\n", + " Returns\n", + " -------\n", + " (numba.cuda.DeviceNDArray, np.array)\n", + " the computed fractional difference array and the weight array tuple\n", + "\n", + " \"\"\"\n", + " if isinstance(input_arr, numba.cuda.cudadrv.devicearray.DeviceNDArray):\n", + " gpu_in = input_arr\n", + " else:\n", + " gpu_in = input_arr.to_gpu_array()\n", + "\n", + " # compute the weights for the fractional difference\n", + " weights = get_weights_floored(d=d,\n", + " num_k=len(input_arr),\n", + " floor=floor)[::-1, 0]\n", + " weights_out = np.ascontiguousarray(weights)\n", + " weights = numba.cuda.to_device(weights_out)\n", + "\n", + " window = len(weights)\n", + "\n", + " if min_periods is None:\n", + " min_periods = window\n", + " else:\n", + " min_periods = min_periods\n", + "\n", + " number_of_threads = number_of_threads\n", + " array_len = len(gpu_in)\n", + "\n", + " # allocate the output array\n", + " gpu_out = numba.cuda.device_array_like(gpu_in)\n", + "\n", + " number_of_blocks = \\\n", + " (array_len + (number_of_threads * thread_tile - 1)) // \\\n", + " (number_of_threads * thread_tile)\n", + "\n", + " shared_buffer_size = (number_of_threads * thread_tile +\n", + " window - 1 + window)\n", + "\n", + " # call the conv kernel\n", + " kernel[(number_of_blocks,),\n", + " (number_of_threads,),\n", + " 0,\n", + " shared_buffer_size * 8](gpu_in,\n", + " weights,\n", + " gpu_out,\n", + " window,\n", + " array_len,\n", + " thread_tile,\n", + " min_periods)\n", + " return gpu_out, weights_out\n", + "\n" + ] + } + ], + "source": [ + "print(inspect.getsource(fractional_diff))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It launches the 
Numba kernel, which defined as:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "@cuda.jit(device=True)\n", + "def conv_window(shared, history_len, out_arr, window_size,\n", + " arr_len, offset, offset2, min_size):\n", + " \"\"\"\n", + " This function is to do convolution for one thread\n", + "\n", + " Arguments:\n", + " ------\n", + " shared: numba.cuda.DeviceNDArray\n", + " 3 chunks of data are stored in the shared memory\n", + " the first [0, window_size) elements is the chunk of data that is\n", + " necessary to compute the first convolution element.\n", + " then [window_size, window_size + thread_tile * blockDim) elements\n", + " are the inputs allocated for this block of threads\n", + " the last [window_size + thread_tile,\n", + " window_size + thread_tile + window_size) is to store the kernel values\n", + " history_len: int\n", + " total number of historical elements available for this chunk of data\n", + " out_arr: numba.cuda.DeviceNDArray\n", + " output gpu_array of size of `thread_tile`\n", + " window_size: int\n", + " the number of elements in the kernel\n", + " arr_len: int\n", + " the chunk array length, same as `thread_tile`\n", + " offset: int\n", + " indicate the starting index of the chunk array in the shared for\n", + " this thread.\n", + " offset: int\n", + " indicate the starting position of the weights/kernel array\n", + " min_size: int\n", + " the minimum number of non-na elements\n", + " \"\"\"\n", + " for i in range(arr_len):\n", + " if i + history_len < window_size-1:\n", + " out_arr[i] = np.nan\n", + " else:\n", + " s = 0.0\n", + " average_size = 0\n", + " for j in range(0, window_size):\n", + " if not (cmath.isnan(\n", + " shared[offset + i - j])):\n", + " s += (shared[offset + i - j] *\n", + " shared[offset2 + window_size - 1 - j])\n", + " average_size += 1\n", + " if average_size >= min_size:\n", + " out_arr[i] = s\n", + " else:\n", + " out_arr[i] = np.nan\n", + "\n", + " 
\n", + "@cuda.jit\n", + "def kernel(in_arr, weight_arr, out_arr, window,\n", + " arr_len, thread_tile, min_size):\n", + " \"\"\"\n", + " This kernel is to do 1D convlution on `in_arr` array with `weight_arr`\n", + " as kernel. The results is saved on `out_arr`.\n", + "\n", + " Arguments:\n", + " ------\n", + " in_arr: numba.cuda.DeviceNDArray\n", + " input gpu array\n", + " weight_arr: numba.cuda.DeviceNDArray\n", + " convolution kernel gpu array\n", + " out_arr: numba.cuda.DeviceNDArray\n", + " output gpu_array\n", + " window: int\n", + " the number of elements in the weight_arr\n", + " arr_len: int\n", + " the input/output array length\n", + " thread_tile: int\n", + " each thread is responsible for `thread_tile` number of elements\n", + " min_size: int\n", + " the minimum number of non-na elements\n", + " \"\"\"\n", + " shared = cuda.shared.array(shape=0,\n", + " dtype=numba.float64)\n", + " block_size = cuda.blockDim.x # total number of threads\n", + " tx = cuda.threadIdx.x\n", + " # Block id in a 1D grid\n", + " bid = cuda.blockIdx.x\n", + " starting_id = bid * block_size * thread_tile\n", + "\n", + " # copy the thread_tile * number_of_thread_per_block into the shared\n", + " for j in range(thread_tile):\n", + " offset = tx + j * block_size\n", + " if (starting_id + offset) < arr_len:\n", + " shared[offset + window - 1] = in_arr[\n", + " starting_id + offset]\n", + " cuda.syncthreads()\n", + "\n", + " # copy the window - 1 into the shared\n", + " for j in range(0, window - 1, block_size):\n", + " if (((tx + j) <\n", + " window - 1) and (\n", + " starting_id - window + 1 + tx + j >= 0)):\n", + " shared[tx + j] = \\\n", + " in_arr[starting_id - window + 1 + tx + j]\n", + " cuda.syncthreads()\n", + " # copy the weights into the shared\n", + " for j in range(0, window, block_size):\n", + " element_id = tx + j\n", + " if (((tx + j) < window) and (element_id < window)):\n", + " shared[thread_tile * block_size + window - 1 + tx +\n", + " j] = weight_arr[tx + j]\n", + 
" cuda.syncthreads()\n", + " # slice the shared memory for each threads\n", + " start_shared = tx * thread_tile\n", + " his_len = min(window - 1,\n", + " starting_id + tx * thread_tile)\n", + " # slice the global memory for each threads\n", + " start = starting_id + tx * thread_tile\n", + " end = min(starting_id + (tx + 1) * thread_tile, arr_len)\n", + " sub_outarr = out_arr[start:end]\n", + " sub_len = end - start\n", + " conv_window(shared, his_len, sub_outarr,\n", + " window, sub_len,\n", + " window - 1 + start_shared,\n", + " thread_tile * block_size + window - 1,\n", + " min_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fractional differencing is essentially doing 1D convolution computation with the kernel values set to be the weights computed from get_weights_floored. Check the original notebook for the details of the meanings of the weights. To make convolution computation faster, we divide the long input array into small chunks and send to different thread blocks. All the array chunks and the weights are loaded into the GPU shared memory for fast IO. The device function conv_window is doing the convolution computation for one thread.\n", + "\n", + "To make a fair comparsion with CPU implementation, we implemented an efficient CPU version of the fractional differencing calculation. It is accelerated by numba.njit that take advantage of multiple cores of the CPU and fastmath compiler optimization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "@njit(fastmath=True, parallel=True)\n", + "def moving_dot_product_cpu(in_data, out, window_size, weights):\n", + " # Set the first window_size-1 rows in each chunk to np.nan due \n", + " # insufficient history\n", + " for i in prange(0, window_size - 1):\n", + " out[i] = np.nan\n", + " \n", + " # Compute dot product of preceding window_size rows\n", + " for i in prange(window_size - 1, len(in_data)):\n", + " rolling_dot_product = 0.0\n", + " \n", + " k = 0\n", + " for j in range(i - window_size + 1, i + 1):\n", + " rolling_dot_product += in_data[j] * weights[k]\n", + " k += 1\n", + " \n", + " out[i] = rolling_dot_product \n", + "\n", + "def cpu_fractional_diff(input_arr, d=0.5, floor=1e-3):\n", + "\n", + " # compute the weights for the fractional difference\n", + " weights = get_weights_floored(d=d,\n", + " num_k=len(input_arr),\n", + " floor=floor)[::-1, 0]\n", + " weights_out = np.ascontiguousarray(weights)\n", + " weights = weights_out\n", + " weights_window_size = len(weights)\n", + " window = len(weights)\n", + " out = np.zeros_like(input_arr)\n", + " moving_dot_product_cpu(input_arr, out, weights_window_size, weights)\n", + " return out" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fractional differencing is essentially doing 1D convolution computation with the kernel values set to be the weights computed from `get_weights_floored`. Check the original [notebook](https://github.com/ritchieng/fractional_differencing_gpu/blob/master/notebooks/gpu_fractional_differencing.ipynb) for the details of the meanings of the weights. To make convolution computation faster, we divide the long input array into small chunks and send to different thread blocks. All the array chunks and the weights are loaded into the GPU shared memory for fast IO. 
The device function `conv_window` is doing the convolution computation for one thread.\n", + "\n", + "We can compare the performance of gQuant GPU implementation vs the original one and CPU implementation:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "array size 100000, Ensemble: time 0.273 s, gQuant GPU Time 0.374 s, gQuant CPU Time 0.499, speed up 0.73, speed up vs CPU 1.33, error 0.0000 \n", + "array size 1000000, Ensemble: time 0.144 s, gQuant GPU Time 0.007 s, gQuant CPU Time 0.035, speed up 20.04, speed up vs CPU 4.83, error 0.0000 \n", + "array size 10000000, Ensemble: time 1.023 s, gQuant GPU Time 0.031 s, gQuant CPU Time 0.125, speed up 32.87, speed up vs CPU 4.02, error 0.0000 \n", + "array size 100000000, Ensemble: time 10.207 s, gQuant GPU Time 0.267 s, gQuant CPU Time 1.057, speed up 38.18, speed up vs CPU 3.95, error 0.0000 \n" + ] + } + ], + "source": [ + "for i in range(5, 9):\n", + " df_raw = cudf.DataFrame()\n", + " ran_array = np.random.rand(10**int(i))\n", + " df_raw['in'] = ran_array\n", + " df_raw2 = cudf.DataFrame()\n", + " df_raw2['in'] = ran_array\n", + "\n", + " # Start timer\n", + " start = time.time()\n", + " df_raw_fd_from_gpu, weights = frac_diff_gpu(df_raw, d=0.5, floor=5e-5)\n", + " # End timer\n", + " end = time.time()\n", + " duration = end - start\n", + "\n", + " start = time.time()\n", + " gquant_gpu, weights = fractional_diff(df_raw2['in'], d=0.5, floor=5e-5)\n", + " cuda.synchronize()\n", + " end = time.time()\n", + " optimized_duration = end - start\n", + " #(df_raw_fd_from_gpu.values)\n", + " \n", + " \n", + " start = time.time()\n", + " cpu_result = cpu_fractional_diff(ran_array, d=0.5, floor=5e-5)\n", + " end = time.time()\n", + " cpu_duration = end - start\n", + " \n", + " err = np.abs(df_raw_fd_from_gpu['out'].to_array()[weights.size-1:] - np.array(gquant_gpu)[weights.size-1:]).max()\n", + " err = 
max(np.abs(df_raw_fd_from_gpu['out'].to_array()[weights.size-1:] - cpu_result[weights.size-1:]).max(), err)\n", + " print('array size %d, Ensemble: time %.3f s, gQuant GPU Time %.3f s, gQuant CPU Time %.3f, speed up %.2f, speed up vs CPU %.2f, error %.4f ' % (10**int(i), duration, optimized_duration, cpu_duration, duration / optimized_duration, cpu_duration/optimized_duration, err))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the array of length 100m, gQuant can achieve 100x speedup compared with Ensemble Capital's GPU implementation and 30x speed up compared with a multi-core CPU." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use the fractional differencing signal to trade stocks\n", + "\n", + "We will use the same [XGBoost example](https://github.com/rapidsai/gQuant/blob/master/notebooks/06_xgboost_trade.ipynb) to run a backtest with fractional differencing signals. The workflow includes the following steps:\n", + "\n", + "1. Preprocess the datasets.\n", + "\n", + "2. Compute the features based on different fractional differencing signals of the closing prices of the stocks \n", + "\n", + "3. Split the data into training and testing sets and build an XGBoost model based on the training data. From the XGBoost model, compute the trading signals for all the data points.\n", + "\n", + "4. Run backtesting and compute the returns from this strategy for each of the days and stock symbols \n", + "\n", + "5. Run a simple portfolio optimization by averaging the stocks together for each of the trading days.\n", + "\n", + "6. Compute the Sharpe ratio and cumulative return results for both training and testing datasets\n", + "\n", + "The whole workflow can be organized into a computation graph, which is fully described in a yaml file. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each node has a unique id, a node type, configuration parameters and input node ids. 
gQuant takes this yaml file and wires it into a graph so that it can be visualized. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d1a3e7ad5d224709a8e092c3e4140b36", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%reset -s -f\n", + "import sys\n", + "import os\n", + "sys.path.append('..')\n", + "import gquant\n", + "from gquant.dataframe_flow import TaskGraph\n", + "import ipywidgets as widgets\n", + "import warnings\n", + "warnings.simplefilter(\"ignore\")\n", + "\n", + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/xgboost_trade.gq.yaml')\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The features used for the XGBoost algorithm are prepared in the `xgboost` Task node, where the `cuIndicator` module is used to compute the technical indicators in the GPU for all the stock symbols. `xgboost` is the Task node that is used to compute the trading signals from the stock technical indicators. Each gQuant Task node is implemented by overriding the `meta_setup`, `process`, `ports_setup`, and `conf_schema` methods of the Node base class. Please refer to the [customize nodes notebook](https://github.com/rapidsai/gQuant/blob/master/notebooks/05_customize_nodes.ipynb) for details. 
Following is the source code for \"XGBoostStrategyNode\":" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# define the function to format the plots\n", + "def plot_figures(o):\n", + " # format the figures\n", + " figure_width = '1200px'\n", + " figure_height = '400px'\n", + " sharpe_number = o['sharpe_ratio_trn.sharpe_out']\n", + " cum_return_train = o['cumulative_return_trn.cum_return']\n", + " cum_return_train.layout.height = figure_height\n", + " cum_return_train.layout.width = figure_width\n", + " cum_return_train.title = 'Training P & L %.3f' % (sharpe_number)\n", + " sharpe_number = o['sharpe_ratio_val.sharpe_out']\n", + " cum_return_test = o['cumulative_return_val.cum_return']\n", + " cum_return_test.layout.height = figure_height\n", + " cum_return_test.layout.width = figure_width\n", + " cum_return_test.title = 'Testing P & L %.3f' % (sharpe_number)\n", + " return widgets.VBox([cum_return_train, cum_return_test])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we are going to add 5 fractional differencing signals from the closing prices " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "indicator_conf = {\n", + " \"indicators\": [\n", + " {\"function\": \"port_fractional_diff\",\n", + " \"columns\": [\"close\"],\n", + " \"args\": [0.5]\n", + " },\n", + " {\"function\": \"port_fractional_diff\",\n", + " \"columns\": [\"close\"],\n", + " \"args\": [0.3]\n", + " },\n", + " {\"function\": \"port_fractional_diff\",\n", + " \"columns\": [\"close\"],\n", + " \"args\": [0.1]\n", + " },\n", + " {\"function\": \"port_fractional_diff\",\n", + " \"columns\": [\"close\"],\n", + " \"args\": [0.7]\n", + " },\n", + " {\"function\": \"port_fractional_diff\",\n", + " \"columns\": [\"close\"],\n", + " \"args\": [0.9]\n", + " },\n", + " {\"function\": \"port_shift\",\n", + " \"columns\": [\"returns\"],\n", 
+ " \"args\": [-1]\n", + " } \n", + " ],\n", + " \"remove_na\": True\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the backtest" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:stock_data process time:4.146s\n", + "id:preprocess process time:0.752s\n", + "id:sort_after process time:0.056s\n", + "id:technical_indicator process time:0.459s\n", + "id:xgboost process time:2.345s\n", + "id:backtest process time:0.001s\n", + "id:train_df process time:0.144s\n", + "id:portfolio_opt_train process time:0.011s\n", + "id:sharpe_ratio_trn process time:0.001s\n", + "id:cumulative_return_trn process time:0.015s\n", + "id:validation_df process time:0.005s\n", + "id:portfolio_opt_validation process time:0.009s\n", + "id:sharpe_ratio_val process time:0.001s\n", + "id:cumulative_return_val process time:0.014s\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "aabd384e30614c609ee3f881b974d53c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale()), Axis…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "replace_spec = {}\n", + "replace_spec['technical_indicator'] = {\"conf\": indicator_conf}\n", + "\n", + "o_gpu = task_graph.run(replace=replace_spec, profile=True)\n", + "\n", + "plot_figures(o_gpu)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We get Sharpe Ratio of `1.01` just from the fractional differencing signals of the closing prices\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To visualize the computed fractional differencing signals, we can make a TaskGraph to visualize it. We put the XGboost trade TaskGraph into a composite node. 
We select the asset with id `22123` and plot 4 fractional differencing signals with different `d` values. Check the updated graph below. Note, there are 2 layers of composite nodes in the following graph. As you can see, a composite node is a powerful way of organizing TaskGraphs. " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a75a2bfde85b4e9f82674b7c8f98f12a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph = TaskGraph.load_taskgraph('../taskgraphs/visualize_frac_diff.gq.yaml')\n", + "task_graph.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b3c117b3f5634731938dbdd0fe2525c1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tab(children=(Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_type': 'stream', 'na…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "task_graph.run(formated=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can run the sub-graph just for plotting the signals." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that a signal with a smaller `d` value retains more memory information but is not as stationary as the signals with higher `d` values. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conclusions\n", + "In this notebook, we demoed how to use Numba to implement fractional differencing calculation in GPU. 
It achieves 100x speed up compared with the method done by Ensemble Capital. We also showed it is easy to use gQuant to compute fractional difference and run backtests" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'ok', 'restart': True}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/08_gquant_machine_learning.ipynb b/gQuant/plugins/rapids_plugin/notebooks/08_gquant_machine_learning.ipynb similarity index 95% rename from notebooks/08_gquant_machine_learning.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/08_gquant_machine_learning.ipynb index a5ac9eb1..84796421 100644 --- a/notebooks/08_gquant_machine_learning.ipynb +++ b/gQuant/plugins/rapids_plugin/notebooks/08_gquant_machine_learning.ipynb @@ -59,7 +59,7 @@ "\n", "

Client

\n", "\n", "\n", @@ -75,7 +75,7 @@ "" ], "text/plain": [ - "" + "" ] }, "execution_count": 2, @@ -114,18 +114,18 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "34f39e2a8ef64d758d9929b6aff44abe", + "model_id": "e95e03a0301344d0ae20857a659bfd34", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'data_gen'), ('type', 'ClassificationData'), ('conf', {'n_…" + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'data_gen'), ('type', 'ClassificationData'), ('conf', {'cl…" ] }, "metadata": {}, @@ -165,13 +165,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "20ac20a0160b4c9599eb6792d1fec45c", + "model_id": "7d0d0d15571341ee9564159a88be0db1", "version_major": 2, "version_minor": 0 }, @@ -208,18 +208,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "550036d1896c4125b25470468e47d9e8", + "model_id": "1caa94fded3c424db5b619d05173757b", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'data_gen'), ('type', 'ClassificationData'), ('conf', {'n_…" + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'data_gen'), ('type', 'ClassificationData'), ('conf', {'cl…" ] }, "metadata": {}, @@ -245,18 +245,18 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9a22506effe94bd489930088de60a9f4", + "model_id": "f1d8da9435864f7f9f4bb308c84d0711", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "GQuantWidget(sub=HBox(), 
value=[OrderedDict([('id', 'data_gen'), ('type', 'ClassificationData'), ('conf', {'n_…" + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'data_gen'), ('type', 'ClassificationData'), ('conf', {'cl…" ] }, "metadata": {}, @@ -285,18 +285,18 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cef6af7d426746abaa6e08aed8b987f9", + "model_id": "3d64b8d9b6fb4db989fa399790e75bd5", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'data_gen'), ('type', 'ClassificationData'), ('conf', {'n_…" + "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'data_gen'), ('type', 'ClassificationData'), ('conf', {'cl…" ] }, "metadata": {}, @@ -463,7 +463,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.8.2" } }, "nbformat": 4, diff --git a/notebooks/09_gquant_machine_hpo.ipynb b/gQuant/plugins/rapids_plugin/notebooks/09_gquant_machine_hpo.ipynb similarity index 100% rename from notebooks/09_gquant_machine_hpo.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/09_gquant_machine_hpo.ipynb diff --git a/notebooks/11_streamz.ipynb b/gQuant/plugins/rapids_plugin/notebooks/10_streamz.ipynb.ipynb similarity index 99% rename from notebooks/11_streamz.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/10_streamz.ipynb.ipynb index 1e235447..5fc5166d 100644 --- a/notebooks/11_streamz.ipynb +++ b/gQuant/plugins/rapids_plugin/notebooks/10_streamz.ipynb.ipynb @@ -105,7 +105,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As shown above, this source node has one output port which outputs `streamz.Stream` object. It has no configuration, so we just leave it empty. In the `meta_setup` method, we specify that the element type in the stream is a number. 
The down-stream node can connect to it only if its `self.required` has the same key value pair. The `Stream` could stream many types of data and just knowing that something is a `streamz.Stream` is not enough to know what is actually being streamed. Using columns setup enables one to implement meta-typecking enforcement i.e. above one expects the stream output to contain `{'element': 'number'}` which is just some custom type specification. When the outputs are dataframes (pandas, cudf, dask-dataframes) then the columns has a concrete meaning i.e. what columns are present and what are their types in the dataframes.\n", + "As shown above, this source node has one output port which outputs `streamz.Stream` object. It has no configuration, so we just leave it empty. In the `meta_setup` method, we specify that the element type in the stream is a number. The down-stream node can connect to it only if its `MetaData.inports` has the same key value pair. The `Stream` could stream many types of data and just knowing that something is a `streamz.Stream` is not enough to know what is actually being streamed. Using columns setup enables one to implement meta-typechecking enforcement i.e. above one expects the stream output to contain `{'element': 'number'}` which is just some custom type specification. When the outputs are dataframes (pandas, cudf, dask-dataframes) then the columns have a concrete meaning i.e. what columns are present and what are their types in the dataframes.\n", "\n", "In the `process` method, `StreamNode` outputs the `stream.Stream()` as the source end of the pipeline. Later, we will use the `emit` method to add numbers to it." ] @@ -173,7 +173,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`TransformNode` definition is similar to `StreamNode`, but it has an input port `stream_in`. It defines the key value pair `element->number` in the `self.required` dictionary, so it is compatible to connect to the source node. 
\n", + "`TransformNode` definition is similar to `StreamNode`, but it has an input port `stream_in`. It defines the key value pair `element->number` in the `MetaData.inports` dictionary, so it is compatible to connect to the source node. \n", "\n", "In the `process` method, it maps each elements in the stream by `double` function." ] @@ -369,7 +369,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "511ae5e7169349d4935b6fdbde89c721", + "model_id": "d388c2b064f949fdb5be20ebe0717bdc", "version_major": 2, "version_minor": 0 }, @@ -402,7 +402,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6137f8f7ea194e5b970179ebb3837b1f", + "model_id": "0aabb8adc0d842eabdd3d38dd78eaca5", "version_major": 2, "version_minor": 0 }, @@ -681,7 +681,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e77a11291ffd4ab29c3c60a61133ebc5", + "model_id": "57bb637730d74bd19969a98aebc57b16", "version_major": 2, "version_minor": 0 }, @@ -731,12 +731,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0cfe7e0d461f4f2c8ee8c688ce0e92bf", + "model_id": "2872e0f7dbe34d25804f03616ca90be0", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Figure(animation_duration=10, axes=[Axis(label='x', scale=LinearScale()), Axis(label='y', orientation='vertica…" + "Figure(animation_duration=10, axes=[Axis(label='x', scale=LinearScale(), side='bottom'), Axis(label='y', orien…" ] }, "metadata": {}, @@ -835,12 +835,6 @@ " }\n", " iports_conn = self.get_connected_inports()\n", " nports = len(iports_conn)\n", - " if nports > 2:\n", - " self.required = {\n", - " iport: {\n", - " 'element': 'numbers'\n", - " } for iport in iports_conn\n", - " }\n", " columns_out = {\n", " 'stream_out': {'element': 'numbers'}\n", " }\n", @@ -886,7 +880,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "628ac22e7afb4475a584e3521c3727d4", + "model_id": "eab5330a56954bccb8ba78429bda9166", 
"version_major": 2, "version_minor": 0 }, @@ -936,12 +930,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ea867d84b2f341858b906a34d652309b", + "model_id": "35326b6d4e6844baaf8618632a1f7773", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Figure(animation_duration=10, axes=[Axis(label='x', scale=LinearScale()), Axis(label='y', orientation='vertica…" + "Figure(animation_duration=10, axes=[Axis(label='x', scale=LinearScale(), side='bottom'), Axis(label='y', orien…" ] }, "metadata": {}, @@ -1280,7 +1274,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f48951f72e8544f4a13c1106a7d1b5d5", + "model_id": "2dcccf77e41e43429d97142fe23736e0", "version_major": 2, "version_minor": 0 }, @@ -1329,12 +1323,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5831237559254e978e910da8aadfead2", + "model_id": "98df014eec71488abb30b271ea2c3fdf", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Figure(animation_duration=10, axes=[Axis(label='x', scale=LinearScale()), Axis(label='y', orientation='vertica…" + "Figure(animation_duration=10, axes=[Axis(label='x', scale=LinearScale(), side='bottom'), Axis(label='y', orien…" ] }, "metadata": {}, @@ -4269,6 +4263,13 @@ "app.kernel.do_shutdown(True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -4293,7 +4294,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/asian_barrier_option/Makefile b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/Makefile similarity index 100% rename from notebooks/asian_barrier_option/Makefile rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/Makefile diff --git a/notebooks/asian_barrier_option/README.md 
b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/README.md similarity index 100% rename from notebooks/asian_barrier_option/README.md rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/README.md diff --git a/notebooks/asian_barrier_option/cuda_pricing.cu b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/cuda_pricing.cu similarity index 100% rename from notebooks/asian_barrier_option/cuda_pricing.cu rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/cuda_pricing.cu diff --git a/notebooks/asian_barrier_option/deep_learning_nemo.ipynb b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/deep_learning_nemo.ipynb similarity index 100% rename from notebooks/asian_barrier_option/deep_learning_nemo.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/deep_learning_nemo.ipynb diff --git a/notebooks/asian_barrier_option/deep_learning_option_1.ipynb b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/deep_learning_option_1.ipynb similarity index 100% rename from notebooks/asian_barrier_option/deep_learning_option_1.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/deep_learning_option_1.ipynb diff --git a/notebooks/asian_barrier_option/deep_learning_option_2.ipynb b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/deep_learning_option_2.ipynb similarity index 100% rename from notebooks/asian_barrier_option/deep_learning_option_2.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/deep_learning_option_2.ipynb diff --git a/notebooks/asian_barrier_option/docker/Dockerfile b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/docker/Dockerfile similarity index 100% rename from notebooks/asian_barrier_option/docker/Dockerfile rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/docker/Dockerfile diff --git a/notebooks/asian_barrier_option/download_data.sh 
b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/download_data.sh similarity index 100% rename from notebooks/asian_barrier_option/download_data.sh rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/download_data.sh diff --git a/notebooks/asian_barrier_option/elu_activation/CMakeLists.txt b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/CMakeLists.txt similarity index 100% rename from notebooks/asian_barrier_option/elu_activation/CMakeLists.txt rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/CMakeLists.txt diff --git a/notebooks/asian_barrier_option/elu_activation/log/common.h b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/log/common.h similarity index 100% rename from notebooks/asian_barrier_option/elu_activation/log/common.h rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/log/common.h diff --git a/notebooks/asian_barrier_option/elu_activation/log/logger.cpp b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/log/logger.cpp similarity index 100% rename from notebooks/asian_barrier_option/elu_activation/log/logger.cpp rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/log/logger.cpp diff --git a/notebooks/asian_barrier_option/elu_activation/log/logger.h b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/log/logger.h similarity index 100% rename from notebooks/asian_barrier_option/elu_activation/log/logger.h rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/log/logger.h diff --git a/notebooks/asian_barrier_option/elu_activation/log/logging.h b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/log/logging.h similarity index 100% rename from notebooks/asian_barrier_option/elu_activation/log/logging.h rename to 
gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/log/logging.h diff --git a/notebooks/asian_barrier_option/elu_activation/plugins/eluPlugin.cu b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/plugins/eluPlugin.cu similarity index 100% rename from notebooks/asian_barrier_option/elu_activation/plugins/eluPlugin.cu rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/plugins/eluPlugin.cu diff --git a/notebooks/asian_barrier_option/elu_activation/plugins/eluPlugin.h b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/plugins/eluPlugin.h similarity index 100% rename from notebooks/asian_barrier_option/elu_activation/plugins/eluPlugin.h rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/plugins/eluPlugin.h diff --git a/notebooks/asian_barrier_option/elu_activation/plugins/pluginKernels.h b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/plugins/pluginKernels.h similarity index 100% rename from notebooks/asian_barrier_option/elu_activation/plugins/pluginKernels.h rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/plugins/pluginKernels.h diff --git a/notebooks/asian_barrier_option/elu_activation/plugins/pluginUtil.h b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/plugins/pluginUtil.h similarity index 100% rename from notebooks/asian_barrier_option/elu_activation/plugins/pluginUtil.h rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/elu_activation/plugins/pluginUtil.h diff --git a/notebooks/asian_barrier_option/helper_cuda.h b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/helper_cuda.h similarity index 100% rename from notebooks/asian_barrier_option/helper_cuda.h rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/helper_cuda.h diff --git a/notebooks/asian_barrier_option/helper_string.h 
b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/helper_string.h similarity index 100% rename from notebooks/asian_barrier_option/helper_string.h rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/helper_string.h diff --git a/notebooks/asian_barrier_option/index.ipynb b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/index.ipynb similarity index 100% rename from notebooks/asian_barrier_option/index.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/index.ipynb diff --git a/notebooks/asian_barrier_option/mc_pricing.ipynb b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/mc_pricing.ipynb similarity index 100% rename from notebooks/asian_barrier_option/mc_pricing.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/mc_pricing.ipynb diff --git a/notebooks/asian_barrier_option/tensorrt.ipynb b/gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/tensorrt.ipynb similarity index 100% rename from notebooks/asian_barrier_option/tensorrt.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/asian_barrier_option/tensorrt.ipynb diff --git a/notebooks/cuIndicator/indicator_demo.ipynb b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/indicator_demo.ipynb similarity index 100% rename from notebooks/cuIndicator/indicator_demo.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/indicator_demo.ipynb diff --git a/notebooks/cuIndicator/rsi_perf.ipynb b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/rsi_perf.ipynb similarity index 100% rename from notebooks/cuIndicator/rsi_perf.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/rsi_perf.ipynb diff --git a/tests/unit/__init__.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/__init__.py similarity index 100% rename from tests/unit/__init__.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/__init__.py diff --git 
a/notebooks/cuIndicator/viz/accumulation_distribution.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/accumulation_distribution.py similarity index 100% rename from notebooks/cuIndicator/viz/accumulation_distribution.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/accumulation_distribution.py diff --git a/notebooks/cuIndicator/viz/admi.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/admi.py similarity index 100% rename from notebooks/cuIndicator/viz/admi.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/admi.py diff --git a/notebooks/cuIndicator/viz/average_true_range.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/average_true_range.py similarity index 100% rename from notebooks/cuIndicator/viz/average_true_range.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/average_true_range.py diff --git a/notebooks/cuIndicator/viz/bollinger_bands.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/bollinger_bands.py similarity index 100% rename from notebooks/cuIndicator/viz/bollinger_bands.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/bollinger_bands.py diff --git a/notebooks/cuIndicator/viz/ch_oscillator.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/ch_oscillator.py similarity index 100% rename from notebooks/cuIndicator/viz/ch_oscillator.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/ch_oscillator.py diff --git a/notebooks/cuIndicator/viz/commodity_channel_index.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/commodity_channel_index.py similarity index 100% rename from notebooks/cuIndicator/viz/commodity_channel_index.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/commodity_channel_index.py diff --git a/notebooks/cuIndicator/viz/coppock_curve.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/coppock_curve.py similarity index 100% rename from 
notebooks/cuIndicator/viz/coppock_curve.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/coppock_curve.py diff --git a/notebooks/cuIndicator/viz/donchian_channel.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/donchian_channel.py similarity index 100% rename from notebooks/cuIndicator/viz/donchian_channel.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/donchian_channel.py diff --git a/notebooks/cuIndicator/viz/ease_of_movement.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/ease_of_movement.py similarity index 100% rename from notebooks/cuIndicator/viz/ease_of_movement.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/ease_of_movement.py diff --git a/notebooks/cuIndicator/viz/ewa.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/ewa.py similarity index 100% rename from notebooks/cuIndicator/viz/ewa.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/ewa.py diff --git a/notebooks/cuIndicator/viz/force_index.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/force_index.py similarity index 100% rename from notebooks/cuIndicator/viz/force_index.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/force_index.py diff --git a/notebooks/cuIndicator/viz/keltner_channel.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/keltner_channel.py similarity index 100% rename from notebooks/cuIndicator/viz/keltner_channel.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/keltner_channel.py diff --git a/notebooks/cuIndicator/viz/kst_oscillator.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/kst_oscillator.py similarity index 100% rename from notebooks/cuIndicator/viz/kst_oscillator.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/kst_oscillator.py diff --git a/notebooks/cuIndicator/viz/ma.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/ma.py similarity index 100% rename from 
notebooks/cuIndicator/viz/ma.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/ma.py diff --git a/notebooks/cuIndicator/viz/macd.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/macd.py similarity index 100% rename from notebooks/cuIndicator/viz/macd.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/macd.py diff --git a/notebooks/cuIndicator/viz/mass_index.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/mass_index.py similarity index 100% rename from notebooks/cuIndicator/viz/mass_index.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/mass_index.py diff --git a/notebooks/cuIndicator/viz/momentum.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/momentum.py similarity index 100% rename from notebooks/cuIndicator/viz/momentum.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/momentum.py diff --git a/notebooks/cuIndicator/viz/money_flow_index.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/money_flow_index.py similarity index 100% rename from notebooks/cuIndicator/viz/money_flow_index.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/money_flow_index.py diff --git a/notebooks/cuIndicator/viz/on_balance_volume.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/on_balance_volume.py similarity index 100% rename from notebooks/cuIndicator/viz/on_balance_volume.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/on_balance_volume.py diff --git a/notebooks/cuIndicator/viz/parabolic_sar.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/parabolic_sar.py similarity index 100% rename from notebooks/cuIndicator/viz/parabolic_sar.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/parabolic_sar.py diff --git a/notebooks/cuIndicator/viz/rate_of_change.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/rate_of_change.py similarity index 100% rename from 
notebooks/cuIndicator/viz/rate_of_change.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/rate_of_change.py diff --git a/notebooks/cuIndicator/viz/rsi.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/rsi.py similarity index 100% rename from notebooks/cuIndicator/viz/rsi.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/rsi.py diff --git a/notebooks/cuIndicator/viz/stochastic_oscillator_d.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/stochastic_oscillator_d.py similarity index 100% rename from notebooks/cuIndicator/viz/stochastic_oscillator_d.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/stochastic_oscillator_d.py diff --git a/notebooks/cuIndicator/viz/stochastic_oscillator_k.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/stochastic_oscillator_k.py similarity index 100% rename from notebooks/cuIndicator/viz/stochastic_oscillator_k.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/stochastic_oscillator_k.py diff --git a/notebooks/cuIndicator/viz/trix.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/trix.py similarity index 100% rename from notebooks/cuIndicator/viz/trix.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/trix.py diff --git a/notebooks/cuIndicator/viz/true_strength_index.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/true_strength_index.py similarity index 100% rename from notebooks/cuIndicator/viz/true_strength_index.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/true_strength_index.py diff --git a/notebooks/cuIndicator/viz/ultimate_oscillator.py b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/ultimate_oscillator.py similarity index 100% rename from notebooks/cuIndicator/viz/ultimate_oscillator.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/ultimate_oscillator.py diff --git a/notebooks/cuIndicator/viz/vortex_indicator.py 
b/gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/vortex_indicator.py similarity index 100% rename from notebooks/cuIndicator/viz/vortex_indicator.py rename to gQuant/plugins/rapids_plugin/notebooks/cuIndicator/viz/vortex_indicator.py diff --git a/notebooks/custom_port_nodes.py b/gQuant/plugins/rapids_plugin/notebooks/custom_port_nodes.py similarity index 99% rename from notebooks/custom_port_nodes.py rename to gQuant/plugins/rapids_plugin/notebooks/custom_port_nodes.py index b6f83b82..615f7afc 100644 --- a/notebooks/custom_port_nodes.py +++ b/gQuant/plugins/rapids_plugin/notebooks/custom_port_nodes.py @@ -5,7 +5,6 @@ import cudf import dask_cudf import dask -import rmm from gquant.dataframe_flow import Node, MetaData from gquant.dataframe_flow import NodePorts, PortsSpecSchema from gquant.dataframe_flow import ConfSchema @@ -236,8 +235,7 @@ def process(self, inputs): number_of_threads = 16 number_of_blocks = ((len(df) - 1) // number_of_threads) + 1 # Inits device array by setting 0 for each index. 
- # df['distance_numba'] = 0.0 - darr = rmm.device_array(len(df)) + darr = cuda.device_array(len(df)) distance_kernel[(number_of_blocks,), (number_of_threads,)]( df['x'], df['y'], diff --git a/notebooks/images/add_composite_node.gif b/gQuant/plugins/rapids_plugin/notebooks/images/add_composite_node.gif similarity index 100% rename from notebooks/images/add_composite_node.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/add_composite_node.gif diff --git a/notebooks/images/average.gif b/gQuant/plugins/rapids_plugin/notebooks/images/average.gif similarity index 100% rename from notebooks/images/average.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/average.gif diff --git a/notebooks/images/clean.gif b/gQuant/plugins/rapids_plugin/notebooks/images/clean.gif similarity index 100% rename from notebooks/images/clean.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/clean.gif diff --git a/notebooks/images/clean_up_feature.gif b/gQuant/plugins/rapids_plugin/notebooks/images/clean_up_feature.gif similarity index 100% rename from notebooks/images/clean_up_feature.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/clean_up_feature.gif diff --git a/notebooks/images/csv_out.gif b/gQuant/plugins/rapids_plugin/notebooks/images/csv_out.gif similarity index 100% rename from notebooks/images/csv_out.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/csv_out.gif diff --git a/notebooks/images/explore_data.gif b/gQuant/plugins/rapids_plugin/notebooks/images/explore_data.gif similarity index 100% rename from notebooks/images/explore_data.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/explore_data.gif diff --git a/notebooks/images/get_return_feature.gif b/gQuant/plugins/rapids_plugin/notebooks/images/get_return_feature.gif similarity index 100% rename from notebooks/images/get_return_feature.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/get_return_feature.gif diff --git a/notebooks/images/loader_csv.gif 
b/gQuant/plugins/rapids_plugin/notebooks/images/loader_csv.gif similarity index 100% rename from notebooks/images/loader_csv.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/loader_csv.gif diff --git a/notebooks/images/portfolio/add_indicator.gif b/gQuant/plugins/rapids_plugin/notebooks/images/portfolio/add_indicator.gif similarity index 100% rename from notebooks/images/portfolio/add_indicator.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/portfolio/add_indicator.gif diff --git a/notebooks/images/portfolio/add_volume_min_max_return.gif b/gQuant/plugins/rapids_plugin/notebooks/images/portfolio/add_volume_min_max_return.gif similarity index 100% rename from notebooks/images/portfolio/add_volume_min_max_return.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/portfolio/add_volume_min_max_return.gif diff --git a/notebooks/images/portfolio/backtest.gif b/gQuant/plugins/rapids_plugin/notebooks/images/portfolio/backtest.gif similarity index 100% rename from notebooks/images/portfolio/backtest.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/portfolio/backtest.gif diff --git a/notebooks/images/portfolio/change_parameters.gif b/gQuant/plugins/rapids_plugin/notebooks/images/portfolio/change_parameters.gif similarity index 100% rename from notebooks/images/portfolio/change_parameters.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/portfolio/change_parameters.gif diff --git a/notebooks/images/portfolio/clean_up_for_backtest.gif b/gQuant/plugins/rapids_plugin/notebooks/images/portfolio/clean_up_for_backtest.gif similarity index 100% rename from notebooks/images/portfolio/clean_up_for_backtest.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/portfolio/clean_up_for_backtest.gif diff --git a/notebooks/images/portfolio/create_composite.gif b/gQuant/plugins/rapids_plugin/notebooks/images/portfolio/create_composite.gif similarity index 100% rename from notebooks/images/portfolio/create_composite.gif rename to 
gQuant/plugins/rapids_plugin/notebooks/images/portfolio/create_composite.gif diff --git a/notebooks/images/portfolio/filter_value.gif b/gQuant/plugins/rapids_plugin/notebooks/images/portfolio/filter_value.gif similarity index 100% rename from notebooks/images/portfolio/filter_value.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/portfolio/filter_value.gif diff --git a/notebooks/images/portfolio/run_dask_trade.gif b/gQuant/plugins/rapids_plugin/notebooks/images/portfolio/run_dask_trade.gif similarity index 100% rename from notebooks/images/portfolio/run_dask_trade.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/portfolio/run_dask_trade.gif diff --git a/notebooks/images/portfolio/run_pandas.gif b/gQuant/plugins/rapids_plugin/notebooks/images/portfolio/run_pandas.gif similarity index 100% rename from notebooks/images/portfolio/run_pandas.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/portfolio/run_pandas.gif diff --git a/notebooks/images/xgboost/categorical_variable.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/categorical_variable.gif similarity index 100% rename from notebooks/images/xgboost/categorical_variable.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/categorical_variable.gif diff --git a/notebooks/images/xgboost/create_composite_node.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/create_composite_node.gif similarity index 100% rename from notebooks/images/xgboost/create_composite_node.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/create_composite_node.gif diff --git a/notebooks/images/xgboost/create_node.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/create_node.gif similarity index 100% rename from notebooks/images/xgboost/create_node.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/create_node.gif diff --git a/notebooks/images/xgboost/custom_node.gif 
b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/custom_node.gif similarity index 100% rename from notebooks/images/xgboost/custom_node.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/custom_node.gif diff --git a/notebooks/images/xgboost/dask_and_sub_graph.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/dask_and_sub_graph.gif similarity index 100% rename from notebooks/images/xgboost/dask_and_sub_graph.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/dask_and_sub_graph.gif diff --git a/notebooks/images/xgboost/distributed_inference.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/distributed_inference.gif similarity index 100% rename from notebooks/images/xgboost/distributed_inference.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/distributed_inference.gif diff --git a/notebooks/images/xgboost/forest_inference.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/forest_inference.gif similarity index 100% rename from notebooks/images/xgboost/forest_inference.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/forest_inference.gif diff --git a/notebooks/images/xgboost/normalize.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/normalize.gif similarity index 100% rename from notebooks/images/xgboost/normalize.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/normalize.gif diff --git a/notebooks/images/xgboost/prepare_stock_data.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/prepare_stock_data.gif similarity index 100% rename from notebooks/images/xgboost/prepare_stock_data.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/prepare_stock_data.gif diff --git a/notebooks/images/xgboost/split_the_dataset.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/split_the_dataset.gif similarity index 100% rename from notebooks/images/xgboost/split_the_dataset.gif rename to 
gQuant/plugins/rapids_plugin/notebooks/images/xgboost/split_the_dataset.gif diff --git a/notebooks/images/xgboost/train_and_infer.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/train_and_infer.gif similarity index 100% rename from notebooks/images/xgboost/train_and_infer.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/train_and_infer.gif diff --git a/notebooks/images/xgboost/visualize_data.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/visualize_data.gif similarity index 100% rename from notebooks/images/xgboost/visualize_data.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/visualize_data.gif diff --git a/notebooks/images/xgboost/xgboost_metrics.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/xgboost_metrics.gif similarity index 100% rename from notebooks/images/xgboost/xgboost_metrics.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/xgboost_metrics.gif diff --git a/notebooks/images/xgboost/xgboost_stock_data.gif b/gQuant/plugins/rapids_plugin/notebooks/images/xgboost/xgboost_stock_data.gif similarity index 100% rename from notebooks/images/xgboost/xgboost_stock_data.gif rename to gQuant/plugins/rapids_plugin/notebooks/images/xgboost/xgboost_stock_data.gif diff --git a/notebooks/mortgage_e2e_gquant/mortgage_common.py b/gQuant/plugins/rapids_plugin/notebooks/mortgage_e2e_gquant/mortgage_common.py similarity index 100% rename from notebooks/mortgage_e2e_gquant/mortgage_common.py rename to gQuant/plugins/rapids_plugin/notebooks/mortgage_e2e_gquant/mortgage_common.py diff --git a/notebooks/mortgage_e2e_gquant/mortgage_e2e_gquant.ipynb b/gQuant/plugins/rapids_plugin/notebooks/mortgage_e2e_gquant/mortgage_e2e_gquant.ipynb similarity index 100% rename from notebooks/mortgage_e2e_gquant/mortgage_e2e_gquant.ipynb rename to gQuant/plugins/rapids_plugin/notebooks/mortgage_e2e_gquant/mortgage_e2e_gquant.ipynb diff --git a/notebooks/mortgage_e2e_gquant/mortgage_gquant_plugins.py 
b/gQuant/plugins/rapids_plugin/notebooks/mortgage_e2e_gquant/mortgage_gquant_plugins.py similarity index 100% rename from notebooks/mortgage_e2e_gquant/mortgage_gquant_plugins.py rename to gQuant/plugins/rapids_plugin/notebooks/mortgage_e2e_gquant/mortgage_gquant_plugins.py diff --git a/notebooks/mortgage_e2e_gquant/mortgage_run_workflow_daskdistrib.py b/gQuant/plugins/rapids_plugin/notebooks/mortgage_e2e_gquant/mortgage_run_workflow_daskdistrib.py similarity index 100% rename from notebooks/mortgage_e2e_gquant/mortgage_run_workflow_daskdistrib.py rename to gQuant/plugins/rapids_plugin/notebooks/mortgage_e2e_gquant/mortgage_run_workflow_daskdistrib.py diff --git a/notebooks/mortgage_e2e_gquant/mortgage_run_workflow_local.py b/gQuant/plugins/rapids_plugin/notebooks/mortgage_e2e_gquant/mortgage_run_workflow_local.py similarity index 100% rename from notebooks/mortgage_e2e_gquant/mortgage_run_workflow_local.py rename to gQuant/plugins/rapids_plugin/notebooks/mortgage_e2e_gquant/mortgage_run_workflow_local.py diff --git a/notebooks/plotutils.py b/gQuant/plugins/rapids_plugin/notebooks/plotutils.py similarity index 100% rename from notebooks/plotutils.py rename to gQuant/plugins/rapids_plugin/notebooks/plotutils.py diff --git a/gQuant/plugins/rapids_plugin/setup.py b/gQuant/plugins/rapids_plugin/setup.py new file mode 100644 index 00000000..d8eba41b --- /dev/null +++ b/gQuant/plugins/rapids_plugin/setup.py @@ -0,0 +1,28 @@ +from setuptools import setup, find_packages + +setup( + name='gquant_rapids_plugin', + install_requires=[ + "bqplot", "tables", "ray[tune]" + ], + packages=find_packages(include=['gquant_rapids_plugin', + 'gquant_rapids_plugin.analysis', + 'gquant_rapids_plugin.backtest', + 'gquant_rapids_plugin.dataloader', + 'gquant_rapids_plugin.ml', + 'gquant_rapids_plugin.portofolio', + 'gquant_rapids_plugin.strategy', + 'gquant_rapids_plugin.cuindicator', + 'gquant_rapids_plugin.transform']), + entry_points={ + 'gquant.plugin': + ['gquant_rapids_plugin = 
gquant_rapids_plugin', + 'gquant_rapids_plugin.analysis = gquant_rapids_plugin.analysis', + 'gquant_rapids_plugin.backtest = gquant_rapids_plugin.backtest', + 'gquant_rapids_plugin.dataloader = gquant_rapids_plugin.dataloader', + 'gquant_rapids_plugin.ml = gquant_rapids_plugin.ml', + 'gquant_rapids_plugin.portofolio = gquant_rapids_plugin.portofolio', + 'gquant_rapids_plugin.strategy = gquant_rapids_plugin.strategy', + 'gquant_rapids_plugin.transform = gquant_rapids_plugin.transform'], + } +) diff --git a/gQuant/plugins/rapids_plugin/taskgraphs/dask_tutorial.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/dask_tutorial.gq.yaml new file mode 100644 index 00000000..298fea36 --- /dev/null +++ b/gQuant/plugins/rapids_plugin/taskgraphs/dask_tutorial.gq.yaml @@ -0,0 +1,35 @@ +- conf: + file: notebooks/data/stock_price_hist.csv.gz + path: notebooks/many-small + id: stock_data + inputs: {} + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: + keys: + - asset + - datetime + id: sort_node + inputs: + in: stock_data.dask_cudf_out + module: gquant_rapids_plugin.transform + type: SortNode +- conf: {} + id: '' + inputs: + in1: output_csv.df_out + type: Output_Collector +- conf: + column: volume + id: average_volume + inputs: + stock_in: sort_node.out + module: gquant_rapids_plugin.transform + type: AverageNode +- conf: + path: notebooks/dask_average_volume.csv + id: output_csv + inputs: + df_in: average_volume.stock_out + module: gquant_rapids_plugin.analysis + type: OutCsvNode diff --git a/taskgraphs/full_example.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/full_example.gq.yaml similarity index 81% rename from taskgraphs/full_example.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/full_example.gq.yaml index 8a119d24..8dba32e6 100644 --- a/taskgraphs/full_example.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/full_example.gq.yaml @@ -1,65 +1,65 @@ -- id: points_task - type: PointNode - conf: - npts: 80 +- conf: npartitions: 4 + npts: 
80 + id: points_task inputs: {} -- id: numba - type: NumbaDistanceNode - conf: {} + type: PointNode +- conf: {} + id: numba inputs: points_df_in: points_task.points_df_out -- id: cupy - type: CupyDistanceNode - conf: {} + type: NumbaDistanceNode +- conf: {} + id: cupy inputs: points_df_in: points_task.points_df_out -- id: verify - type: VerifyNode - conf: + type: CupyDistanceNode +- conf: df1_col: distance_numba df2_col: distance_cupy + id: verify inputs: df1: numba.distance_df df2: cupy.distance_df -- id: cudf - type: DistanceNode - conf: {} + type: VerifyNode +- conf: {} + id: cudf inputs: points_df_in: points_task.points_df_out -- id: verify2 - type: VerifyNode - conf: + type: DistanceNode +- conf: df1_col: distance_cupy df2_col: distance_cudf + id: verify2 inputs: df1: cupy.distance_df df2: cudf.distance_df -- id: dask_cudf - type: DistanceNode - conf: {} + type: VerifyNode +- conf: {} + id: dask_cudf inputs: points_df_in: points_task.points_ddf_out module: custom_port_nodes -- id: verify_dask - type: VerifyNode - conf: + type: DistanceNode +- conf: df1_col: distance_cudf df2_col: distance_numba + id: verify_dask inputs: - df2: dask_numba.distance_df df1: dask_cudf.distance_df + df2: dask_numba.distance_df module: custom_port_nodes -- id: distributed_data - type: DistributedNode - conf: + type: VerifyNode +- conf: npartitions: 4 + id: distributed_data inputs: points_df_in: points_task.points_df_out module: custom_port_nodes -- id: dask_numba - type: NumbaDistanceNode - conf: {} + type: DistributedNode +- conf: {} + id: dask_numba inputs: points_df_in: distributed_data.points_ddf_out module: custom_port_nodes + type: NumbaDistanceNode diff --git a/taskgraphs/get_return_feature.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/get_return_feature.gq.yaml similarity index 52% rename from taskgraphs/get_return_feature.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/get_return_feature.gq.yaml index 3a0c3db8..985a4d0d 100644 --- 
a/taskgraphs/get_return_feature.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/get_return_feature.gq.yaml @@ -1,36 +1,36 @@ -- id: stock_data - type: CsvStockLoader - conf: +- conf: file: notebooks/data/stock_price_hist.csv.gz + id: stock_data inputs: {} - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: {} + id: '' inputs: in1: stock_data.cudf_out in2: add_return_feature.stock_out module: rapids_modules -- id: volume_filter - type: ValueFilterNode - conf: - - column: volume - min: 50 + type: Output_Collector +- conf: + - column: volume + min: 50 + id: volume_filter inputs: in: stock_data.cudf_out - module: rapids_modules -- id: sort_node - type: SortNode - conf: + module: gquant_rapids_plugin.transform + type: ValueFilterNode +- conf: keys: - - asset - - datetime + - asset + - datetime + id: sort_node inputs: in: volume_filter.out - module: rapids_modules -- id: add_return_feature - type: ReturnFeatureNode - conf: {} + module: gquant_rapids_plugin.transform + type: SortNode +- conf: {} + id: add_return_feature inputs: stock_in: sort_node.out - module: rapids_modules + module: gquant_rapids_plugin.transform + type: ReturnFeatureNode diff --git a/taskgraphs/portfolio_trade.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/portfolio_trade.gq.yaml similarity index 55% rename from taskgraphs/portfolio_trade.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/portfolio_trade.gq.yaml index 541a9597..1414d4ed 100644 --- a/taskgraphs/portfolio_trade.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/portfolio_trade.gq.yaml @@ -1,78 +1,78 @@ -- id: stock_data - type: CsvStockLoader - conf: +- conf: file: notebooks/data/stock_price_hist.csv.gz path: notebooks/many-small + id: stock_data inputs: {} - module: rapids_modules -- id: preprocess - type: CompositeNode - conf: + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: + input: + - sort_node.in 
+ output: + - drop_columns.out subnode_ids: - - value_filter + - value_filter subnodes_conf: value_filter: conf: - - column: min_return - min: -10 - - column: max_return - max: 10 - - column: average_volume - min: 400 + - column: min_return + min: -10 + - column: max_return + max: 10 + - column: average_volume + min: 400 taskgraph: taskgraphs/preprocess.gq.yaml - input: - - sort_node.in - output: - - drop_columns.out + id: preprocess inputs: sort_node@in: stock_data.cudf_out -- id: sort_after - type: SortNode - conf: + type: CompositeNode +- conf: keys: - - asset - - datetime + - asset + - datetime + id: sort_after inputs: in: preprocess.drop_columns@out - module: rapids_modules -- id: exp_mean_reversion - type: PortExpMovingAverageStrategyNode - conf: + module: gquant_rapids_plugin.transform + type: SortNode +- conf: fast: 5 slow: 20 + id: exp_mean_reversion inputs: stock_in: sort_after.out - module: rapids_modules -- id: backtest - type: SimpleBackTestNode - conf: {} + module: gquant_rapids_plugin.strategy + type: PortExpMovingAverageStrategyNode +- conf: {} + id: backtest inputs: bardata_in: exp_mean_reversion.stock_out - module: rapids_modules -- id: portfolio_opt - type: SimpleAveragePortOpt - conf: {} + module: gquant_rapids_plugin.backtest + type: SimpleBackTestNode +- conf: {} + id: portfolio_opt inputs: stock_in: backtest.backtest_out - module: rapids_modules -- id: sharpe_ratio - type: SharpeRatioNode - conf: {} + module: gquant_rapids_plugin.portofolio + type: SimpleAveragePortOpt +- conf: {} + id: sharpe_ratio inputs: stock_in: portfolio_opt.stock_out - module: rapids_modules -- id: cumulative_return - type: CumReturnNode - conf: - points: 300 + module: gquant_rapids_plugin.analysis + type: SharpeRatioNode +- conf: label: cumulative return + points: 300 + id: cumulative_return inputs: in: portfolio_opt.stock_out - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.analysis + type: CumReturnNode +- conf: 
{} + id: '' inputs: in1: cumulative_return.cum_return in2: sharpe_ratio.sharpe_out module: rapids_modules + type: Output_Collector diff --git a/taskgraphs/preprocess.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/preprocess.gq.yaml similarity index 51% rename from taskgraphs/preprocess.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/preprocess.gq.yaml index 9da3dff2..dda07c86 100644 --- a/taskgraphs/preprocess.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/preprocess.gq.yaml @@ -1,128 +1,128 @@ -- id: stock_data - type: CsvStockLoader - conf: +- conf: file: notebooks/data/stock_price_hist.csv.gz path: notebooks/many-small/ + id: stock_data inputs: {} - module: rapids_modules -- id: sort_node - type: SortNode - conf: + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: keys: - - asset - - datetime + - asset + - datetime + id: sort_node inputs: in: stock_data.cudf_out - module: rapids_modules -- id: add_return_feature - type: ReturnFeatureNode - conf: {} + module: gquant_rapids_plugin.transform + type: SortNode +- conf: {} + id: add_return_feature inputs: stock_in: sort_node.out - module: rapids_modules -- id: find_stock_row - type: AssetIndicatorNode - conf: {} + module: gquant_rapids_plugin.transform + type: ReturnFeatureNode +- conf: {} + id: find_stock_row inputs: stock_in: add_return_feature.stock_out - module: rapids_modules -- id: average_volume - type: AverageNode - conf: + module: gquant_rapids_plugin.transform + type: AssetIndicatorNode +- conf: column: volume + id: average_volume inputs: stock_in: find_stock_row.stock_out - module: rapids_modules -- id: rename_average_volume - type: RenameNode - conf: - old: volume + module: gquant_rapids_plugin.transform + type: AverageNode +- conf: new: average_volume + old: volume + id: rename_average_volume inputs: in: average_volume.stock_out - module: rapids_modules -- id: merge_average_volume - type: LeftMergeNode - conf: + module: gquant_rapids_plugin.transform + type: 
RenameNode +- conf: column: asset + id: merge_average_volume inputs: left: find_stock_row.stock_out right: rename_average_volume.out - module: rapids_modules -- id: min_return - type: MinNode - conf: + module: gquant_rapids_plugin.transform + type: LeftMergeNode +- conf: column: returns + id: min_return inputs: in: find_stock_row.stock_out - module: rapids_modules -- id: rename_min_return - type: RenameNode - conf: - old: returns + module: gquant_rapids_plugin.transform + type: MinNode +- conf: new: min_return + old: returns + id: rename_min_return inputs: in: min_return.out - module: rapids_modules -- id: merge_min_return - type: LeftMergeNode - conf: + module: gquant_rapids_plugin.transform + type: RenameNode +- conf: column: asset + id: merge_min_return inputs: - right: rename_min_return.out left: merge_average_volume.merged - module: rapids_modules -- id: max_return - type: MaxNode - conf: + right: rename_min_return.out + module: gquant_rapids_plugin.transform + type: LeftMergeNode +- conf: column: returns + id: max_return inputs: in: find_stock_row.stock_out - module: rapids_modules -- id: rename_max_return - type: RenameNode - conf: - old: returns + module: gquant_rapids_plugin.transform + type: MaxNode +- conf: new: max_return + old: returns + id: rename_max_return inputs: in: max_return.out - module: rapids_modules -- id: merge_max_return - type: LeftMergeNode - conf: + module: gquant_rapids_plugin.transform + type: RenameNode +- conf: column: asset + id: merge_max_return inputs: - right: rename_max_return.out left: merge_min_return.merged - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + right: rename_max_return.out + module: gquant_rapids_plugin.transform + type: LeftMergeNode +- conf: {} + id: '' inputs: in1: drop_columns.out -- id: value_filter - type: ValueFilterNode - conf: - - column: average_volume - min: 10 - - column: min_return - min: -10 - - column: max_return - max: 10 + type: Output_Collector +- conf: + - column: 
average_volume + min: 10 + - column: min_return + min: -10 + - column: max_return + max: 10 + id: value_filter inputs: in: merge_max_return.merged - module: rapids_modules -- id: drop_columns - type: DropNode - conf: + module: gquant_rapids_plugin.transform + type: ValueFilterNode +- conf: columns: - - average_volume - - min_return - - max_return - - open - - high - - low - - volume + - average_volume + - min_return + - max_return + - open + - high + - low + - volume + id: drop_columns inputs: in: value_filter.out - module: rapids_modules + module: gquant_rapids_plugin.transform + type: DropNode diff --git a/taskgraphs/simple_trade.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/simple_trade.gq.yaml similarity index 57% rename from taskgraphs/simple_trade.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/simple_trade.gq.yaml index 7775a632..e08d96d5 100644 --- a/taskgraphs/simple_trade.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/simple_trade.gq.yaml @@ -1,26 +1,25 @@ -- id: stock_data - type: CsvStockLoader - conf: +- conf: file: notebooks/data/stock_price_hist.csv.gz + id: stock_data inputs: {} - module: rapids_modules -- id: stock_name - type: StockNameLoader - conf: + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: file: notebooks/data/security_master.csv.gz + id: stock_name inputs: {} - module: rapids_modules -- id: stock_selector - type: AssetFilterNode - conf: + module: gquant_rapids_plugin.dataloader + type: StockNameLoader +- conf: asset: 4330 + id: stock_selector inputs: name_map: stock_name.map_data stock_in: stock_data.cudf_out - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.transform + type: AssetFilterNode +- conf: {} + id: '' inputs: in1: stock_selector.stock_name in2: lineplot.lineplot @@ -29,69 +28,70 @@ in5: cumulative_return.cum_return in6: stock_data.cudf_out module: rapids_modules -- id: sort_node - type: SortNode - conf: + type: Output_Collector +- 
conf: keys: - - asset - - datetime + - asset + - datetime + id: sort_node inputs: in: stock_selector.stock_out - module: rapids_modules -- id: add_return - type: ReturnFeatureNode - conf: {} + module: gquant_rapids_plugin.transform + type: SortNode +- conf: {} + id: add_return inputs: stock_in: sort_node.out - module: rapids_modules -- id: mean_reversion - type: MovingAverageStrategyNode - conf: + module: gquant_rapids_plugin.transform + type: ReturnFeatureNode +- conf: fast: 5 slow: 10 + id: mean_reversion inputs: stock_in: add_return.stock_out - module: rapids_modules -- id: backtest - type: SimpleBackTestNode - conf: {} + module: gquant_rapids_plugin.strategy + type: MovingAverageStrategyNode +- conf: {} + id: backtest inputs: bardata_in: mean_reversion.stock_out - module: rapids_modules -- id: barplot - type: BarPlotNode - conf: - points: 300 + module: gquant_rapids_plugin.backtest + type: SimpleBackTestNode +- conf: label: barplot + points: 300 + id: barplot inputs: stock_in: backtest.backtest_out - module: rapids_modules -- id: lineplot - type: LinePlotNode - conf: + module: gquant_rapids_plugin.analysis + type: BarPlotNode +- conf: lines: - - column: ma_slow - label: slow - color: blue - - column: ma_fast - label: fast - color: green + - color: blue + column: ma_slow + label: slow + - color: green + column: ma_fast + label: fast points: 300 title: signals + id: lineplot inputs: in: backtest.backtest_out - module: rapids_modules -- id: sharpe_ratio - type: SharpeRatioNode - conf: {} + module: gquant_rapids_plugin.analysis + type: LinePlotNode +- conf: {} + id: sharpe_ratio inputs: stock_in: backtest.backtest_out - module: rapids_modules -- id: cumulative_return - type: CumReturnNode - conf: - points: 300 + module: gquant_rapids_plugin.analysis + type: SharpeRatioNode +- conf: label: cumulative_return + points: 300 + id: cumulative_return inputs: in: backtest.backtest_out - module: rapids_modules + module: gquant_rapids_plugin.analysis + type: CumReturnNode 
diff --git a/taskgraphs/sort_stocks.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/sort_stocks.gq.yaml similarity index 57% rename from taskgraphs/sort_stocks.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/sort_stocks.gq.yaml index 323b3144..be104e32 100644 --- a/taskgraphs/sort_stocks.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/sort_stocks.gq.yaml @@ -1,21 +1,21 @@ -- id: stock_data - type: CsvStockLoader - conf: +- conf: file: notebooks/data/stock_price_hist.csv.gz path: notebooks/many-small + id: stock_data inputs: {} - module: rapids_modules -- id: sort_node - type: SortNode - conf: + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: keys: - - asset - - datetime + - asset + - datetime + id: sort_node inputs: in: stock_data.cudf_out - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.transform + type: SortNode +- conf: {} + id: '' inputs: in1: sort_node.out + type: Output_Collector diff --git a/taskgraphs/streamz/gpu_double.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/streamz/gpu_double.gq.yaml similarity index 76% rename from taskgraphs/streamz/gpu_double.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/streamz/gpu_double.gq.yaml index 7d1ff015..b339edfd 100644 --- a/taskgraphs/streamz/gpu_double.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/streamz/gpu_double.gq.yaml @@ -1,55 +1,55 @@ -- id: source - type: StreamNode - conf: {} +- conf: {} + id: source inputs: {} module: streamz -- id: "" - type: Output_Collector - conf: {} + type: StreamNode +- conf: {} + id: '' inputs: in1: source.stream_out in2: plot.fig_out in3: print.stream_out -- id: slide_window - type: SlideWindowNode - conf: + type: Output_Collector +- conf: window: 50 + id: slide_window inputs: stream_in: source.stream_out module: streamz -- id: plot - type: PlotSinkNode - conf: {} + type: SlideWindowNode +- conf: {} + id: plot inputs: stream_in: to_stream.stream_out module: streamz 
-- id: convert - type: TupleToCudf - conf: {} + type: PlotSinkNode +- conf: {} + id: convert inputs: stream_in: slide_window.stream_out module: streamz -- id: to_dataframe - type: ToDataFrame - conf: {} + type: TupleToCudf +- conf: {} + id: to_dataframe inputs: stream_in: convert.stream_out module: streamz -- id: to_stream - type: ToStream - conf: {} + type: ToDataFrame +- conf: {} + id: to_stream inputs: df_in: double_in_gpu.df_out module: streamz -- id: double_in_gpu - type: GPUDouble - conf: {} + type: ToStream +- conf: {} + id: double_in_gpu inputs: df_in: to_dataframe.df_out module: streamz -- id: print - type: SinkNode - conf: {} + type: GPUDouble +- conf: {} + id: print inputs: stream_in: to_stream.stream_out module: streamz + type: SinkNode diff --git a/taskgraphs/streamz/gpu_double_two_branches.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/streamz/gpu_double_two_branches.gq.yaml similarity index 76% rename from taskgraphs/streamz/gpu_double_two_branches.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/streamz/gpu_double_two_branches.gq.yaml index fc78c645..94e0c10c 100644 --- a/taskgraphs/streamz/gpu_double_two_branches.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/streamz/gpu_double_two_branches.gq.yaml @@ -1,74 +1,74 @@ -- id: source - type: StreamNode - conf: {} +- conf: {} + id: source inputs: {} module: streamz -- id: "" - type: Output_Collector - conf: {} + type: StreamNode +- conf: {} + id: '' inputs: in1: source.stream_out in2: plot.fig_out in3: print.stream_out -- id: slide_window - type: SlideWindowNode - conf: + type: Output_Collector +- conf: window: 50 + id: slide_window inputs: stream_in: source.stream_out module: streamz -- id: plot - type: PlotSinkNode - conf: {} + type: SlideWindowNode +- conf: {} + id: plot inputs: stream_in: zip_streams.stream_out module: streamz -- id: convert - type: TupleToCudf - conf: {} + type: PlotSinkNode +- conf: {} + id: convert inputs: stream_in: slide_window.stream_out module: streamz -- 
id: to_dataframe - type: ToDataFrame - conf: {} + type: TupleToCudf +- conf: {} + id: to_dataframe inputs: stream_in: convert.stream_out module: streamz -- id: to_stream - type: ToStream - conf: {} + type: ToDataFrame +- conf: {} + id: to_stream inputs: df_in: double_in_gpu.df_out module: streamz -- id: double_in_gpu - type: GPUDouble - conf: {} + type: ToStream +- conf: {} + id: double_in_gpu inputs: df_in: to_dataframe.df_out module: streamz -- id: print - type: SinkNode - conf: {} + type: GPUDouble +- conf: {} + id: print inputs: stream_in: zip_streams.stream_out module: streamz -- id: double_again - type: GPUDouble - conf: {} + type: SinkNode +- conf: {} + id: double_again inputs: df_in: double_in_gpu.df_out module: streamz -- id: to_stream2 - type: ToStream - conf: {} + type: GPUDouble +- conf: {} + id: to_stream2 inputs: df_in: double_again.df_out module: streamz -- id: zip_streams - type: ZipNode - conf: {} + type: ToStream +- conf: {} + id: zip_streams inputs: stream1_in: to_stream.stream_out stream2_in: to_stream2.stream_out module: streamz + type: ZipNode diff --git a/taskgraphs/streamz/plot.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/streamz/plot.gq.yaml similarity index 77% rename from taskgraphs/streamz/plot.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/streamz/plot.gq.yaml index 9f02fa00..677a05ff 100644 --- a/taskgraphs/streamz/plot.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/streamz/plot.gq.yaml @@ -1,30 +1,30 @@ -- id: source - type: StreamNode - conf: {} +- conf: {} + id: source inputs: {} module: streamz -- id: double - type: TransformNode - conf: {} + type: StreamNode +- conf: {} + id: double inputs: stream_in: source.stream_out module: streamz -- id: "" - type: Output_Collector - conf: {} + type: TransformNode +- conf: {} + id: '' inputs: in1: source.stream_out in2: plot.fig_out -- id: slide_window - type: SlideWindowNode - conf: + type: Output_Collector +- conf: window: 50 + id: slide_window inputs: stream_in: 
double.stream_out module: streamz -- id: plot - type: PlotSinkNode - conf: {} + type: SlideWindowNode +- conf: {} + id: plot inputs: stream_in: slide_window.stream_out module: streamz + type: PlotSinkNode diff --git a/taskgraphs/streamz/two_branches.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/streamz/two_branches.gq.yaml similarity index 78% rename from taskgraphs/streamz/two_branches.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/streamz/two_branches.gq.yaml index 631ad752..7f768c1b 100644 --- a/taskgraphs/streamz/two_branches.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/streamz/two_branches.gq.yaml @@ -1,50 +1,50 @@ -- id: source - type: StreamNode - conf: {} +- conf: {} + id: source inputs: {} module: streamz -- id: double - type: TransformNode - conf: {} + type: StreamNode +- conf: {} + id: double inputs: stream_in: source.stream_out module: streamz -- id: "" - type: Output_Collector - conf: {} + type: TransformNode +- conf: {} + id: '' inputs: in1: source.stream_out in2: plot.fig_out -- id: slide_window - type: SlideWindowNode - conf: + type: Output_Collector +- conf: window: 50 + id: slide_window inputs: stream_in: double.stream_out module: streamz -- id: plot - type: PlotSinkNode - conf: {} + type: SlideWindowNode +- conf: {} + id: plot inputs: stream_in: zip.stream_out module: streamz -- id: double_again - type: TransformNode - conf: {} + type: PlotSinkNode +- conf: {} + id: double_again inputs: stream_in: double.stream_out module: streamz -- id: zip - type: ZipNode - conf: {} + type: TransformNode +- conf: {} + id: zip inputs: stream1_in: slide_window.stream_out stream2_in: slide_window2.stream_out module: streamz -- id: slide_window2 - type: SlideWindowNode - conf: + type: ZipNode +- conf: window: 50 + id: slide_window2 inputs: stream_in: double_again.stream_out module: streamz + type: SlideWindowNode diff --git a/taskgraphs/tutorial_intro.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/tutorial_intro.gq.yaml similarity index 
57% rename from taskgraphs/tutorial_intro.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/tutorial_intro.gq.yaml index 93a35988..44275ec2 100644 --- a/taskgraphs/tutorial_intro.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/tutorial_intro.gq.yaml @@ -1,80 +1,80 @@ -- id: stock_data - type: CsvStockLoader - conf: +- conf: file: notebooks/data/stock_price_hist.csv.gz + id: stock_data inputs: {} - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: {} + id: '' inputs: in1: csv_output2.df_out in2: csv_output1.df_out module: rapids_modules -- id: get_return - type: CompositeNode - conf: + type: Output_Collector +- conf: + input: + - volume_filter.in + output: + - add_return_feature.stock_out subnode_ids: - - volume_filter + - volume_filter subnodes_conf: volume_filter: conf: - - column: volume - min: 100 + - column: volume + min: 100 taskgraph: taskgraphs/get_return_feature.gq.yaml - input: - - volume_filter.in - output: - - add_return_feature.stock_out + id: get_return inputs: volume_filter@in: stock_data.cudf_out -- id: average_volume - type: AverageNode - conf: + type: CompositeNode +- conf: column: volume + id: average_volume inputs: stock_in: get_return.add_return_feature@stock_out - module: rapids_modules -- id: average_return + module: gquant_rapids_plugin.transform type: AverageNode - conf: +- conf: column: returns + id: average_return inputs: stock_in: get_return.add_return_feature@stock_out - module: rapids_modules -- id: stock_name - type: StockNameLoader - conf: + module: gquant_rapids_plugin.transform + type: AverageNode +- conf: file: notebooks/data/security_master.csv.gz + id: stock_name inputs: {} - module: rapids_modules -- id: left_merge1 - type: LeftMergeNode - conf: + module: gquant_rapids_plugin.dataloader + type: StockNameLoader +- conf: column: asset + id: left_merge1 inputs: left: average_volume.stock_out right: stock_name.stock_name - 
module: rapids_modules -- id: left_merge2 + module: gquant_rapids_plugin.transform type: LeftMergeNode - conf: +- conf: column: asset + id: left_merge2 inputs: left: average_return.stock_out right: stock_name.stock_name - module: rapids_modules -- id: csv_output1 - type: OutCsvNode - conf: - path: /home/quant/gQuant/notebooks/average_return.csv + module: gquant_rapids_plugin.transform + type: LeftMergeNode +- conf: + path: notebooks/average_return.csv + id: csv_output1 inputs: df_in: left_merge2.merged - module: rapids_modules -- id: csv_output2 + module: gquant_rapids_plugin.analysis type: OutCsvNode - conf: - path: /home/quant/gQuant/notebooks/average_volume.csv +- conf: + path: notebooks/average_volume.csv + id: csv_output2 inputs: df_in: left_merge1.merged - module: rapids_modules + module: gquant_rapids_plugin.analysis + type: OutCsvNode diff --git a/gQuant/plugins/rapids_plugin/taskgraphs/visualize_frac_diff.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/visualize_frac_diff.gq.yaml new file mode 100644 index 00000000..f131af34 --- /dev/null +++ b/gQuant/plugins/rapids_plugin/taskgraphs/visualize_frac_diff.gq.yaml @@ -0,0 +1,113 @@ +- conf: + file: notebooks/data/stock_price_hist.csv.gz + path: notebooks/many-small + id: stock_data + inputs: {} + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: + asset: 22123 + id: asset_filter + inputs: + stock_in: xgboost_graph.technical_indicator@stock_out + module: gquant_rapids_plugin.transform + type: AssetFilterNode +- conf: {} + id: '' + inputs: + in1: lineplot.lineplot + type: Output_Collector +- conf: + lines: + - color: blue + column: FR_DI_0.1 + label: d 0.1 + - color: orange + column: FR_DI_0.3 + label: d 0.3 + - color: green + column: FR_DI_0.5 + label: d 0.5 + - color: black + column: FR_DI_0.7 + label: d 0.7 + points: 300 + title: signals + id: lineplot + inputs: + in: asset_filter.stock_out + module: gquant_rapids_plugin.analysis + type: LinePlotNode +- conf: + input: + - 
preprocess.sort_node@in + output: + - technical_indicator.stock_out + subnode_ids: + - preprocess + - technical_indicator + subnodes_conf: + preprocess: + conf: + input: + - sort_node.in + output: + - drop_columns.out + subnode_ids: + - value_filter + - drop_columns + subnodes_conf: + drop_columns: + conf: + columns: + - average_volume + - min_return + - max_return + value_filter: + conf: + - column: average_volume + min: 400 + - column: min_return + min: -10 + - column: max_return + max: 10 + taskgraph: taskgraphs/preprocess.gq.yaml + technical_indicator: + conf: + indicators: + - args: + - 0.9 + columns: + - close + function: port_fractional_diff + - args: + - 0.7 + columns: + - close + function: port_fractional_diff + - args: + - 0.5 + columns: + - close + function: port_fractional_diff + - args: + - 0.3 + columns: + - close + function: port_fractional_diff + - args: + - 0.1 + columns: + - close + function: port_fractional_diff + - args: + - -1 + columns: + - returns + function: port_shift + remove_na: true + taskgraph: taskgraphs/xgboost_trade.gq.yaml + id: xgboost_graph + inputs: + preprocess@sort_node@in: stock_data.cudf_out + type: CompositeNode diff --git a/taskgraphs/xgboost_example/data_generator.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/data_generator.gq.yaml similarity index 54% rename from taskgraphs/xgboost_example/data_generator.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/data_generator.gq.yaml index cb483df3..32c937e7 100644 --- a/taskgraphs/xgboost_example/data_generator.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/data_generator.gq.yaml @@ -1,73 +1,73 @@ -- id: data_gen - type: ClassificationData - conf: - n_samples: 10000 +- conf: + class_sep: 1 + dtype: float64 + flip_y: 0.01 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.01 - 
class_sep: 1 - hypercube: true - shift: 0 + n_samples: 10000 + order: F scale: 1 + shift: 0 shuffle: true - order: F - dtype: float64 - n_parts: 4 + id: data_gen inputs: {} - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.dataloader + type: ClassificationData +- conf: {} + id: '' inputs: in1: drop_x2_x3.out -- id: x2_to_sign - type: AddSignIndicatorNode - conf: - sign: x2_sign + type: Output_Collector +- conf: column: x2 + sign: x2_sign + id: x2_to_sign inputs: in: data_gen.cudf_out - module: rapids_modules -- id: x3_to_sign + module: gquant_rapids_plugin.transform type: AddSignIndicatorNode - conf: - sign: x3_sign +- conf: column: x3 + sign: x3_sign + id: x3_to_sign inputs: in: x2_to_sign.out - module: rapids_modules -- id: drop_x2_x3 - type: DropNode - conf: + module: gquant_rapids_plugin.transform + type: AddSignIndicatorNode +- conf: columns: - - x2 - - x3 - - x3_sign - - x2_sign + - x2 + - x3 + - x3_sign + - x2_sign + id: drop_x2_x3 inputs: in: one_hot_encoding.out - module: rapids_modules -- id: one_hot_encoding - type: OneHotEncodingNode - conf: - - cats: - - 0 - - 1 - prefix_sep: _ - dtype: float64 - column: x3_sign - prefix: x3 - - cats: - - 0 - - 1 - prefix_sep: _ - dtype: float64 - column: x2_sign - prefix: x2 + module: gquant_rapids_plugin.transform + type: DropNode +- conf: + - cats: + - 0 + - 1 + column: x3_sign + dtype: float64 + prefix: x3 + prefix_sep: _ + - cats: + - 0 + - 1 + column: x2_sign + dtype: float64 + prefix: x2 + prefix_sep: _ + id: one_hot_encoding inputs: in: x3_to_sign.out - module: rapids_modules + module: gquant_rapids_plugin.transform + type: OneHotEncodingNode diff --git a/taskgraphs/xgboost_example/hyper_parameters_search.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/hyper_parameters_search.gq.yaml similarity index 53% rename from taskgraphs/xgboost_example/hyper_parameters_search.gq.yaml rename to 
gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/hyper_parameters_search.gq.yaml index 35810e55..9751b937 100644 --- a/taskgraphs/xgboost_example/hyper_parameters_search.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/hyper_parameters_search.gq.yaml @@ -1,211 +1,211 @@ -- id: data_gen - type: ClassificationData - conf: - n_samples: 10000 +- conf: + class_sep: 1 + dtype: float64 + flip_y: 0.01 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.01 - class_sep: 1 - hypercube: true - shift: 0 + n_samples: 10000 + order: F scale: 1 + shift: 0 shuffle: true - order: F - dtype: float64 - n_parts: 4 + id: data_gen inputs: {} - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.dataloader + type: ClassificationData +- conf: {} + id: '' inputs: in1: xgboost_model.train_roc@roc_curve in2: xgboost_model.test_roc@roc_curve in3: hpo.conf_out in4: hpo.train_roc@roc_curve in5: hpo.test_roc@roc_curve -- id: data_generator - type: CompositeNode - conf: + type: Output_Collector +- conf: input: - - x2_to_sign.in + - x2_to_sign.in output: - - drop_x2_x3.out + - drop_x2_x3.out subnode_ids: - - data_gen + - data_gen subnodes_conf: data_gen: conf: - n_samples: 10000 + class_sep: 1 + dtype: float64 + flip_y: 0.1 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.1 - class_sep: 1 - hypercube: true - shift: 0 - scale: 1 - shuffle: true + n_samples: 10000 order: F - dtype: float64 - n_parts: 4 random_state: 10 + scale: 1 + shift: 0 + shuffle: true taskgraph: taskgraphs/xgboost_example/data_generator.gq.yaml + id: data_generator inputs: x2_to_sign@in: data_gen.cudf_out -- id: data_splitter - type: DataSplittingNode - conf: - train_size: 0.8 + type: 
CompositeNode +- conf: target: y + train_size: 0.8 + id: data_splitter inputs: in: data_generator.drop_x2_x3@out - module: rapids_modules -- id: xgboost_model - type: ContextCompositeNode - conf: - input: - - train_norm.df_in - - test_norm.df_in - output: - - train_infer.out - - test_infer.out - - train_roc.roc_curve - - test_roc.roc_curve + module: gquant_rapids_plugin.ml + type: DataSplittingNode +- conf: context: - target: - type: string - value: y + depth: map: - - node_id: train_xgboost - xpath: train_xgboost.conf.target - - node_id: train_roc - xpath: train_roc.conf.label - - node_id: test_roc - xpath: test_roc.conf.label + - node_id: train_xgboost + xpath: train_xgboost.conf.xgboost_parameters.max_depth + type: number + value: 1 + eta: + map: + - node_id: train_xgboost + xpath: train_xgboost.conf.xgboost_parameters.eta + type: number + value: 0.1 features: + map: + - node_id: train_norm + xpath: train_norm.conf.columns + - node_id: train_xgboost + xpath: train_xgboost.conf.columns type: array_string value: - - y - map: - - node_id: train_norm - xpath: train_norm.conf.columns - - node_id: train_xgboost - xpath: train_xgboost.conf.columns + - y inclusive: - type: boolean map: - - node_id: train_norm - xpath: train_norm.conf.include - - node_id: train_xgboost - xpath: train_xgboost.conf.include + - node_id: train_norm + xpath: train_norm.conf.include + - node_id: train_xgboost + xpath: train_xgboost.conf.include + type: boolean value: false - depth: - type: number - value: 1 - map: - - node_id: train_xgboost - xpath: train_xgboost.conf.xgboost_parameters.max_depth - eta: - type: number - value: 0.1 + target: map: - - node_id: train_xgboost - xpath: train_xgboost.conf.xgboost_parameters.eta + - node_id: train_xgboost + xpath: train_xgboost.conf.target + - node_id: train_roc + xpath: train_roc.conf.label + - node_id: test_roc + xpath: test_roc.conf.label + type: string + value: y + input: + - train_norm.df_in + - test_norm.df_in + output: + - train_infer.out + - 
test_infer.out + - train_roc.roc_curve + - test_roc.roc_curve subnodes_conf: {} taskgraph: taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml + id: xgboost_model inputs: test_norm@df_in: data_splitter.test train_norm@df_in: data_splitter.train -- id: hpo - type: GridRandomSearchNode - conf: - parameters: - - search: - function: grid_search - args: - - 1 - - 3 - - 5 - name: depth - - search: - function: uniform - args: - - 0.1 - - 0.8 - name: eta - metrics: - - train_roc.value - - test_roc.value + type: ContextCompositeNode +- conf: best: - mode: max metric: test_roc.value - tune: - local_dir: ./ray - name: exp - num_samples: 1 - resources_per_trial: - cpu: 1 - gpu: 1 - input: - - train_norm.df_in - - test_norm.df_in - output: - - train_infer.out - - test_infer.out - - train_roc.roc_curve - - test_roc.roc_curve + mode: max context: - target: - type: string - value: y + depth: map: - - node_id: train_xgboost - xpath: train_xgboost.conf.target - - node_id: train_roc - xpath: train_roc.conf.label - - node_id: test_roc - xpath: test_roc.conf.label + - node_id: train_xgboost + xpath: train_xgboost.conf.xgboost_parameters.max_depth + type: number + value: 3 + eta: + map: + - node_id: train_xgboost + xpath: train_xgboost.conf.xgboost_parameters.eta + type: number + value: 0.2719547419145216 features: + map: + - node_id: train_norm + xpath: train_norm.conf.columns + - node_id: train_xgboost + xpath: train_xgboost.conf.columns type: array_string value: - - y - map: - - node_id: train_norm - xpath: train_norm.conf.columns - - node_id: train_xgboost - xpath: train_xgboost.conf.columns + - y inclusive: - type: boolean map: - - node_id: train_norm - xpath: train_norm.conf.include - - node_id: train_xgboost - xpath: train_xgboost.conf.include + - node_id: train_norm + xpath: train_norm.conf.include + - node_id: train_xgboost + xpath: train_xgboost.conf.include + type: boolean value: false - depth: - type: number - value: 3 - map: - - node_id: train_xgboost - xpath: 
train_xgboost.conf.xgboost_parameters.max_depth - eta: - type: number - value: 0.2719547419145216 + target: map: - - node_id: train_xgboost - xpath: train_xgboost.conf.xgboost_parameters.eta + - node_id: train_xgboost + xpath: train_xgboost.conf.target + - node_id: train_roc + xpath: train_roc.conf.label + - node_id: test_roc + xpath: test_roc.conf.label + type: string + value: y + input: + - train_norm.df_in + - test_norm.df_in + metrics: + - train_roc.value + - test_roc.value + output: + - train_infer.out + - test_infer.out + - train_roc.roc_curve + - test_roc.roc_curve + parameters: + - name: depth + search: + args: + - 1 + - 3 + - 5 + function: grid_search + - name: eta + search: + args: + - 0.1 + - 0.8 + function: uniform subnodes_conf: {} taskgraph: taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml + tune: + local_dir: ./ray + name: exp + num_samples: 1 + resources_per_trial: + cpu: 1 + gpu: 1 + id: hpo inputs: conf_in: xgboost_model.conf_out - train_norm@df_in: data_splitter.train test_norm@df_in: data_splitter.test - module: rapids_modules + train_norm@df_in: data_splitter.train + module: gquant_rapids_plugin.ml + type: GridRandomSearchNode diff --git a/taskgraphs/xgboost_example/metrics.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/metrics.gq.yaml similarity index 75% rename from taskgraphs/xgboost_example/metrics.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/metrics.gq.yaml index 08c8f25e..172f6aec 100644 --- a/taskgraphs/xgboost_example/metrics.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/metrics.gq.yaml @@ -1,106 +1,106 @@ -- id: data_gen - type: ClassificationData - conf: - n_samples: 10000 +- conf: + class_sep: 1 + dtype: float64 + flip_y: 0.01 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.01 - class_sep: 1 - hypercube: true - 
shift: 0 + n_samples: 10000 + order: F scale: 1 + shift: 0 shuffle: true - order: F - dtype: float64 - n_parts: 4 + id: data_gen inputs: {} - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.dataloader + type: ClassificationData +- conf: {} + id: '' inputs: in1: train_roc.roc_curve in2: test_roc.roc_curve in3: feature_importance.importance_curve -- id: data_generator - type: CompositeNode - conf: + type: Output_Collector +- conf: input: - - x2_to_sign.in + - x2_to_sign.in output: - - drop_x2_x3.out + - drop_x2_x3.out subnode_ids: - - data_gen + - data_gen subnodes_conf: data_gen: conf: - n_samples: 10000 + class_sep: 1 + dtype: float64 + flip_y: 0.1 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.1 - class_sep: 1 - hypercube: true - shift: 0 - scale: 1 - shuffle: true + n_samples: 10000 order: F - dtype: float64 - n_parts: 4 random_state: 10 + scale: 1 + shift: 0 + shuffle: true taskgraph: taskgraphs/xgboost_example/data_generator.gq.yaml + id: data_generator inputs: x2_to_sign@in: data_gen.cudf_out -- id: xgboost_model type: CompositeNode - conf: +- conf: input: - - train_norm.df_in - - test_norm.df_in + - train_norm.df_in + - test_norm.df_in output: - - train_infer.out - - test_infer.out - - train_xgboost.model_out + - train_infer.out + - test_infer.out + - train_xgboost.model_out subnodes_conf: {} taskgraph: taskgraphs/xgboost_example/xgboost_model.gq.yaml + id: xgboost_model inputs: test_norm@df_in: data_splitter.test train_norm@df_in: data_splitter.train -- id: data_splitter - type: DataSplittingNode - conf: - train_size: 0.8 + type: CompositeNode +- conf: target: y + train_size: 0.8 + id: data_splitter inputs: in: data_generator.drop_x2_x3@out - module: rapids_modules -- id: train_roc - type: RocCurveNode - conf: + module: gquant_rapids_plugin.ml + type: DataSplittingNode 
+- conf: label: y prediction: predict + id: train_roc inputs: in: xgboost_model.train_infer@out - module: rapids_modules -- id: test_roc + module: gquant_rapids_plugin.analysis type: RocCurveNode - conf: +- conf: label: y prediction: predict + id: test_roc inputs: in: xgboost_model.test_infer@out - module: rapids_modules -- id: feature_importance - type: ImportanceCurveNode - conf: + module: gquant_rapids_plugin.analysis + type: RocCurveNode +- conf: type: gain + id: feature_importance inputs: in: xgboost_model.train_xgboost@model_out - module: rapids_modules + module: gquant_rapids_plugin.analysis + type: ImportanceCurveNode diff --git a/taskgraphs/xgboost_example/ml_preprocess.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/ml_preprocess.gq.yaml similarity index 75% rename from taskgraphs/xgboost_example/ml_preprocess.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/ml_preprocess.gq.yaml index 11387f91..3002367c 100644 --- a/taskgraphs/xgboost_example/ml_preprocess.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/ml_preprocess.gq.yaml @@ -1,64 +1,64 @@ -- id: "" - type: Output_Collector - conf: {} +- conf: {} + id: '' inputs: in1: train_norm.df_out in2: test_norm.df_out -- id: data_generator - type: CompositeNode - conf: + type: Output_Collector +- conf: output: - - drop_x2_x3.out + - drop_x2_x3.out subnode_ids: - - data_gen + - data_gen subnodes_conf: data_gen: conf: - n_samples: 10000 + class_sep: 1 + dtype: float64 + flip_y: 0.1 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.1 - class_sep: 1 - hypercube: true - shift: 0 - scale: 1 - shuffle: true + n_samples: 10000 order: F - dtype: float64 - n_parts: 4 random_state: 10 + scale: 1 + shift: 0 + shuffle: true taskgraph: taskgraphs/xgboost_example/data_generator.gq.yaml + id: data_generator inputs: {} -- id: 
data_splitter - type: DataSplittingNode - conf: - train_size: 0.8 + type: CompositeNode +- conf: target: y + train_size: 0.8 + id: data_splitter inputs: in: data_generator.drop_x2_x3@out - module: rapids_modules -- id: train_norm - type: NormalizationNode - conf: + module: gquant_rapids_plugin.ml + type: DataSplittingNode +- conf: columns: - - x3_0 - - x3_1 - - x2_0 - - x2_1 - - y + - x3_0 + - x3_1 + - x2_0 + - x2_1 + - y include: false + id: train_norm inputs: df_in: data_splitter.train - module: rapids_modules -- id: test_norm + module: gquant_rapids_plugin.transform type: NormalizationNode - conf: +- conf: include: true + id: test_norm inputs: - norm_data_in: train_norm.norm_data_out df_in: data_splitter.test - module: rapids_modules + norm_data_in: train_norm.norm_data_out + module: gquant_rapids_plugin.transform + type: NormalizationNode diff --git a/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/stock_data.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/stock_data.gq.yaml new file mode 100644 index 00000000..0e043934 --- /dev/null +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/stock_data.gq.yaml @@ -0,0 +1,229 @@ +- conf: + file: notebooks/data/stock_price_hist.csv.gz + path: notebooks/many-small + id: stock_data + inputs: {} + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: {} + id: '' + inputs: + in1: drop_col.out + type: Output_Collector +- conf: + input: + - preprocess.sort_node@in + output: + - technical_indicator.stock_out + subnode_ids: + - technical_indicator + subnodes_conf: + technical_indicator: + conf: + indicators: + - args: + - 10 + columns: + - close + function: port_bollinger_bands + - args: + - 2 + - 3 + columns: + - high + - low + - close + - volume + function: port_chaikin_oscillator + - args: + - 2 + - 3 + columns: + - close + function: port_macd + - args: + - 2 + columns: + - high + - low + function: port_relative_strength_index + - args: + - 2 + columns: + - high + - low + 
- close + function: port_average_true_range + - args: + - 2 + columns: + - high + - low + - close + function: port_stochastic_oscillator_k + - args: + - 2 + columns: + - high + - low + - close + function: port_stochastic_oscillator_d + - args: + - 2 + columns: + - high + - low + - close + - volume + function: port_money_flow_index + - args: + - 2 + columns: + - close + - volume + function: port_force_index + - args: + - 2 + columns: + - high + - low + - close + function: port_ultimate_oscillator + - args: + - 2 + columns: + - high + - low + - close + - volume + function: port_accumulation_distribution + - args: + - 2 + columns: + - high + - low + - close + function: port_commodity_channel_index + - args: + - 2 + columns: + - close + - volume + function: port_on_balance_volume + - args: + - 2 + columns: + - high + - low + - close + function: port_vortex_indicator + - args: + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + columns: + - close + function: port_kst_oscillator + - args: + - 2 + - 3 + columns: + - high + - low + function: port_mass_index + - args: + - 2 + - 3 + columns: + - close + function: port_true_strength_index + - args: + - 2 + columns: + - high + - low + - volume + function: port_ease_of_movement + - args: + - 2 + columns: + - close + function: port_coppock_curve + - args: + - 2 + columns: + - high + - low + - close + function: port_keltner_channel + - args: + - 2 + columns: + - high + - low + - close + function: port_ppsr + - args: + - 0.9 + columns: + - close + function: port_fractional_diff + - args: + - 0.7 + columns: + - close + function: port_fractional_diff + - args: + - 0.5 + columns: + - close + function: port_fractional_diff + - args: + - 0.3 + columns: + - close + function: port_fractional_diff + - args: + - 0.1 + columns: + - close + function: port_fractional_diff + - args: + - -1 + columns: + - returns + function: port_shift + remove_na: true + taskgraph: taskgraphs/xgboost_trade.gq.yaml + id: stock_feature + inputs: + 
preprocess@sort_node@in: stock_data.cudf_out + type: CompositeNode +- conf: + column: SHIFT_-1 + sign: sign + id: pos_neg_return + inputs: + in: stock_feature.technical_indicator@stock_out + module: gquant_rapids_plugin.transform + type: AddSignIndicatorNode +- conf: + columns: + - indicator + - datetime + - asset + - SHIFT_-1 + - open + - high + - low + - close + id: drop_col + inputs: + in: pos_neg_return.out + module: gquant_rapids_plugin.transform + type: DropNode diff --git a/taskgraphs/xgboost_example/tree_inference.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/tree_inference.gq.yaml similarity index 71% rename from taskgraphs/xgboost_example/tree_inference.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/tree_inference.gq.yaml index dbac5f6c..be3d762b 100644 --- a/taskgraphs/xgboost_example/tree_inference.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/tree_inference.gq.yaml @@ -1,124 +1,124 @@ -- id: data_gen - type: ClassificationData - conf: - n_samples: 10000 +- conf: + class_sep: 1 + dtype: float64 + flip_y: 0.01 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.01 - class_sep: 1 - hypercube: true - shift: 0 + n_samples: 10000 + order: F scale: 1 + shift: 0 shuffle: true - order: F - dtype: float64 - n_parts: 4 + id: data_gen inputs: {} - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.dataloader + type: ClassificationData +- conf: {} + id: '' inputs: in1: model_export.filename in2: test_data.out in3: test_infer.out in4: xgboost_infer.out -- id: data_generator - type: CompositeNode - conf: + type: Output_Collector +- conf: input: - - x2_to_sign.in + - x2_to_sign.in output: - - drop_x2_x3.out + - drop_x2_x3.out subnode_ids: - - data_gen + - data_gen subnodes_conf: data_gen: conf: - n_samples: 10000 + 
class_sep: 1 + dtype: float64 + flip_y: 0.1 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.1 - class_sep: 1 - hypercube: true - shift: 0 - scale: 1 - shuffle: true + n_samples: 10000 order: F - dtype: float64 - n_parts: 4 random_state: 10 + scale: 1 + shift: 0 + shuffle: true taskgraph: taskgraphs/xgboost_example/data_generator.gq.yaml + id: data_generator inputs: x2_to_sign@in: data_gen.dask_cudf_out -- id: xgboost_model type: CompositeNode - conf: +- conf: input: - - train_norm.df_in - - test_norm.df_in + - train_norm.df_in + - test_norm.df_in output: - - train_infer.out - - test_infer.out - - train_xgboost.model_out - - train_norm.df_out - - test_norm.df_out + - train_infer.out + - test_infer.out + - train_xgboost.model_out + - train_norm.df_out + - test_norm.df_out subnode_ids: [] subnodes_conf: {} taskgraph: taskgraphs/xgboost_example/xgboost_model.gq.yaml + id: xgboost_model inputs: test_norm@df_in: data_splitter.test train_norm@df_in: data_splitter.train -- id: data_splitter - type: DataSplittingNode - conf: - train_size: 0.8 + type: CompositeNode +- conf: target: y + train_size: 0.8 + id: data_splitter inputs: in: data_generator.drop_x2_x3@out - module: rapids_modules -- id: model_export - type: XGBoostExportNode - conf: + module: gquant_rapids_plugin.ml + type: DataSplittingNode +- conf: path: xgboost_model_file + id: model_export inputs: model_in: xgboost_model.train_xgboost@model_out - module: rapids_modules -- id: tree_inference - type: ForestInferenceNode - conf: + module: gquant_rapids_plugin.analysis + type: XGBoostExportNode +- conf: columns: - - y + - y + file: xgboost_model_file include: false prediction: predict - file: xgboost_model_file + id: tree_inference inputs: - model_file: model_export.filename data_in: xgboost_model.test_norm@df_out - module: rapids_modules -- id: test_data - type: DaskComputeNode - 
conf: {} + model_file: model_export.filename + module: gquant_rapids_plugin.ml + type: ForestInferenceNode +- conf: {} + id: test_data inputs: in: xgboost_model.test_norm@df_out - module: rapids_modules -- id: test_infer + module: gquant_rapids_plugin.transform type: DaskComputeNode - conf: {} +- conf: {} + id: test_infer inputs: in: tree_inference.out - module: rapids_modules -- id: xgboost_infer + module: gquant_rapids_plugin.transform type: DaskComputeNode - conf: {} +- conf: {} + id: xgboost_infer inputs: in: xgboost_model.test_infer@out - module: rapids_modules + module: gquant_rapids_plugin.transform + type: DaskComputeNode diff --git a/taskgraphs/xgboost_example/xgboost_model.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_model.gq.yaml similarity index 78% rename from taskgraphs/xgboost_example/xgboost_model.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_model.gq.yaml index 8c7e5d04..07dcf4e0 100644 --- a/taskgraphs/xgboost_example/xgboost_model.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_model.gq.yaml @@ -1,133 +1,133 @@ -- id: data_gen - type: ClassificationData - conf: - n_samples: 10000 +- conf: + class_sep: 1 + dtype: float64 + flip_y: 0.01 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.01 - class_sep: 1 - hypercube: true - shift: 0 + n_samples: 10000 + order: F scale: 1 + shift: 0 shuffle: true - order: F - dtype: float64 - n_parts: 4 + id: data_gen inputs: {} - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.dataloader + type: ClassificationData +- conf: {} + id: '' inputs: in1: test_infer.out in2: train_infer.out -- id: data_generator - type: CompositeNode - conf: + type: Output_Collector +- conf: input: - - x2_to_sign.in + - x2_to_sign.in output: - - drop_x2_x3.out + 
- drop_x2_x3.out subnode_ids: - - data_gen + - data_gen subnodes_conf: data_gen: conf: - n_samples: 10000 + class_sep: 1 + dtype: float64 + flip_y: 0.1 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.1 - class_sep: 1 - hypercube: true - shift: 0 - scale: 1 - shuffle: true + n_samples: 10000 order: F - dtype: float64 - n_parts: 4 random_state: 10 + scale: 1 + shift: 0 + shuffle: true taskgraph: taskgraphs/xgboost_example/data_generator.gq.yaml + id: data_generator inputs: x2_to_sign@in: data_gen.cudf_out -- id: data_splitter - type: DataSplittingNode - conf: - train_size: 0.8 + type: CompositeNode +- conf: target: y + train_size: 0.8 + id: data_splitter inputs: in: data_generator.drop_x2_x3@out - module: rapids_modules -- id: train_norm - type: NormalizationNode - conf: + module: gquant_rapids_plugin.ml + type: DataSplittingNode +- conf: columns: - - x3_0 - - x3_1 - - x2_0 - - x2_1 - - y + - x3_0 + - x3_1 + - x2_0 + - x2_1 + - y include: false + id: train_norm inputs: df_in: data_splitter.train - module: rapids_modules -- id: test_norm + module: gquant_rapids_plugin.transform type: NormalizationNode - conf: +- conf: include: true + id: test_norm inputs: - norm_data_in: train_norm.norm_data_out df_in: data_splitter.test - module: rapids_modules -- id: train_xgboost - type: TrainXGBoostNode - conf: - num_of_rounds: 100 + norm_data_in: train_norm.norm_data_out + module: gquant_rapids_plugin.transform + type: NormalizationNode +- conf: columns: - - y + - y include: false + num_of_rounds: 100 + target: y xgboost_parameters: - eta: 0.3 - min_child_weight: 1 - subsample: 1 - sampling_method: uniform - colsample_bytree: 1 + alpha: 0 colsample_bylevel: 1 colsample_bynode: 1 - max_depth: 8 - max_leaves: 256 - grow_policy: depthwise + colsample_bytree: 1 + deterministic_histogram: false + eta: 0.3 gamma: 0 + grow_policy: depthwise lambda: 1 
- alpha: 0 - tree_method: gpu_hist - single_precision_histogram: false - deterministic_histogram: false + max_depth: 8 + max_leaves: 256 + min_child_weight: 1 objective: binary:logistic - target: y + sampling_method: uniform + single_precision_histogram: false + subsample: 1 + tree_method: gpu_hist + id: train_xgboost inputs: in: train_norm.df_out - module: rapids_modules -- id: train_infer - type: InferXGBoostNode - conf: + module: gquant_rapids_plugin.ml + type: TrainXGBoostNode +- conf: prediction: predict + id: train_infer inputs: data_in: train_norm.df_out model_in: train_xgboost.model_out - module: rapids_modules -- id: test_infer + module: gquant_rapids_plugin.ml type: InferXGBoostNode - conf: +- conf: prediction: predict + id: test_infer inputs: data_in: test_norm.df_out model_in: train_xgboost.model_out - module: rapids_modules + module: gquant_rapids_plugin.ml + type: InferXGBoostNode diff --git a/taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml similarity index 76% rename from taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml index 913dcf49..5826b40b 100644 --- a/taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml @@ -1,149 +1,149 @@ -- id: data_gen - type: ClassificationData - conf: - n_samples: 10000 +- conf: + class_sep: 1 + dtype: float64 + flip_y: 0.01 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.01 - class_sep: 1 - hypercube: true - shift: 0 + n_samples: 10000 + order: F scale: 1 + shift: 0 shuffle: true - order: F - dtype: float64 - n_parts: 4 + id: data_gen inputs: {} - module: 
rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.dataloader + type: ClassificationData +- conf: {} + id: '' inputs: in1: test_roc.roc_curve in2: train_roc.roc_curve -- id: data_generator - type: CompositeNode - conf: + type: Output_Collector +- conf: input: - - x2_to_sign.in + - x2_to_sign.in output: - - drop_x2_x3.out + - drop_x2_x3.out subnode_ids: - - data_gen + - data_gen subnodes_conf: data_gen: conf: - n_samples: 10000 + class_sep: 1 + dtype: float64 + flip_y: 0.1 + hypercube: true + n_classes: 2 + n_clusters_per_class: 2 n_features: 10 n_informative: 4 + n_parts: 4 n_redundant: 0 n_repeated: 0 - n_classes: 2 - n_clusters_per_class: 2 - flip_y: 0.1 - class_sep: 1 - hypercube: true - shift: 0 - scale: 1 - shuffle: true + n_samples: 10000 order: F - dtype: float64 - n_parts: 4 random_state: 10 + scale: 1 + shift: 0 + shuffle: true taskgraph: taskgraphs/xgboost_example/data_generator.gq.yaml + id: data_generator inputs: x2_to_sign@in: data_gen.cudf_out -- id: data_splitter - type: DataSplittingNode - conf: - train_size: 0.8 + type: CompositeNode +- conf: target: y + train_size: 0.8 + id: data_splitter inputs: in: data_generator.drop_x2_x3@out - module: rapids_modules -- id: train_norm - type: NormalizationNode - conf: + module: gquant_rapids_plugin.ml + type: DataSplittingNode +- conf: columns: - - x3_0 - - x3_1 - - x2_0 - - x2_1 - - y + - x3_0 + - x3_1 + - x2_0 + - x2_1 + - y include: false + id: train_norm inputs: df_in: data_splitter.train - module: rapids_modules -- id: test_norm + module: gquant_rapids_plugin.transform type: NormalizationNode - conf: +- conf: include: true + id: test_norm inputs: - norm_data_in: train_norm.norm_data_out df_in: data_splitter.test - module: rapids_modules -- id: train_xgboost - type: TrainXGBoostNode - conf: - num_of_rounds: 100 + norm_data_in: train_norm.norm_data_out + module: gquant_rapids_plugin.transform + type: NormalizationNode +- conf: columns: - - y + - y include: false + 
num_of_rounds: 100 + target: y xgboost_parameters: - eta: 0.3 - min_child_weight: 1 - subsample: 1 - sampling_method: uniform - colsample_bytree: 1 + alpha: 0 colsample_bylevel: 1 colsample_bynode: 1 - max_depth: 8 - max_leaves: 256 - grow_policy: depthwise + colsample_bytree: 1 + deterministic_histogram: false + eta: 0.3 gamma: 0 + grow_policy: depthwise lambda: 1 - alpha: 0 - tree_method: gpu_hist - single_precision_histogram: false - deterministic_histogram: false + max_depth: 8 + max_leaves: 256 + min_child_weight: 1 objective: binary:logistic - target: y + sampling_method: uniform + single_precision_histogram: false + subsample: 1 + tree_method: gpu_hist + id: train_xgboost inputs: in: train_norm.df_out - module: rapids_modules -- id: train_infer - type: InferXGBoostNode - conf: + module: gquant_rapids_plugin.ml + type: TrainXGBoostNode +- conf: prediction: predict + id: train_infer inputs: data_in: train_norm.df_out model_in: train_xgboost.model_out - module: rapids_modules -- id: test_infer + module: gquant_rapids_plugin.ml type: InferXGBoostNode - conf: +- conf: prediction: predict + id: test_infer inputs: data_in: test_norm.df_out model_in: train_xgboost.model_out - module: rapids_modules -- id: train_roc - type: RocCurveNode - conf: + module: gquant_rapids_plugin.ml + type: InferXGBoostNode +- conf: label: y prediction: predict + id: train_roc inputs: in: train_infer.out - module: rapids_modules -- id: test_roc + module: gquant_rapids_plugin.analysis type: RocCurveNode - conf: +- conf: label: y prediction: predict + id: test_roc inputs: in: test_infer.out - module: rapids_modules + module: gquant_rapids_plugin.analysis + type: RocCurveNode diff --git a/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_stock.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_stock.gq.yaml new file mode 100644 index 00000000..c767614f --- /dev/null +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_stock.gq.yaml @@ 
-0,0 +1,300 @@ +- conf: + file: notebooks/data/stock_price_hist.csv.gz + path: notebooks/many-small + id: stock_data + inputs: {} + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: {} + id: '' + inputs: + in1: test_roc.roc_curve + in2: train_roc.roc_curve + in3: feature_importance.importance_curve + in4: xgboost_model.train_xgboost@model_out + type: Output_Collector +- conf: + input: + - preprocess.sort_node@in + output: + - technical_indicator.stock_out + subnode_ids: + - technical_indicator + subnodes_conf: + technical_indicator: + conf: + indicators: + - args: + - 10 + columns: + - close + function: port_bollinger_bands + - args: + - 2 + - 3 + columns: + - high + - low + - close + - volume + function: port_chaikin_oscillator + - args: + - 2 + - 3 + columns: + - close + function: port_macd + - args: + - 2 + columns: + - high + - low + function: port_relative_strength_index + - args: + - 2 + columns: + - high + - low + - close + function: port_average_true_range + - args: + - 2 + columns: + - high + - low + - close + function: port_stochastic_oscillator_k + - args: + - 2 + columns: + - high + - low + - close + function: port_stochastic_oscillator_d + - args: + - 2 + columns: + - high + - low + - close + - volume + function: port_money_flow_index + - args: + - 2 + columns: + - close + - volume + function: port_force_index + - args: + - 2 + columns: + - high + - low + - close + function: port_ultimate_oscillator + - args: + - 2 + columns: + - high + - low + - close + - volume + function: port_accumulation_distribution + - args: + - 2 + columns: + - high + - low + - close + function: port_commodity_channel_index + - args: + - 2 + columns: + - close + - volume + function: port_on_balance_volume + - args: + - 2 + columns: + - high + - low + - close + function: port_vortex_indicator + - args: + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + columns: + - close + function: port_kst_oscillator + - args: + - 2 + - 3 + columns: + - high + - low + 
function: port_mass_index + - args: + - 2 + - 3 + columns: + - close + function: port_true_strength_index + - args: + - 2 + columns: + - high + - low + - volume + function: port_ease_of_movement + - args: + - 2 + columns: + - close + function: port_coppock_curve + - args: + - 2 + columns: + - high + - low + - close + function: port_keltner_channel + - args: + - 2 + columns: + - high + - low + - close + function: port_ppsr + - args: + - 0.9 + columns: + - close + function: port_fractional_diff + - args: + - 0.7 + columns: + - close + function: port_fractional_diff + - args: + - 0.5 + columns: + - close + function: port_fractional_diff + - args: + - 0.3 + columns: + - close + function: port_fractional_diff + - args: + - 0.1 + columns: + - close + function: port_fractional_diff + - args: + - -1 + columns: + - returns + function: port_shift + remove_na: true + taskgraph: taskgraphs/xgboost_trade.gq.yaml + id: stock_feature + inputs: + preprocess@sort_node@in: stock_data.dask_cudf_out + type: CompositeNode +- conf: + column: SHIFT_-1 + sign: sign + id: pos_neg_return + inputs: + in: stock_feature.technical_indicator@stock_out + module: gquant_rapids_plugin.transform + type: AddSignIndicatorNode +- conf: + columns: + - indicator + - datetime + - asset + - SHIFT_-1 + - open + - high + - low + - close + id: drop_col + inputs: + in: pos_neg_return.out + module: gquant_rapids_plugin.transform + type: DropNode +- conf: + target: sign + train_size: 0.8 + id: split_data + inputs: + in: drop_col.out + module: gquant_rapids_plugin.ml + type: DataSplittingNode +- conf: + train_norm: + conf: + columns: + - sign + include: false + train_xgboost: + conf: + columns: + - sign + include: false + num_of_rounds: 100 + target: sign + xgboost_parameters: + alpha: 0 + colsample_bylevel: 1 + colsample_bynode: 1 + colsample_bytree: 1 + deterministic_histogram: false + eta: 0.3 + gamma: 0 + grow_policy: depthwise + lambda: 1 + max_depth: 8 + max_leaves: 256 + min_child_weight: 1 + objective: 
binary:logistic + sampling_method: uniform + single_precision_histogram: false + subsample: 1 + tree_method: gpu_hist + id: xgboost_model + inputs: + test_norm@df_in: split_data.test + train_norm@df_in: split_data.train + module: my_node + type: CustXGBoostNode +- conf: + label: sign + prediction: predict + id: train_roc + inputs: + in: xgboost_model.train_infer@out + module: gquant_rapids_plugin.analysis + type: RocCurveNode +- conf: + label: sign + prediction: predict + id: test_roc + inputs: + in: xgboost_model.test_infer@out + module: gquant_rapids_plugin.analysis + type: RocCurveNode +- conf: + type: gain + id: feature_importance + inputs: + in: xgboost_model.train_xgboost@model_out + module: gquant_rapids_plugin.analysis + type: ImportanceCurveNode diff --git a/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_stock_hpo.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_stock_hpo.gq.yaml new file mode 100644 index 00000000..9fd66008 --- /dev/null +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_example/xgboost_stock_hpo.gq.yaml @@ -0,0 +1,382 @@ +- conf: {} + id: '' + inputs: + in1: xgboost_model.train_roc@roc_curve + in2: xgboost_model.test_roc@roc_curve + in3: hpo.conf_out + in4: hpo.train_roc@roc_curve + in5: hpo.test_roc@roc_curve + type: Output_Collector +- conf: + context: + depth: + map: + - node_id: train_xgboost + xpath: train_xgboost.conf.xgboost_parameters.max_depth + type: number + value: 1 + eta: + map: + - node_id: train_xgboost + xpath: train_xgboost.conf.xgboost_parameters.eta + type: number + value: 0.1 + features: + map: + - node_id: train_norm + xpath: train_norm.conf.columns + - node_id: train_xgboost + xpath: train_xgboost.conf.columns + type: array_string + value: + - sign + inclusive: + map: + - node_id: train_norm + xpath: train_norm.conf.include + - node_id: train_xgboost + xpath: train_xgboost.conf.include + type: boolean + value: false + target: + map: + - node_id: train_xgboost + xpath: 
train_xgboost.conf.target + - node_id: train_roc + xpath: train_roc.conf.label + - node_id: test_roc + xpath: test_roc.conf.label + type: string + value: sign + input: + - train_norm.df_in + - test_norm.df_in + output: + - train_infer.out + - test_infer.out + - train_roc.roc_curve + - test_roc.roc_curve + subnodes_conf: {} + taskgraph: taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml + id: xgboost_model + inputs: + test_norm@df_in: split_data.test + train_norm@df_in: split_data.train + type: ContextCompositeNode +- conf: + best: + metric: test_roc.value + mode: max + context: + depth: + map: + - node_id: train_xgboost + xpath: train_xgboost.conf.xgboost_parameters.max_depth + type: number + value: 1 + eta: + map: + - node_id: train_xgboost + xpath: train_xgboost.conf.xgboost_parameters.eta + type: number + value: 0.1 + features: + map: + - node_id: train_norm + xpath: train_norm.conf.columns + - node_id: train_xgboost + xpath: train_xgboost.conf.columns + type: array_string + value: + - sign + inclusive: + map: + - node_id: train_norm + xpath: train_norm.conf.include + - node_id: train_xgboost + xpath: train_xgboost.conf.include + type: boolean + value: false + target: + map: + - node_id: train_xgboost + xpath: train_xgboost.conf.target + - node_id: train_roc + xpath: train_roc.conf.label + - node_id: test_roc + xpath: test_roc.conf.label + type: string + value: sign + input: + - train_norm.df_in + - test_norm.df_in + metrics: + - train_roc.value + - test_roc.value + output: + - train_infer.out + - test_infer.out + - train_roc.roc_curve + - test_roc.roc_curve + parameters: + - name: depth + search: + args: + - 1 + - 3 + - 5 + function: grid_search + - name: eta + search: + args: + - 0.1 + - 0.8 + function: uniform + subnodes_conf: {} + taskgraph: taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml + tune: + local_dir: ./ray + name: stock + num_samples: 1 + resources_per_trial: + cpu: 1 + gpu: 1 + id: hpo + inputs: + conf_in: 
xgboost_model.conf_out + test_norm@df_in: split_data.test + train_norm@df_in: split_data.train + module: gquant_rapids_plugin.ml + type: GridRandomSearchNode +- conf: + file: notebooks/data/stock_price_hist.csv.gz + path: notebooks/many-small + id: stock_data + inputs: {} + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: + input: + - preprocess.sort_node@in + output: + - technical_indicator.stock_out + subnode_ids: + - technical_indicator + subnodes_conf: + technical_indicator: + conf: + indicators: + - args: + - 10 + columns: + - close + function: port_bollinger_bands + - args: + - 2 + - 3 + columns: + - high + - low + - close + - volume + function: port_chaikin_oscillator + - args: + - 2 + - 3 + columns: + - close + function: port_macd + - args: + - 2 + columns: + - high + - low + function: port_relative_strength_index + - args: + - 2 + columns: + - high + - low + - close + function: port_average_true_range + - args: + - 2 + columns: + - high + - low + - close + function: port_stochastic_oscillator_k + - args: + - 2 + columns: + - high + - low + - close + function: port_stochastic_oscillator_d + - args: + - 2 + columns: + - high + - low + - close + - volume + function: port_money_flow_index + - args: + - 2 + columns: + - close + - volume + function: port_force_index + - args: + - 2 + columns: + - high + - low + - close + function: port_ultimate_oscillator + - args: + - 2 + columns: + - high + - low + - close + - volume + function: port_accumulation_distribution + - args: + - 2 + columns: + - high + - low + - close + function: port_commodity_channel_index + - args: + - 2 + columns: + - close + - volume + function: port_on_balance_volume + - args: + - 2 + columns: + - high + - low + - close + function: port_vortex_indicator + - args: + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + columns: + - close + function: port_kst_oscillator + - args: + - 2 + - 3 + columns: + - high + - low + function: port_mass_index + - args: + - 2 + - 3 + 
columns: + - close + function: port_true_strength_index + - args: + - 2 + columns: + - high + - low + - volume + function: port_ease_of_movement + - args: + - 2 + columns: + - close + function: port_coppock_curve + - args: + - 2 + columns: + - high + - low + - close + function: port_keltner_channel + - args: + - 2 + columns: + - high + - low + - close + function: port_ppsr + - args: + - 0.9 + columns: + - close + function: port_fractional_diff + - args: + - 0.7 + columns: + - close + function: port_fractional_diff + - args: + - 0.5 + columns: + - close + function: port_fractional_diff + - args: + - 0.3 + columns: + - close + function: port_fractional_diff + - args: + - 0.1 + columns: + - close + function: port_fractional_diff + - args: + - -1 + columns: + - returns + function: port_shift + remove_na: true + taskgraph: taskgraphs/xgboost_trade.gq.yaml + id: stock_feature + inputs: + preprocess@sort_node@in: stock_data.cudf_out + type: CompositeNode +- conf: + column: SHIFT_-1 + sign: sign + id: pos_neg_return + inputs: + in: stock_feature.technical_indicator@stock_out + module: gquant_rapids_plugin.transform + type: AddSignIndicatorNode +- conf: + columns: + - indicator + - datetime + - asset + - SHIFT_-1 + - open + - high + - low + - close + id: drop_col + inputs: + in: pos_neg_return.out + module: gquant_rapids_plugin.transform + type: DropNode +- conf: + target: sign + train_size: 0.8 + id: split_data + inputs: + in: drop_col.out + module: gquant_rapids_plugin.ml + type: DataSplittingNode diff --git a/taskgraphs/xgboost_trade.gq.yaml b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_trade.gq.yaml similarity index 53% rename from taskgraphs/xgboost_trade.gq.yaml rename to gQuant/plugins/rapids_plugin/taskgraphs/xgboost_trade.gq.yaml index b54913db..4367589a 100644 --- a/taskgraphs/xgboost_trade.gq.yaml +++ b/gQuant/plugins/rapids_plugin/taskgraphs/xgboost_trade.gq.yaml @@ -1,165 +1,165 @@ -- id: stock_data - type: CsvStockLoader - conf: +- conf: file: 
notebooks/data/stock_price_hist.csv.gz path: notebooks/many-small + id: stock_data inputs: {} - module: rapids_modules -- id: preprocess - type: CompositeNode - conf: + module: gquant_rapids_plugin.dataloader + type: CsvStockLoader +- conf: + input: + - sort_node.in + output: + - drop_columns.out subnode_ids: - - value_filter - - drop_columns + - value_filter + - drop_columns subnodes_conf: - value_filter: - conf: - - column: min_return - min: -10 - - column: max_return - max: 10 - - column: average_volume - min: 400 drop_columns: conf: columns: - - average_volume - - min_return - - max_return + - average_volume + - min_return + - max_return + value_filter: + conf: + - column: min_return + min: -10 + - column: max_return + max: 10 + - column: average_volume + min: 400 taskgraph: taskgraphs/preprocess.gq.yaml - input: - - sort_node.in - output: - - drop_columns.out + id: preprocess inputs: sort_node@in: stock_data.cudf_out -- id: sort_after - type: SortNode - conf: + type: CompositeNode +- conf: keys: - - asset - - datetime + - asset + - datetime + id: sort_after inputs: in: preprocess.drop_columns@out - module: rapids_modules -- id: backtest - type: SimpleBackTestNode - conf: {} + module: gquant_rapids_plugin.transform + type: SortNode +- conf: {} + id: backtest inputs: bardata_in: xgboost.stock_out - module: rapids_modules -- id: portfolio_opt_train - type: SimpleAveragePortOpt - conf: {} + module: gquant_rapids_plugin.backtest + type: SimpleBackTestNode +- conf: {} + id: portfolio_opt_train inputs: stock_in: train_df.stock_out - module: rapids_modules -- id: sharpe_ratio_trn - type: SharpeRatioNode - conf: {} + module: gquant_rapids_plugin.portofolio + type: SimpleAveragePortOpt +- conf: {} + id: sharpe_ratio_trn inputs: stock_in: portfolio_opt_train.stock_out - module: rapids_modules -- id: cumulative_return_trn - type: CumReturnNode - conf: - points: 300 + module: gquant_rapids_plugin.analysis + type: SharpeRatioNode +- conf: label: training cumulative return + 
points: 300 + id: cumulative_return_trn inputs: in: portfolio_opt_train.stock_out - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} + module: gquant_rapids_plugin.analysis + type: CumReturnNode +- conf: {} + id: '' inputs: in1: sharpe_ratio_trn.sharpe_out in2: cumulative_return_trn.cum_return in3: sharpe_ratio_val.sharpe_out in4: cumulative_return_val.cum_return -- id: technical_indicator - type: IndicatorNode - conf: + type: Output_Collector +- conf: indicators: - - function: port_chaikin_oscillator - args: - - 10 - - 20 - columns: - - high - - low - - close - - volume - - function: port_bollinger_bands - args: - - 10 - columns: - - close - - function: port_shift - args: - - -1 - columns: - - returns + - args: + - 10 + - 20 + columns: + - high + - low + - close + - volume + function: port_chaikin_oscillator + - args: + - 10 + columns: + - close + function: port_bollinger_bands + - args: + - -1 + columns: + - returns + function: port_shift remove_na: true + id: technical_indicator inputs: stock_in: sort_after.out - module: rapids_modules -- id: xgboost - type: XGBoostStrategyNode - conf: - num_of_rounds: 100 + module: gquant_rapids_plugin.transform + type: IndicatorNode +- conf: no_feature: - - asset - - datetime - - volume - - close - - high - - low - - returns - - indicator - - open + - asset + - datetime + - volume + - close + - high + - low + - returns + - indicator + - open + num_of_rounds: 100 + target: SHIFT_-1 + train_date: 2010-01-01 xgboost_parameters: + gamma: 0 max_depth: 8 max_leaves: 256 - gamma: 0 objective: reg:squarederror - train_date: 2010-01-01 - target: SHIFT_-1 + id: xgboost inputs: stock_in: technical_indicator.stock_out - module: rapids_modules -- id: train_df - type: DatetimeFilterNode - conf: - end: 2010-01-01 + module: gquant_rapids_plugin.strategy + type: XGBoostStrategyNode +- conf: beg: 1985-01-01 + end: 2010-01-01 + id: train_df inputs: stock_in: backtest.backtest_out - module: rapids_modules -- id: validation_df 
+ module: gquant_rapids_plugin.transform type: DatetimeFilterNode - conf: +- conf: beg: 2010-01-01 end: 2025-01-01 + id: validation_df inputs: stock_in: backtest.backtest_out - module: rapids_modules -- id: portfolio_opt_validation - type: SimpleAveragePortOpt - conf: {} + module: gquant_rapids_plugin.transform + type: DatetimeFilterNode +- conf: {} + id: portfolio_opt_validation inputs: stock_in: validation_df.stock_out - module: rapids_modules -- id: sharpe_ratio_val - type: SharpeRatioNode - conf: {} + module: gquant_rapids_plugin.portofolio + type: SimpleAveragePortOpt +- conf: {} + id: sharpe_ratio_val inputs: stock_in: portfolio_opt_validation.stock_out - module: rapids_modules -- id: cumulative_return_val - type: CumReturnNode - conf: - points: 300 + module: gquant_rapids_plugin.analysis + type: SharpeRatioNode +- conf: label: validation cumulative return + points: 300 + id: cumulative_return_val inputs: in: portfolio_opt_validation.stock_out - module: rapids_modules + module: gquant_rapids_plugin.analysis + type: CumReturnNode diff --git a/gQuant/plugins/rapids_plugin/tests/__init__.py b/gQuant/plugins/rapids_plugin/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gQuant/plugins/rapids_plugin/tests/unit/__init__.py b/gQuant/plugins/rapids_plugin/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/technical_indicators.py b/gQuant/plugins/rapids_plugin/tests/unit/technical_indicators.py similarity index 99% rename from tests/unit/technical_indicators.py rename to gQuant/plugins/rapids_plugin/tests/unit/technical_indicators.py index 489e6105..d38c4aaa 100644 --- a/tests/unit/technical_indicators.py +++ b/gQuant/plugins/rapids_plugin/tests/unit/technical_indicators.py @@ -551,7 +551,7 @@ def donchian_channel(df, n): i = 0 while i + n - 1 < df.index[-1]: - dc = max(df['High'].ix[i:i + n - 1]) - min(df['Low'].ix[i:i + n - 1]) + dc = max(df['High'].loc[i:i + n - 1]) - min(df['Low'].loc[i:i + n - 
1]) dc_l.append(dc) i += 1 diff --git a/tests/unit/test_fractional_diff.py b/gQuant/plugins/rapids_plugin/tests/unit/test_fractional_diff.py similarity index 95% rename from tests/unit/test_fractional_diff.py rename to gQuant/plugins/rapids_plugin/tests/unit/test_fractional_diff.py index 7b47e530..8ee325b3 100644 --- a/tests/unit/test_fractional_diff.py +++ b/gQuant/plugins/rapids_plugin/tests/unit/test_fractional_diff.py @@ -22,11 +22,9 @@ import pandas as pd import unittest import cudf -import os -from gquant.dataframe_flow.task import load_modules -load_modules(os.getenv('MODULEPATH')+'/rapids_modules/') -from rapids_modules.cuindicator import (fractional_diff, get_weights_floored, - port_fractional_diff) +from gquant_rapids_plugin.cuindicator import (fractional_diff, + get_weights_floored, + port_fractional_diff) import numpy as np from .utils import make_orderer import warnings diff --git a/tests/unit/test_indicator.py b/gQuant/plugins/rapids_plugin/tests/unit/test_indicator.py similarity index 99% rename from tests/unit/test_indicator.py rename to gQuant/plugins/rapids_plugin/tests/unit/test_indicator.py index 4060b512..7f27c51f 100644 --- a/tests/unit/test_indicator.py +++ b/gQuant/plugins/rapids_plugin/tests/unit/test_indicator.py @@ -24,10 +24,7 @@ import unittest import pathlib import cudf -import os -from gquant.dataframe_flow.task import load_modules -load_modules(os.getenv('MODULEPATH')+'/rapids_modules/') -import rapids_modules.cuindicator as gi +import gquant_rapids_plugin.cuindicator as gi from . 
import technical_indicators as ti from .utils import make_orderer, error_function import numpy as np diff --git a/tests/unit/test_indicator_node.py b/gQuant/plugins/rapids_plugin/tests/unit/test_indicator_node.py similarity index 97% rename from tests/unit/test_indicator_node.py rename to gQuant/plugins/rapids_plugin/tests/unit/test_indicator_node.py index 0a543d24..7f2bdc76 100644 --- a/tests/unit/test_indicator_node.py +++ b/gQuant/plugins/rapids_plugin/tests/unit/test_indicator_node.py @@ -22,11 +22,8 @@ import warnings import unittest import cudf -import os -from gquant.dataframe_flow.task import load_modules -load_modules(os.getenv('MODULEPATH')+'/rapids_modules/') -import rapids_modules.cuindicator as gi -from rapids_modules.transform.indicatorNode import IndicatorNode +import gquant_rapids_plugin.cuindicator as gi +from gquant_rapids_plugin.transform.indicatorNode import IndicatorNode from gquant.dataframe_flow.task import Task from .utils import make_orderer import numpy as np diff --git a/tests/unit/test_multi_assets_indicator.py b/gQuant/plugins/rapids_plugin/tests/unit/test_multi_assets_indicator.py similarity index 99% rename from tests/unit/test_multi_assets_indicator.py rename to gQuant/plugins/rapids_plugin/tests/unit/test_multi_assets_indicator.py index 5de85440..5330cbb2 100644 --- a/tests/unit/test_multi_assets_indicator.py +++ b/gQuant/plugins/rapids_plugin/tests/unit/test_multi_assets_indicator.py @@ -23,12 +23,9 @@ import unittest import cudf from .utils import make_orderer, error_function -import os -from gquant.dataframe_flow.task import load_modules -load_modules(os.getenv('MODULEPATH')+'/rapids_modules/') -import rapids_modules.cuindicator as gi +import gquant_rapids_plugin.cuindicator as gi from . 
import technical_indicators as ti -from rapids_modules.cuindicator import PEwm +from gquant_rapids_plugin.cuindicator import PEwm import numpy as np import warnings diff --git a/tests/unit/test_nodes.py b/gQuant/plugins/rapids_plugin/tests/unit/test_nodes.py similarity index 98% rename from tests/unit/test_nodes.py rename to gQuant/plugins/rapids_plugin/tests/unit/test_nodes.py index ebea19ec..cc7d8f4f 100644 --- a/tests/unit/test_nodes.py +++ b/gQuant/plugins/rapids_plugin/tests/unit/test_nodes.py @@ -22,12 +22,9 @@ import warnings import unittest import cudf -import os -from gquant.dataframe_flow.task import load_modules -load_modules(os.getenv('MODULEPATH')+'/rapids_modules/') -from rapids_modules.transform.returnFeatureNode import ReturnFeatureNode -from rapids_modules.transform.indicatorNode import IndicatorNode -from rapids_modules.transform.assetIndicatorNode import AssetIndicatorNode +from gquant_rapids_plugin.transform import ReturnFeatureNode +from gquant_rapids_plugin.transform import IndicatorNode +from gquant_rapids_plugin.transform import AssetIndicatorNode from gquant.dataframe_flow.task import Task from .utils import make_orderer, error_function_index import numpy as np @@ -163,9 +160,8 @@ def test_asset_indicator(self): inN = AssetIndicatorNode(task) gt = self._cudf_data.to_pandas()['indicator'] - o = inN.process({'stock_in': - self._cudf_data.drop('indicator')})['stock_out'] + self._cudf_data.drop('indicator', axis=1)})['stock_out'] err, index_err = error_function_index(o['indicator'], gt) msg = "bad error %f\n" % (err,) diff --git a/tests/unit/test_rolling.py b/gQuant/plugins/rapids_plugin/tests/unit/test_rolling.py similarity index 95% rename from tests/unit/test_rolling.py rename to gQuant/plugins/rapids_plugin/tests/unit/test_rolling.py index 4f659ef0..535985b4 100644 --- a/tests/unit/test_rolling.py +++ b/gQuant/plugins/rapids_plugin/tests/unit/test_rolling.py @@ -22,10 +22,7 @@ import pandas as pd import unittest import cudf -import os -from 
gquant.dataframe_flow.task import load_modules -load_modules(os.getenv('MODULEPATH')+'/rapids_modules/') -from rapids_modules.cuindicator import Rolling, Ewm +from gquant_rapids_plugin.cuindicator import Rolling, Ewm from .utils import make_orderer, error_function import numpy as np diff --git a/tests/unit/test_util.py b/gQuant/plugins/rapids_plugin/tests/unit/test_util.py similarity index 92% rename from tests/unit/test_util.py rename to gQuant/plugins/rapids_plugin/tests/unit/test_util.py index 39403b85..bada0c72 100644 --- a/tests/unit/test_util.py +++ b/gQuant/plugins/rapids_plugin/tests/unit/test_util.py @@ -22,10 +22,7 @@ import pandas as pd import unittest import cudf -from gquant.dataframe_flow.task import load_modules -import os -load_modules(os.getenv('MODULEPATH')+'/rapids_modules/') -from rapids_modules.cuindicator import shift, diff +from gquant_rapids_plugin.cuindicator import shift, diff import numpy as np from .utils import make_orderer, error_function diff --git a/tests/unit/testdata.csv.gz b/gQuant/plugins/rapids_plugin/tests/unit/testdata.csv.gz similarity index 100% rename from tests/unit/testdata.csv.gz rename to gQuant/plugins/rapids_plugin/tests/unit/testdata.csv.gz diff --git a/gQuant/plugins/rapids_plugin/tests/unit/utils.py b/gQuant/plugins/rapids_plugin/tests/unit/utils.py new file mode 100644 index 00000000..be3a71d0 --- /dev/null +++ b/gQuant/plugins/rapids_plugin/tests/unit/utils.py @@ -0,0 +1,61 @@ +import numpy as np + + +def make_orderer(): + """Keep tests in order""" + order = {} + + def ordered(f): + order[f.__name__] = len(order) + return f + + def compare(a, b): + return [1, -1][order[a] < order[b]] + + return ordered, compare + + +def error_function(gpu_series, result_series): + """ + utility function to compare GPU array vs CPU array + Parameters + ------ + gpu_series: cudf.Series + GPU computation result series + result_series: pandas.Series + Pandas computation result series + + Returns + ----- + double + maximum error of 
the two arrays + """ + gpu_arr = gpu_series.to_array(fillna='pandas') + pan_arr = result_series.values + gpu_arr = gpu_arr[~np.isnan(gpu_arr) & ~np.isinf(gpu_arr)] + pan_arr = pan_arr[~np.isnan(pan_arr) & ~np.isinf(pan_arr)] + err = np.abs(gpu_arr - pan_arr).max() + return err + + +def error_function_index(gpu_series, result_series): + """ + utility function to compare GPU array vs CPU array + Parameters + ------ + gpu_series: cudf.Series + GPU computation result series + result_series: pandas.Series + Pandas computation result series + + Returns + ----- + double + maximum error of the two arrays + int + maximum index value diff + """ + err = error_function(gpu_series, result_series) + error_index = np.abs(gpu_series.index.to_array() - + result_series.index.values).max() + return err, error_index diff --git a/external/README.md b/gQuant/plugins/simple_example/README.md similarity index 56% rename from external/README.md rename to gQuant/plugins/simple_example/README.md index 6846d1a5..c4e7c791 100644 --- a/external/README.md +++ b/gQuant/plugins/simple_example/README.md @@ -7,34 +7,28 @@ This is a simple example to show how to write an external gQuant plugin. gQuant conda create -n test python=3.8 ``` -### Install the gQuant lib -To install the gQuant graph computation library, first install the dependence libraries: -```bash -pip install dask[dataframe] distributed networkx -conda install python-graphviz ruamel.yaml numpy pandas -``` -Then install gquant lib: +### Install the gQuant +To install the gQuant graph computation library, run: ```bash pip install gquant ``` - -### Install the gQuantlab plugin -To install JupyterLab plugin, install the following dependence libraries: +Or install `gquant` at the gquant directory: ```bash -conda install nodejs ipywidgets +pip install . ``` -Then install the gquantlab lib: + +### Install the gquantlab JupyterLab plugin +To install `gquantlab` JupyterLab plugin, make sure `nodejs` of version [12^14^15] is installed. 
E.g: ```bash -pip install gquantlab==0.1.2 +conda install -c conda-forge nodejs=12.4.0 ``` -Build the ipywidgets Jupyterlab plugin +Then install the `gquantlab`: ```bash -jupyter labextension install @jupyter-widgets/jupyterlab-manager@2.0 +pip install gquantlab ``` -If you launch the JupyterLab, it will prompt to build the new plugin. You can -explicitly build it by: +Or install `gquantlab` at the gquantlab directory: ```bash -jupyter lab build +pip install . ``` ### Install the external example plugin diff --git a/external/example/__init__.py b/gQuant/plugins/simple_example/example/__init__.py similarity index 100% rename from external/example/__init__.py rename to gQuant/plugins/simple_example/example/__init__.py diff --git a/external/example/client.py b/gQuant/plugins/simple_example/example/client.py similarity index 100% rename from external/example/client.py rename to gQuant/plugins/simple_example/example/client.py diff --git a/external/example/distanceNode.py b/gQuant/plugins/simple_example/example/distanceNode.py similarity index 100% rename from external/example/distanceNode.py rename to gQuant/plugins/simple_example/example/distanceNode.py diff --git a/external/example/pointNode.py b/gQuant/plugins/simple_example/example/pointNode.py similarity index 100% rename from external/example/pointNode.py rename to gQuant/plugins/simple_example/example/pointNode.py diff --git a/gQuant/plugins/simple_example/notebooks/plugin_example.ipynb b/gQuant/plugins/simple_example/notebooks/plugin_example.ipynb new file mode 100644 index 00000000..0b08f3ee --- /dev/null +++ b/gQuant/plugins/simple_example/notebooks/plugin_example.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "28b4de15b5114ea4929c4a3d716eb194", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GQuantWidget(sub=HBox(), 
value=[OrderedDict([('id', 'points'), ('type', 'PointNode'), ('conf', {'npts': 20}), …" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import json\n", + "from gquant.dataframe_flow import TaskGraph\n", + "taskGraph=TaskGraph.load_taskgraph('./simple_plugin.gq.yaml')\n", + "taskGraph.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bc10f41548f7414585a3b5454c4ce8c2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tab(children=(Output(), Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_type': 'st…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "taskGraph.run(formated=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/gQuant/plugins/simple_example/notebooks/simple_plugin.gq.yaml b/gQuant/plugins/simple_example/notebooks/simple_plugin.gq.yaml new file mode 100644 index 00000000..6be153f2 --- /dev/null +++ b/gQuant/plugins/simple_example/notebooks/simple_plugin.gq.yaml @@ -0,0 +1,18 @@ +- id: points + type: PointNode + conf: + npts: 20 + inputs: {} + module: custom_nodes +- id: compute_distance + type: DistanceNode + conf: {} + inputs: + points_df_in: points.points_df_out + module: custom_nodes +- id: '' + type: Output_Collector + conf: {} + 
inputs: + in1: compute_distance.distance_df + in2: compute_distance.distance_abs_df diff --git a/external/setup.py b/gQuant/plugins/simple_example/setup.py similarity index 100% rename from external/setup.py rename to gQuant/plugins/simple_example/setup.py diff --git a/tutorial.gif b/gQuant/tutorial.gif similarity index 100% rename from tutorial.gif rename to gQuant/tutorial.gif diff --git a/util/auto_gen.py b/gQuant/util/auto_gen.py similarity index 100% rename from util/auto_gen.py rename to gQuant/util/auto_gen.py diff --git a/util/print_env.sh b/gQuant/util/print_env.sh similarity index 100% rename from util/print_env.sh rename to gQuant/util/print_env.sh diff --git a/util/start.sh b/gQuant/util/start.sh similarity index 100% rename from util/start.sh rename to gQuant/util/start.sh diff --git a/util/stop.sh b/gQuant/util/stop.sh similarity index 100% rename from util/stop.sh rename to gQuant/util/stop.sh diff --git a/workspace.code-workspace b/gQuant/workspace.code-workspace similarity index 93% rename from workspace.code-workspace rename to gQuant/workspace.code-workspace index 41b2e86d..21e186db 100644 --- a/workspace.code-workspace +++ b/gQuant/workspace.code-workspace @@ -6,7 +6,7 @@ ], "settings": { "terminal.integrated.shell.linux": "/bin/bash", - "python.pythonPath": "${env:HOME}/miniconda3/bin/python", + "python.pythonPath": "/home/yi/miniconda3/envs/gdax/bin/python", "python.linting.enabled": true, "python.linting.pylintEnabled": true, "python.formatting.autopep8Path": "${env:HOME}/miniconda3/bin/autopep8", diff --git a/gquantlab/gquantlab/__init__.py b/gquantlab/gquantlab/__init__.py deleted file mode 100644 index 13a3f2b8..00000000 --- a/gquantlab/gquantlab/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from ._version import __version__ -from .handlers import setup_handlers - - -def _jupyter_server_extension_paths(): - return [{ - "module": "gquantlab" - }] - - -def load_jupyter_server_extension(lab_app): - """Registers the API handler to receive HTTP 
requests from the frontend extension. - - Parameters - ---------- - lab_app: jupyterlab.labapp.LabApp - JupyterLab application instance - """ - setup_handlers(lab_app.web_app) - lab_app.log.info("Registered gQuantLab extension at URL path /gquantlab") diff --git a/gquantlab/gquantlab/_version.py b/gquantlab/gquantlab/_version.py deleted file mode 100644 index 38852bdb..00000000 --- a/gquantlab/gquantlab/_version.py +++ /dev/null @@ -1,2 +0,0 @@ -version_info = (0, 1, 2) -__version__ = ".".join(map(str, version_info)) diff --git a/gquantlab/jupyter-config/gquantlab.json b/gquantlab/jupyter-config/gquantlab.json deleted file mode 100644 index 3aa2c1cc..00000000 --- a/gquantlab/jupyter-config/gquantlab.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "NotebookApp": { - "nbserver_extensions": { - "gquantlab": true - } - } -} diff --git a/gquantlab/pyproject.toml b/gquantlab/pyproject.toml deleted file mode 100644 index 010df900..00000000 --- a/gquantlab/pyproject.toml +++ /dev/null @@ -1,3 +0,0 @@ -[build-system] -requires = ["jupyter_packaging~=0.4.0", "jupyterlab~=2.0", "setuptools>=40.8.0", "wheel"] -build-backend = "setuptools.build_meta" diff --git a/gquantrc b/gquantrc deleted file mode 100644 index d30087b3..00000000 --- a/gquantrc +++ /dev/null @@ -1,4 +0,0 @@ -[ModuleFiles] -nemo_modules= %(MODULEPATH)s/nemo_gquant_modules -rapids_modules= %(MODULEPATH)s/rapids_modules -my_node= %(MODULEPATH)s/my_node.py diff --git a/notebooks/01_tutorial.ipynb b/notebooks/01_tutorial.ipynb deleted file mode 100644 index 3e25e25d..00000000 --- a/notebooks/01_tutorial.ipynb +++ /dev/null @@ -1,1413 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Introduction to gQuant\n", - "\n", - "**gQuant** is a set of open-source examples for Quantitative Analysis tasks:\n", - "- Data preparation & feat. 
engineering\n", - "- Alpha seeking modeling\n", - "- Technical indicators\n", - "- Backtesting\n", - "\n", - "It is GPU-accelerated by leveraging [**RAPIDS.ai**](https://rapids.ai) technology, and has Multi-GPU and Multi-Node support.\n", - "\n", - "gQuant computing components are oriented around its plugins and task graph.\n", - "\n", - "## Download example datasets\n", - "\n", - "Before getting started, let's download the example datasets if not present." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset is already present. No need to re-download it.\n" - ] - } - ], - "source": [ - "! ((test ! -f './data/stock_price_hist.csv.gz' || test ! -f './data/security_master.csv.gz') && \\\n", - " cd .. && bash download_data.sh) || echo \"Dataset is already present. No need to re-download it.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## About this notebook\n", - "\n", - "In this tutorial, we are going to use gQuant to do a simple quant job. The job tasks are listed below:\n", - " 1. load csv stock data.\n", - " 2. filter out the stocks that has average volume smaller than 50.\n", - " 3. sort the stock symbols and datetime.\n", - " 4. add rate of return as a feature into the table.\n", - " 5. in two branches, computethe mean volume and mean return.\n", - " 6. read the file containing the stock symbol names, and join the computed dataframes.\n", - " 7. output the result in csv files.\n", - " \n", - "## TaskGraph playground\n", - "\n", - "Run the following gquant code to start a empty TaskGraph where computation graph can be created. You can follow the steps as listed below." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "26579d22aa1743bda97aa88f04e8dc36", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox())" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import sys; sys.path.insert(0, '..')\n", - "from gquant.dataframe_flow import TaskGraph\n", - "task_graph = TaskGraph()\n", - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step by Step to build your first task graph\n", - "\n", - "### Create Task node to load the included stock csv file \n", - "\n", - "\n", - "### Explore the data and visualize it\n", - "\n", - "\n", - "### Clean up the Task nodes for next steps\n", - "\n", - "\n", - "### Filter the data and compute the rate of return feature\n", - "\n", - "\n", - "### Save current TaskGraph for a composite Task node\n", - "\n", - "\n", - "### Clean up the redudant feature computation Task nodes\n", - "\n", - "\n", - "### Compute the averge volume and returns \n", - "\n", - "\n", - "### Dump the dataframe to csv files\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Just in case you cannnot follow along, here you can load the tutorial taskgraph from the file. First one is the graph to calculate the return feature. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0c3b7242f4124ac58758b09a3079e273", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/get_return_feature.gq.yaml')\n", - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load the full graph and click on the `run` button to see the result" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6f710007cebc41ce8cb2f91dd4237b77", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/tutorial_intro.gq.yaml')\n", - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## About Task graphs, nodes and plugins\n", - "\n", - "Quant processing operators are defined as nodes that operates on **cuDF**/**dask_cuDF** dataframes.\n", - "\n", - "A **task graph** is a list of tasks composed of gQuant nodes.\n", - "\n", - "The cell below contains the task graph described before." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings; warnings.simplefilter(\"ignore\")\n", - "csv_average_return = 'average_return.csv'\n", - "csv_average_volume = 'average_volume.csv'\n", - "csv_file_path = './data/stock_price_hist.csv.gz'\n", - "csv_name_file_path = './data/security_master.csv.gz'\n", - "from gquant.dataframe_flow import TaskSpecSchema \n", - "\n", - "# load csv stock data\n", - "task_csvdata = {\n", - " TaskSpecSchema.task_id: 'stock_data',\n", - " TaskSpecSchema.node_type: 'CsvStockLoader',\n", - " TaskSpecSchema.conf: {'file': csv_file_path},\n", - " TaskSpecSchema.inputs: {},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}\n", - "\n", - "# filter out the stocks that has average volume smaller than 50\n", - "task_minVolume = {\n", - " TaskSpecSchema.task_id: 'volume_filter',\n", - " TaskSpecSchema.node_type: 'ValueFilterNode',\n", - " TaskSpecSchema.conf: [{'min': 50.0, 'column': 'volume'}],\n", - " TaskSpecSchema.inputs: {'in': 'stock_data.cudf_out'},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}\n", - "\n", - "# sort the stock symbols and datetime\n", - "task_sort = {\n", - " TaskSpecSchema.task_id: 'sort_node',\n", - " TaskSpecSchema.node_type: 'SortNode',\n", - " TaskSpecSchema.conf: {'keys': ['asset', 'datetime']},\n", - " TaskSpecSchema.inputs: {'in': 'volume_filter.out'},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}\n", - "\n", - "# add rate of return as a feature into the table\n", - "task_addReturn = {\n", - " TaskSpecSchema.task_id: 'add_return_feature',\n", - " TaskSpecSchema.node_type: 'ReturnFeatureNode',\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.inputs: {'stock_in': 'sort_node.out'},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}\n", - "\n", - "# read the stock symbol name file and join the computed dataframes\n", - "task_stockSymbol = {\n", - " TaskSpecSchema.task_id: 'stock_name',\n", - " 
TaskSpecSchema.node_type: 'StockNameLoader',\n", - " TaskSpecSchema.conf: {'file': csv_name_file_path },\n", - " TaskSpecSchema.inputs: {},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}\n", - "\n", - "# In two branches, compute the mean volume and mean return seperately\n", - "task_volumeMean = {\n", - " TaskSpecSchema.task_id: 'average_volume',\n", - " TaskSpecSchema.node_type: 'AverageNode',\n", - " TaskSpecSchema.conf: {'column': 'volume'},\n", - " TaskSpecSchema.inputs: {'stock_in': 'add_return_feature.stock_out'},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}\n", - "\n", - "task_returnMean = {\n", - " TaskSpecSchema.task_id: 'average_return',\n", - " TaskSpecSchema.node_type: 'AverageNode',\n", - " TaskSpecSchema.conf: {'column': 'returns'},\n", - " TaskSpecSchema.inputs: {'stock_in': 'add_return_feature.stock_out'},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}\n", - "\n", - "task_leftMerge1 = {\n", - " TaskSpecSchema.task_id: 'left_merge1',\n", - " TaskSpecSchema.node_type: 'LeftMergeNode',\n", - " TaskSpecSchema.conf: {'column': 'asset'},\n", - " TaskSpecSchema.inputs: {'left': 'average_return.stock_out', \n", - " 'right': 'stock_name.stock_name'},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}\n", - "\n", - "task_leftMerge2 = {\n", - " TaskSpecSchema.task_id: 'left_merge2',\n", - " TaskSpecSchema.node_type: 'LeftMergeNode',\n", - " TaskSpecSchema.conf: {'column': 'asset'},\n", - " TaskSpecSchema.inputs: {'left': 'average_volume.stock_out', \n", - " 'right': 'stock_name.stock_name'},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}\n", - "\n", - "# output the result in csv files\n", - "\n", - "task_outputCsv1 = {\n", - " TaskSpecSchema.task_id: 'output_csv1',\n", - " TaskSpecSchema.node_type: 'OutCsvNode',\n", - " TaskSpecSchema.conf: {'path': csv_average_return},\n", - " TaskSpecSchema.inputs: {'df_in': 'left_merge1.merged'},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}\n", - "\n", 
- "task_outputCsv2 = {\n", - " TaskSpecSchema.task_id: 'output_csv2',\n", - " TaskSpecSchema.node_type: 'OutCsvNode',\n", - " TaskSpecSchema.conf: {'path': csv_average_volume },\n", - " TaskSpecSchema.inputs: {'df_in': 'left_merge2.merged'},\n", - " TaskSpecSchema.module: \"rapids_modules\"\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In Python, a gQuant task-spec is defined as a dictionary with the following fields:\n", - "- `id`\n", - "- `type`\n", - "- `conf`\n", - "- `inputs`\n", - "- `filepath`\n", - "- `module`\n", - "\n", - "As a best practice, we recommend using the `TaskSpecSchema` class for these fields, instead of strings.\n", - "\n", - "The `id` for a given task must be unique within a task graph. To use the result(s) of other task(s) as input(s) of a different task, we use the id(s) of the former task(s) in the `inputs` field of the next task.\n", - "\n", - "The `type` field contains the node type to use for the compute task. gQuant includes a collection of node classes. These can be found in `gquant.plugin_nodes`. Click [here](#node_class_example) to see a gQuant node class example.\n", - "\n", - "The `conf` field is used to parameterise a task. It lets you access user-set parameters within a plugin (such as `self.conf['min']` in the example above). Each node defines the `conf` json schema. The gQuant UI can use this schema to generate the proper form UI for the inputs. It is recommended to use the UI to configure the `conf`. \n", - "\n", - "The `filepath` field is used to specify a python module where a custom plugin is defined. It is optional if the plugin is in `plugin_nodes` directory, and mandatory when the plugin is somewhere else. In a different tutorial, we will learn how to create custom plugins.\n", - "\n", - "The `module` is optional to tell gQuant the name of module that the node type is from. If it is not specified, gQuant will search for it among all the customized modules. 
\n", - "\n", - "A custom node schema will look something like this:\n", - "```\n", - "custom_task = {\n", - " TaskSpecSchema.task_id: 'custom_calc',\n", - " TaskSpecSchema.node_type: 'CustomNode',\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.inputs: ['some_other_node'],\n", - " TaskSpecSchema.filepath: 'custom_nodes.py'\n", - "}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below, we compose our task graph and visualize it as a graph." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from gquant.dataframe_flow import TaskGraph\n", - "\n", - "# list of nodes composing the task graph\n", - "task_list = [\n", - " task_csvdata, task_minVolume, task_sort, task_addReturn,\n", - " task_stockSymbol, task_volumeMean, task_returnMean,\n", - " task_leftMerge1, task_leftMerge2,\n", - " task_outputCsv1, task_outputCsv2]\n", - "\n", - "task_graph = TaskGraph(task_list)\n", - "task_graph.draw(show='ipynb')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can visualize the ports by setting `show_ports` to `True`" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task_graph.draw(show='ipynb', show_ports=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is recommended to visualize it with gQuant widget so you can interact with it by call `draw` without arguments" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": 
"5c0113e410a44bf0bb64512a6e334123", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(cache={'height': 410.7, 'width': 1369, 'nodes': [{'width': 140, 'id': 'stock_data', 'type': 'CsvS…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will use `save_taskgraph` method to save the task graph to a **yaml file**.\n", - "\n", - "That will allow us to re-use it in the future." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "task_graph_file_name = '01_tutorial_task_graph.gq.yaml'\n", - "\n", - "task_graph.save_taskgraph(task_graph_file_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is a snippet of the content in the resulting yaml file:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "- id: stock_data\n", - " type: CsvStockLoader\n", - " conf:\n", - " file: ./data/stock_price_hist.csv.gz\n", - " inputs: {}\n", - " module: rapids_modules\n", - "- id: volume_filter\n", - " type: ValueFilterNode\n", - " conf:\n", - " - column: volume\n", - " min: 50\n", - " inputs:\n", - " in: stock_data.cudf_out\n", - " module: rapids_modules\n", - "- id: sort_node\n", - " type: SortNode\n", - " conf:\n", - " keys:\n", - " - asset\n" - ] - } - ], - "source": [ - "%%bash -s \"$task_graph_file_name\"\n", - "head -n 19 $1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The yaml file describes the computation tasks. We can load it and visualize it as a graph." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "87834069146846799ab1b92dc21532c1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph = TaskGraph.load_taskgraph(task_graph_file_name)\n", - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building a task graph\n", - "\n", - "Running the task graph is the next logical step. Nevertheless, it can optionally be built before running it.\n", - "\n", - "By calling `build` method, the graph is traversed without running the dataframe computations. This could be useful to inspect the column names and types, validate that the plugins can be instantiated, and check for errors.\n", - "\n", - "The output of `build` are instances of each task in a dictionary.\n", - "\n", - "In the example below, we inspect the column names and types for the inputs and outputs of the `left_merge1` task:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Output of build task graph are instances of each task in a dictionary:\n", - "\n", - "stock_data: \n", - "volume_filter: \n", - "sort_node: \n", - "add_return_feature: \n", - "stock_name: \n", - "average_volume: \n", - "average_return: \n", - "left_merge1: \n", - "left_merge2: \n", - "output_csv1: \n", - "output_csv2: \n", - "\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "\n", - "task_graph.build()\n", - "\n", - "print('Output of build task graph are instances of each task in a dictionary:\\n')\n", - "print(str(task_graph))" - ] - }, - { - "cell_type": "code", - 
"execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "output meta in outgoing dataframe:\n", - "\n", - "MetaData(inports={'left': {}, 'right': {}}, outports={'merged': {'returns': 'float64', 'asset': 'int64', 'asset_name': 'object'}})\n" - ] - } - ], - "source": [ - "# output meta in 'left_merge_1' node\n", - "\n", - "print('output meta in outgoing dataframe:\\n')\n", - "pprint(task_graph['left_merge1'].meta_setup())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running a task graph\n", - "\n", - "To execute the graph computations, we will use the `run` method. If the `Output_Collector` task node is not added to the graph, a output list can be feeded to the run method. The result can be displayed in a rich mode if the `formated` argument is turned on.\n", - "\n", - "`run` can also takes an optional `replace` argument which is used and explained later on" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cdb4f783594b48ac9c090132f35ffdca", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Tab(children=(Output(), Output(), Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "outputs = ['stock_data.cudf_out', 'output_csv1.df_out', 'output_csv2.df_out']\n", - "task_graph.run(outputs=outputs, formated=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result can be used as a tuple or dictionary." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
assetvolumeasset_name
0869577154.528596CTT
1869584701.630560LPT
2869587119.878161HBP
3869589161.938559DSLV
4869590204.126667BPTH
............
499522876187.673706SIM
49962287979.648169GLQ
4997228801360.962269EXPR
499822884171.356747LTBR
499922888526.766194HIW
\n", - "

5000 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " asset volume asset_name\n", - "0 869577 154.528596 CTT\n", - "1 869584 701.630560 LPT\n", - "2 869587 119.878161 HBP\n", - "3 869589 161.938559 DSLV\n", - "4 869590 204.126667 BPTH\n", - "... ... ... ...\n", - "4995 22876 187.673706 SIM\n", - "4996 22879 79.648169 GLQ\n", - "4997 22880 1360.962269 EXPR\n", - "4998 22884 171.356747 LTBR\n", - "4999 22888 526.766194 HIW\n", - "\n", - "[5000 rows x 3 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result = task_graph.run(outputs=outputs)\n", - "csv_data_df, csv_1_df, csv_2_df = result\n", - "result['output_csv2.df_out']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can profile each of the computation node running time by turning on the profiler." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id:stock_data process time:3.999s\n", - "id:volume_filter process time:0.009s\n", - "id:sort_node process time:0.088s\n", - "id:add_return_feature process time:0.056s\n", - "id:average_volume process time:0.036s\n", - "id:average_return process time:0.035s\n", - "id:stock_name process time:0.012s\n", - "id:left_merge1 process time:0.002s\n", - "id:output_csv1 process time:0.028s\n", - "id:left_merge2 process time:0.002s\n", - "id:output_csv2 process time:0.025s\n" - ] - } - ], - "source": [ - "outputs = ['stock_data.cudf_out', 'output_csv1.df_out', 'output_csv2.df_out']\n", - "csv_data_df, csv_1_df, csv_2_df = task_graph.run(outputs=outputs, profile=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Where most of the time is spent on the csv file processing. This is because we have to convert the time string to the proper format via CPU. Let's inspect the content of `csv_1_df` and `csv_2_df`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "csv_1_df content:\n", - " asset returns asset_name\n", - "0 869577 -0.000295 CTT\n", - "1 869584 0.000387 LPT\n", - "2 869587 0.027713 HBP\n", - "3 869589 0.001337 DSLV\n", - "4 869590 0.009657 BPTH\n", - "... ... ... ...\n", - "4995 707588 -0.000049 LDP\n", - "4996 707611 -0.000967 LITB\n", - "4997 707619 -0.021890 LND\n", - "4998 707624 0.001069 LOCK\n", - "4999 707647 0.001011 LXFR\n", - "\n", - "[5000 rows x 3 columns]\n", - "\n", - "csv_2_df content:\n", - " asset volume asset_name\n", - "0 869577 154.528596 CTT\n", - "1 869584 701.630560 LPT\n", - "2 869587 119.878161 HBP\n", - "3 869589 161.938559 DSLV\n", - "4 869590 204.126667 BPTH\n", - "... ... ... ...\n", - "4995 707588 99.917736 LDP\n", - "4996 707611 393.918500 LITB\n", - "4997 707619 112.800000 LND\n", - "4998 707624 1256.701650 LOCK\n", - "4999 707647 116.476103 LXFR\n", - "\n", - "[5000 rows x 3 columns]\n" - ] - } - ], - "source": [ - "print('csv_1_df content:')\n", - "print(csv_1_df)\n", - "\n", - "print('\\ncsv_2_df content:')\n", - "print(csv_2_df) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Also, please notice that two resulting csv files has been created:\n", - "- average_return.csv\n", - "- average_volume.csv" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "csv files created:\n" - ] - } - ], - "source": [ - "print('\\ncsv files created:')\n", - "!find . -iname \"*symbol*\" " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Subgraphs\n", - "\n", - "A nice feature of task graphs is that we can evaluate any **subgraph**. 
For instance, if you are only interested in the `average volume` result, you can run only the tasks which are relevant for that computation.\n", - "\n", - "If we would not want to re-run tasks, we could also use the `replace` argument of the `run` function with a `load` option.\n", - "\n", - "The `replace` argument needs to be a dictionary where each key is the task/node id. The values are a replacement task-spec dictionary (i.e. each key is a spec overload, and its value is what to overload with).\n", - "\n", - "In the example below, instead of re-running the `stock_data` node to load a csv file into a `cudf` dataframe, we will use its dataframe output to load from it." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " asset volume\n", - "0 93 86.594737\n", - "1 165 67.020000\n", - "2 239 128.835141\n", - "3 281 68.450000\n", - "4 592 110.333333\n", - "... ... ...\n", - "4995 869589 161.938559\n", - "4996 869590 204.126667\n", - "4997 869592 101.423675\n", - "4998 869597 81.298788\n", - "4999 869599 83.425988\n", - "\n", - "[5000 rows x 2 columns]\n" - ] - } - ], - "source": [ - "replace = {\n", - " 'stock_data': {\n", - " 'load': {\n", - " 'cudf_out': csv_data_df\n", - " },\n", - " 'save': True\n", - " }\n", - "}\n", - "\n", - "(volume_mean_df, ) = task_graph.run(outputs=['average_volume.stock_out'],\n", - " replace=replace)\n", - "\n", - "print(volume_mean_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As a convenience, we can save on disk the checkpoints for any of the nodes, and re-load them if needed. It is only needed to set the save option to `True`. This step will take a while depends on the disk IO speed.\n", - "\n", - "In the example above, the `replace` spec directs `run` to save on disk for the `stock_data`. 
If `load` was boolean then the data would be loaded from disk presuming the data was saved to disk in a prior run.\n", - "\n", - "The default directory for saving is `/.cache/.hdf5`.\n", - "\n", - "`replace` is also used to override parameters in the tasks. For instance, if we wanted to use the value `40.0` instead `50.0` in the task `volume_filter`, we would do something similar to:\n", - "```\n", - "replace_spec = {\n", - " 'volume_filter': {\n", - " 'conf': {\n", - " 'min': 40.0\n", - " }\n", - " },\n", - " 'some_task': etc...\n", - "}\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Return mean Dataframe:\n", - "\n", - " asset returns\n", - "0 93 0.241380\n", - "1 165 0.000700\n", - "2 239 0.010021\n", - "3 281 -0.088465\n", - "4 592 0.619716\n", - "... ... ...\n", - "4995 869589 0.001337\n", - "4996 869590 0.009657\n", - "4997 869592 0.001202\n", - "4998 869597 -0.003332\n", - "4999 869599 0.003291\n", - "\n", - "[5000 rows x 2 columns]\n" - ] - } - ], - "source": [ - "replace = {'stock_data': {'load': True},\n", - " 'average_return': {'save': True}}\n", - "\n", - "\n", - "(return_mean_df, ) = task_graph.run(outputs=['average_return.stock_out'], replace=replace)\n", - "\n", - "print('Return mean Dataframe:\\n')\n", - "print(return_mean_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we might want to load the `return_mean_df` from the saved file and evaluate only tasks that we are interested in.\n", - "\n", - "In the cells below, we compare different load approaches:\n", - "- in-memory,\n", - "- from disk, \n", - "- and not loading at all.\n", - "\n", - "When working interactively, or in situations requiring iterative and explorative task graphs, a significant amount of time is saved by just re-loading the data that do not require to be recalculated." 
- ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using in-memory dataframes for load:\n", - "CPU times: user 260 ms, sys: 67.6 ms, total: 327 ms\n", - "Wall time: 322 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "print('Using in-memory dataframes for load:')\n", - "\n", - "replace = {'stock_data': {'load': {\n", - " 'cudf_out': csv_data_df\n", - " }},\n", - " 'average return': {'load': \n", - " {'stock_out': return_mean_df}}\n", - " }\n", - "\n", - "_ = task_graph.run(outputs=['output_csv2.df_out'], replace=replace)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using cached dataframes on disk for load:\n", - "CPU times: user 2.93 s, sys: 857 ms, total: 3.79 s\n", - "Wall time: 3.77 s\n" - ] - } - ], - "source": [ - "%%time\n", - "print('Using cached dataframes on disk for load:')\n", - "\n", - "replace = {'stock_data': {'load': True},\n", - " 'average return': {'load': True}}\n", - "\n", - "_ = task_graph.run(outputs=['output_csv2.df_out'], replace=replace)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Re-running dataframes calculations instead of using load:\n", - "CPU times: user 2.92 s, sys: 885 ms, total: 3.8 s\n", - "Wall time: 3.78 s\n" - ] - } - ], - "source": [ - "%%time\n", - "print('Re-running dataframes calculations instead of using load:')\n", - "\n", - "replace = {'stock_data': {'load': True}}\n", - "\n", - "_ = task_graph.run(outputs=['output_csv2.df_out'], replace=replace)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "An idiomatic way to save data, if not on disk, or load data, if present on disk, is demonstrated below." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 2.96 s, sys: 837 ms, total: 3.8 s\n", - "Wall time: 3.78 s\n" - ] - } - ], - "source": [ - "%%time\n", - "import os\n", - "\n", - "loadsave_csv_data = 'load' if os.path.isfile('./.cache/stock_data.hdf5') else 'save'\n", - "loadsave_return_mean = 'load' if os.path.isfile('./.cache/average_return.hdf5') else 'save'\n", - "\n", - "replace = {'stock_data': {loadsave_csv_data: True},\n", - " 'average_return': {loadsave_return_mean: True}}\n", - "\n", - "_ = task_graph.run(outputs=['output_csv2.df_out'], replace=replace)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Delete temporary files\n", - "\n", - "A few cells above, we generated a .yaml file containing the example task graph, and also a couple of CSV files.\n", - "\n", - "Let's keep our directory clean, and delete them." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "%%bash -s \"$task_graph_file_name\" \"$csv_average_return\" \"$csv_average_volume\" \n", - "rm -f $1 $2 $3" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "---\n", - "\n", - "## Node class example\n", - "\n", - "Implementing custom nodes in gQuant is very straighforward.\n", - "\n", - "Data scientists only need to override five methods in the parent class `Node`:\n", - "- `init`\n", - "- `meta_setup`\n", - "- `ports_setup`\n", - "- `conf_schema`\n", - "- `process`\n", - "\n", - "`init` method is usually used to define the required column names\n", - "\n", - "`ports_setup` defines the input and output ports for the node\n", - "\n", - "`meta_setup` method is used to calculate the output meta name and types.\n", - "\n", - "`conf_schema` method is used to define the JSON schema for the node conf so the client can generate the proper UI for 
it.\n", - "\n", - "`process` method takes input dataframes and computes the output dataframe. \n", - "\n", - "In this way, dataframes are strongly typed, and errors can be detected early before the time-consuming computation happens.\n", - "\n", - "Below, it can be observed `ValueFilterNode` implementation details:" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "class ValueFilterNode(Node, _PortTypesMixin):\n", - "\n", - " def init(self):\n", - " _PortTypesMixin.init(self)\n", - "\n", - " def meta_setup(self):\n", - " cols_required = {\"asset\": \"int64\"}\n", - " return _PortTypesMixin.meta_setup(self, required=cols_required)\n", - "\n", - " def ports_setup(self):\n", - " return _PortTypesMixin.ports_setup(self)\n", - "\n", - " def conf_schema(self):\n", - " json = {\n", - " \"title\": \"Value Filter Node configure\",\n", - " \"type\": \"array\",\n", - " \"description\": \"\"\"Filter the dataframe based on a list of\n", - " min/max values.\"\"\",\n", - " \"items\": {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"column\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"dataframe column to be filered on\"\n", - " },\n", - " \"min\": {\n", - " \"type\": \"number\",\n", - " \"description\": \"min value, inclusive\"\n", - " },\n", - " \"max\": {\n", - " \"type\": \"number\",\n", - " \"description\": \"max value, inclusive\"\n", - " }\n", - " }\n", - " }\n", - " }\n", - " ui = {}\n", - " input_meta = self.get_input_meta()\n", - " if self.INPUT_PORT_NAME in input_meta:\n", - " col_from_inport = input_meta[self.INPUT_PORT_NAME]\n", - " enums = [col for col in col_from_inport.keys()]\n", - " json['items']['properties']['column']['enum'] = enums\n", - " return ConfSchema(json=json, ui=ui)\n", - " else:\n", - " return ConfSchema(json=json, ui=ui)\n", - "\n", - " def process(self, inputs):\n", - " \"\"\"\n", - " filter the dataframe 
based on a list of min/max values. The node's\n", - " conf is a list of column criteria. It defines the column name in\n", - " 'column`, the min value in `min` and the max value in `max`.\n", - "\n", - " Arguments\n", - " -------\n", - " inputs: list\n", - " list of input dataframes.\n", - " Returns\n", - " -------\n", - " dataframe\n", - " \"\"\"\n", - "\n", - " input_df = inputs[self.INPUT_PORT_NAME]\n", - " str_list = []\n", - " for column_item in self.conf:\n", - " column_name = column_item['column']\n", - " if 'min' in column_item:\n", - " minValue = column_item['min']\n", - " str_item = '%s >= %f' % (column_name, minValue)\n", - " str_list.append(str_item)\n", - " if 'max' in column_item:\n", - " maxValue = column_item['max']\n", - " str_item = '%s <= %f' % (column_name, maxValue)\n", - " str_list.append(str_item)\n", - " input_df = input_df.query(\" and \".join(str_list))\n", - " return {self.OUTPUT_PORT_NAME: input_df}\n", - "\n" - ] - } - ], - "source": [ - "import inspect\n", - "from rapids_modules import ValueFilterNode\n", - "\n", - "print(inspect.getsource(ValueFilterNode))" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'ok', 'restart': True}" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import IPython\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/02_single_stock_trade.ipynb b/notebooks/02_single_stock_trade.ipynb deleted file mode 100644 index a50d338f..00000000 --- a/notebooks/02_single_stock_trade.ipynb +++ /dev/null @@ -1,386 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### gQuant Tutorial\n", - "First import all the necessary modules." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys; sys.path.insert(0, '..')\n", - "import os\n", - "import warnings\n", - "import ipywidgets as widgets\n", - "from gquant.dataframe_flow import TaskGraph\n", - "\n", - "warnings.simplefilter(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we are going to use gQuant to do a simple quant job. 
The task is fully described in a yaml file" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "- id: stock_data\n", - " type: CsvStockLoader\n", - " conf:\n", - " file: notebooks/data/stock_price_hist.csv.gz\n", - " inputs: {}\n", - " module: rapids_modules\n", - "- id: stock_name\n", - " type: StockNameLoader\n", - " conf:\n", - " file: notebooks/data/security_master.csv.gz\n", - " inputs: {}\n", - " module: rapids_modules\n", - "- id: stock_selector\n", - " type: AssetFilterNode\n", - " conf:\n", - " asset: 4330\n", - " inputs:\n", - " name_map: stock_name.map_data\n", - " stock_in: stock_data.cudf_out\n", - " module: rapids_modules\n", - "- id: \"\"\n", - " type: Output_Collector\n", - " conf: {}\n", - " inputs:\n", - " in1: stock_selector.stock_name\n", - " in2: lineplot.lineplot\n", - " in3: barplot.barplot\n", - " in4: sharpe_ratio.sharpe_out\n", - " in5: cumulative_return.cum_return\n", - " in6: stock_data.cudf_out\n", - " module: rapids_modules\n" - ] - } - ], - "source": [ - "!head -n 31 ../taskgraphs/simple_trade.gq.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The yaml file is describing the computation task by a graph, we can visualize it" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/simple_trade.gq.yaml')\n", - "task_graph.draw(show='ipynb', show_ports=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "689b7c1a98f84f668fcb815cc686c510", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b69d7d9beec64973a1cb880adfe96792", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(layout=Layout(border='1px sol…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.run(formated=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define a method to organize the output images" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_figures(result):\n", - " # format the figures\n", - " figure_width = '1200px'\n", - " figure_height = '400px'\n", - " bar_figure = result['barplot.barplot']\n", - " sharpe_number = result['sharpe_ratio.sharpe_out']\n", - " cum_return = result['cumulative_return.cum_return']\n", - " signals = result['lineplot.lineplot']\n", - " symbol = result['stock_selector.stock_name']\n", - "\n", - " bar_figure.layout.height = figure_height\n", - " bar_figure.layout.width = figure_width\n", - " cum_return.layout.height = figure_height\n", - " cum_return.layout.width = figure_width\n", - " cum_return.title = 'P & L %.3f' % (sharpe_number)\n", - " bar_figure.marks[0].labels = [symbol]\n", - " cum_return.marks[0].labels = [symbol]\n", - " signals.layout.height = figure_height\n", - " signals.layout.width = figure_width\n", - " bar_figure.axes = [bar_figure.axes[1]]\n", - " cum_return.axes = [cum_return.axes[0]]\n", - " output = widgets.VBox([bar_figure, 
cum_return, signals])\n", - "\n", - " return output" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Rerun the graph and send the computation result to the `plot_figures` method" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "202309b6665643338cf132f122f5b693", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Figure(axes=[Axis(label='Price', orientation='vertical', scale=LinearScale(max=38.13, min=-10.1…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "result = task_graph.run()\n", - "plot_figures(result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can change the TaskGraph node parameters interactively and hit the run button to get the updated result. It can also be done programmatically, e.g. change the mean reversion parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9efb864c81fd4aeabca1677513043b2f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Figure(axes=[Axis(label='Price', orientation='vertical', scale=LinearScale(max=38.13, min=-10.1…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "o = task_graph.run(\n", - " outputs=(list(result.get_keys())[0:]),\n", - " replace={'stock_data': {\"load\": {'cudf_out': result['stock_data.cudf_out']}},\n", - " 'mean_reversion': {'conf': {'fast': 1, 'slow': 10}}})\n", - "figure_combo = plot_figures(o)\n", - "figure_combo" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since computation is accelerated in the GPU, we can do hyper-parameter search interactively. Try changing the `slow` and `fast` parameters of the moving average and see 
if you can improve the result:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "59ff6c5fc87f41f2a0f768258c620958", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(HBox(children=(IntRangeSlider(value=(10, 30), continuous_update=False, description='MA:', max=6…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "para_selector = widgets.IntRangeSlider(value=[10, 30],\n", - " min=3,\n", - " max=60,\n", - " step=1,\n", - " description=\"MA:\",\n", - " disabled=False,\n", - " continuous_update=False,\n", - " orientation='horizontal',\n", - " readout=True)\n", - "\n", - "\n", - "def para_selection(*stocks):\n", - " with out:\n", - " print('run')\n", - " para1 = para_selector.value[0]\n", - " para2 = para_selector.value[1]\n", - " o = task_graph.run(\n", - " outputs=(list(result.get_keys())[0:]),\n", - " replace={'stock_data': {\"load\": {'cudf_out': result['stock_data.cudf_out']}},\n", - " 'mean_reversion': {'conf': {'fast': para1, 'slow': para2}}})\n", - " figure_combo = plot_figures(o)\n", - " if (len(w.children) < 2):\n", - " w.children = (w.children[0], figure_combo,)\n", - " else:\n", - " w.children[1].children[1].marks = figure_combo.children[1].marks\n", - " w.children[1].children[2].marks = figure_combo.children[2].marks\n", - " w.children[1].children[1].title = 'P & L %.3f' % (o['sharpe_ratio.sharpe_out'])\n", - "\n", - "\n", - "out = widgets.Output(layout={'border': '1px solid black'})\n", - "para_selector.observe(para_selection, 'value')\n", - "selectors = widgets.HBox([para_selector])\n", - "w = widgets.VBox([selectors])\n", - "w" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'ok', 'restart': True}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "import IPython\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/04_portfolio_trade.ipynb b/notebooks/04_portfolio_trade.ipynb deleted file mode 100644 index 2b561b50..00000000 --- a/notebooks/04_portfolio_trade.ipynb +++ /dev/null @@ -1,1154 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# gQuant - Making Quantitative Analysis Faster\n", - "\n", - "## Background\n", - "By definition, **Quantitative Finance** is the use of mathematical models and large datasets to analyze financial markets and securities, requiring massive computation to extract insight from the data. \n", - "\n", - "Many data science toolkits have been developed to help data scientists to manipulate the data. It starts with scalar number computations at the beginning. Later, the development of [Numpy](https://www.numpy.org) library helps to operate the numbers at vectors, and the popular [Pandas](https://pandas.pydata.org) library operates at a dataframe level. Manipulating data at a high level brings productivity gain for data scientists in quantitative finance.\n", - "\n", - "However, the amount of collected data is increasing exponentially over time. Also, more and more machine learning and statistical models are being developed. 
As a result, data scientists are facing new challenges that are hard to deal with using traditional data science libraries.\n", - "\n", - "It is very time-consuming for CPUs to crunch massive amounts of data and compute the complicated data science models. Large data sets require distributed computation, which is too complicated for data scientists to adopt.\n", - "\n", - "As a consequence, the quantitative workflow has become more complicated than ever. It integrates massive data from different sources, requiring multiple iterations to obtain significant results. \n", - "\n", - "**gQuant** has been developed to address all these challenges by organizing dataframes into graphs. It introduces the idea of **dataframe-flow**, which manipulates dataframes at graph level. An **acyclic directed graph** is defined, where the nodes are dataframe processors and the edges are the directions of passing resulting dataframes.\n", - "\n", - "With a graph approach, quant's workflow is described at a high level, letting quant analysts address the complicated workflow challenge.\n", - "\n", - "It is GPU-accelerated by leveraging [RAPIDS.ai](https://rapids.ai) technology and has **Multi-GPU and Multi-Node support**.\n", - "\n", - "We can get orders of magnitude performance boosts compared to CPU. gQuant dataframe-flow is **dataframe agnostic**, and can flow:\n", - "- Pandas dataframe, computed in the CPU.\n", - "- cuDF dataframe, computed in the GPU and producing the same result but much faster.\n", - "- dask_cuDF dataframe, with the computation automatically executed on multiple nodes and multiple GPUs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download example datasets\n", - "\n", - "Before getting started, let's download the example datasets if not present." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset is already present. 
No need to re-download it.\n" - ] - } - ], - "source": [ - "! ((test ! -f './data/stock_price_hist.csv.gz' || test ! -f './data/security_master.csv.gz') && \\\n", - " cd .. && bash download_data.sh) || echo \"Dataset is already present. No need to re-download it.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare for running in Dask environment\n", - "\n", - "Let's start the Dask local cluster environment for distributed computation.\n", - "\n", - "Dask provides a web-based dashboard to help to track progress, identify performance issues, and debug failures. To learn more about Dask dashboard, just follow this [link](https://distributed.dask.org/en/latest/web.html).\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

Client

\n", - "\n", - "
\n", - "

Cluster

\n", - "
    \n", - "
  • Workers: 2
  • \n", - "
  • Cores: 2
  • \n", - "
  • Memory: 100.00 GB
  • \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Start the Dask local cluster environment for distrubuted computation\n", - "from dask_cuda import LocalCUDACluster\n", - "from dask.distributed import Client\n", - "\n", - "cluster = LocalCUDACluster()\n", - "client = Client(cluster)\n", - "client\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Though our stock dataset is small enough to fit in a single 16G GPU, to show how to do distributed computation, we will split the dataframe into small pieces to be loaded by different workers in the cluster.\n", - "\n", - "Notice this step is need only if the dataset is not split in multiple files yet.\n", - "\n", - "First use this simple taskgraph to load data then sort it by the asset id and datatime:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "62a6b62b03944dd7803e2226e6e2a6f7", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import sys; sys.path.insert(0, '..')\n", - "from gquant.dataframe_flow import TaskGraph\n", - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/sort_stocks.gq.yaml')\n", - "input_cached, = task_graph.run()\n", - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "convert the sorted stock data into partitions and save it into csv files. 
Note, the data is split in a way that the same asset belongs to the same partition" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['/home/quant/gQuant/notebooks/many-small/0.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/1.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/2.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/3.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/4.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/5.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/6.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/7.csv']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import dask.dataframe as dd\n", - "import os\n", - "num_partitions = 8\n", - "\n", - "os.makedirs('many-small', exist_ok=True)\n", - "dd.from_pandas(input_cached.set_index('asset'), npartitions=num_partitions).reset_index().to_csv('many-small/*.csv', index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## The toy example\n", - "In this notebook, we will use a simple toy example to show how easy it is to accelerate the quant workflow in the GPU.\n", - "\n", - "To mimic the end-to-end quantitative analyst task, we are going to backtest a simple mean reversion trading strategy.\n", - "\n", - "The workflow can be divided into two steps. 
You can follow along with an empty gQuant widget to build the TaskGraph:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "209bf2c2fccd48b2b3808ec0cbc9304d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox())" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "task_graph = TaskGraph()\n", - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preprocess the dataset to remove bad points and add return feature\n", - "\n", - "\n", - "1. Load the 5000 end-of-day stocks CSV data into the dataframe and add rate of return feature to the dataframe.\n", - "\n", - "\n", - "2. Compute the average volume, min/max returns for each of the stocks\n", - "\n", - "\n", - "3. Merge the features into one dataframe, clean up the data by removing low volume stocks and extreme rate of returns stocks.\n", - "\n", - "\n", - "4. Create a composite node for this preprocess task\n", - "\n", - "\n", - "### Apply simple mean reversion algorithm and run backtest\n", - "\n", - "1. Clean up the nodes for the backtest\n", - "\n", - "\n", - "2. Compute the slow and fast exponential moving average and compute the trading signal based on it. Run backtesting and compute the returns from this strategy for each of the days and stock symbols. Run a simple portfolio optimization by averaging the stocks together for each of the trading days. Compute the sharpe ratio and cumulative return results.\n", - "\n", - "\n", - "3. Change the `slow`, `fast` parameters for the trading strategies and re-run the backtest\n", - "\n", - "\n", - "4. Switch to run the backtest in a distributed environment by Dask\n", - "\n", - "\n", - "5. 
As a reference, switch to run the backtest in a CPU environment by Pandas\n", - "\n", - "\n", - "The whole workflow is organized into a TaskGraph file, which is described in a **gq.yaml** file.\n", - "\n", - "The same taskgraphs are saved in the `taskgraphs` directory. The whole workflow can be organized into a computation graph, which is described in a **yaml** file. \n", - "\n", - "Here is a snippet of the yaml file:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "- id: stock_data\n", - " type: CsvStockLoader\n", - " conf:\n", - " file: notebooks/data/stock_price_hist.csv.gz\n", - " path: notebooks/many-small\n", - " inputs: {}\n", - " module: rapids_modules\n", - "- id: preprocess\n", - " type: CompositeNode\n", - " conf:\n", - " subnode_ids:\n", - " - value_filter\n", - " subnodes_conf:\n", - " value_filter:\n", - " conf:\n", - " - column: min_return\n", - " min: -10\n", - " - column: max_return\n", - "...\n" - ] - } - ], - "source": [ - "!head -n 18 ../taskgraphs/portfolio_trade.gq.yaml\n", - "print(\"...\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Load the preprocess TaskGraph with the `load_taskgraph` command" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/preprocess.gq.yaml')\n", - "task_graph.draw(show='ipynb')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It can be shown in the interactive gQuant widget. 
Try to run it and see the preprocess results:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "69aa5783a26d47248873e753fa0fc043", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load the whole TaskGraph with the `load_taskgraph` command" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/portfolio_trade.gq.yaml')\n", - "task_graph.draw(show='ipynb')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see the input/output ports of the nodes, turn the `show_ports` flag on" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task_graph.draw(show='ipynb', show_ports=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Similarly, it can be shown in the interactive gQuant widget. 
Try to run it, change the parameters and play with it as shown in the animation:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f194c058ea71469cbf3b824355841a87", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Running this toy example in a Dask distributed environment is super easy, as gQuant operates at dataframe level.\n", - "\n", - "We just need to swap cuDF dataframes to **dask_cuDF** dataframes. Try to connect the `preprocess` node to the `Dask dataframe` output port in the `stock data` node.\n", - "\n", - "Similarly, to see how fast the GPU acceleration is, we can switch to a CPU computation environment by connecting to the `Pandas dataframe` output port.\n", - "\n", - "## Benchmarks\n", - "\n", - "While running this notebook, we have obtained the following results:\n", - "\n", - "- 181.00 seconds to run in CPU (Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz).\n", - "- 9.06 seconds to run in GPU (NVIDIA v100).\n", - "\n", - "We get ~20x speed up by using GPU and GPU dataframes, compared to CPU and CPU dataframes.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## GQuant Task Node \n", - "\n", - "Each node is composed of:\n", - "- a unique id,\n", - "- a node type, \n", - "- configuration parameters\n", - "- from zero to many input nodes ids.\n", - "\n", - "gQuant's `load_taskgraph` takes this yaml file, and wires it into a graph.\n", - "\n", - "gQuant implementation includes some common nodes, useful for quantitative finance. 
With the help of [Numba](https://numba.pydata.org) library, we have implemented more than 30 technical indicators used in computing trading signals. All of them computed in the GPU.\n", - "\n", - "However, gQuant's goal is not to be comprehensive for quant applications. It provides a framework that is easy for anyone to implement his own nodes in the gQuant.\n", - "\n", - "\n", - "Data scientists only need to override five methods in the parent class `Node`:\n", - "- `init`\n", - "- `meta_setup`\n", - "- `ports_setup`\n", - "- `conf_schema`\n", - "- `process`\n", - "\n", - "`init` method is usually used to define the required column names\n", - "\n", - "`ports_setup` defines the input and output ports for the node\n", - "\n", - "`meta_setup` method is used to calculate the output meta name and types.\n", - "\n", - "`conf_schema` method is used to define the JSON schema for the node conf so the client can generate the proper UI for it.\n", - "\n", - "`process` method takes input dataframes and computes the output dataframe. \n", - "\n", - "In this way, dataframes are strongly typed, and errors can be detected early before the time-consuming computation happens.\n", - "\n", - "Here is the code example for implementing `MaxNode`, which is to compute the maximum value for a specified column in the dataframe." 
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "from gquant.dataframe_flow import Node\n", - "from rapids_modules._port_type_node import _PortTypesMixin\n", - "from gquant.dataframe_flow.portsSpecSchema import ConfSchema\n", - "\n", - "\n", - "class MaxNode(Node, _PortTypesMixin):\n", - "\n", - " def init(self):\n", - " _PortTypesMixin.init(self)\n", - " self.INPUT_PORT_NAME = 'in'\n", - " self.OUTPUT_PORT_NAME = 'out'\n", - "\n", - " def ports_setup(self):\n", - " return _PortTypesMixin.ports_setup(self)\n", - "\n", - " def conf_schema(self):\n", - " json = {\n", - " \"title\": \"Maximum Value Node configure\",\n", - " \"type\": \"object\",\n", - " \"description\": \"Compute the maximum value of the key column\",\n", - " \"properties\": {\n", - " \"column\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"column to calculate the maximum value\"\n", - " }\n", - " },\n", - " \"required\": [\"column\"],\n", - " }\n", - " input_meta = self.get_input_meta()\n", - " if self.INPUT_PORT_NAME in input_meta:\n", - " col_from_inport = input_meta[self.INPUT_PORT_NAME]\n", - " enums = [col for col in col_from_inport.keys()]\n", - " json['properties']['column']['enum'] = enums\n", - " ui = {}\n", - " return ConfSchema(json=json, ui=ui)\n", - " else:\n", - " ui = {\n", - " \"column\": {\"ui:widget\": \"text\"}\n", - " }\n", - " return ConfSchema(json=json, ui=ui)\n", - "\n", - " def process(self, inputs):\n", - " \"\"\"\n", - " Compute the maximum value of the key column which is defined in the\n", - " `column` of the node's conf\n", - "\n", - " Arguments\n", - " -------\n", - " inputs: list\n", - " list of input dataframes.\n", - " Returns\n", - " -------\n", - " dataframe\n", - " \"\"\"\n", - " input_df = inputs[self.INPUT_PORT_NAME]\n", - " max_column = self.conf['column']\n", - " volume_df = input_df[[max_column,\n", - " \"asset\"]].groupby([\"asset\"]).max().reset_index()\n", - " volume_df.columns = 
['asset', max_column]\n", - " return {self.OUTPUT_PORT_NAME: volume_df}\n", - "\n", - " def meta_setup(self):\n", - " cols_required = {\"asset\": \"int64\"}\n", - " if 'column' in self.conf:\n", - " retention = {self.conf['column']: \"float64\",\n", - " \"asset\": \"int64\"}\n", - " return _PortTypesMixin.retention_meta_setup(self,\n", - " retention,\n", - " required=cols_required)\n", - " else:\n", - " retention = {\"asset\": \"int64\"}\n", - " return _PortTypesMixin.retention_meta_setup(self,\n", - " retention,\n", - " required=cols_required)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In case there is no direct dataframe method for a particular piece of logic, a Numba GPU kernel can be used to implement it. Some examples of customized GPU kernels in Numba can be found [here](https://github.com/rapidsai/gQuant/blob/master/notebooks/05_customize_nodes.ipynb).\n", - "\n", - "If we use customized GPU kernel functions inside the `process` method to process the dataframe instead of _normal_ dataframe API function calls, we need to add `self.delayed_process = True` in the `meta_setup` method to let gQuant handle the Dask graph integration problem. If we use _normal_ dataframe API functions inside the `process` method, nothing needs to be done as `self.delayed_process = False` by default. gQuant automatically handles the complication of including a customized GPU kernel node into the Dask computation graph.\n", - "\n", - "Note, we set `self.delayed_process = True` for the `SortNode`. So the sort is performed at the Dask data partition level instead of sorting it globally. This has the benefit of guaranteeing that the sorting doesn't pollute the data partition allocation, as sometimes we want to make sure the data partitions remain the same during the distributed computation. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running the TaskGraph programmatically\n", - "\n", - "\n", - "To run the TaskGraph programmatically, we can specify a list of output ports to the TaskGraph `run` method. The `profile` flag can be used to see the computation time spent on each of the nodes:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id:stock_data process time:4.093s\n", - "id:preprocess process time:0.569s\n", - "id:sort_after process time:0.052s\n", - "id:exp_mean_reversion process time:0.049s\n", - "id:backtest process time:0.002s\n", - "id:portfolio_opt process time:0.031s\n", - "id:sharpe_ratio process time:0.002s\n", - "id:cumulative_return process time:0.023s\n" - ] - } - ], - "source": [ - "import warnings; warnings.simplefilter(\"ignore\")\n", - "\n", - "o_gpu = task_graph.run(\n", - " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return','stock_data.cudf_out', 'preprocess.drop_columns@out'], profile=True)\n", - "gpu_strategy_cached = o_gpu['preprocess.drop_columns@out'] \n", - "gpu_input_cached = o_gpu['stock_data.cudf_out'] " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`o_gpu` will contain the outputs of four nodes: `sharpe_ratio`, `cumulative_return`, `stock_data`, and `preprocess`.\n", - "\n", - "In addition, the outputs from the `stock_data` and `preprocess` nodes are stored in the `gpu_input_cached` and `gpu_strategy_cached` variables for later use. 
\n", - "\n", - "We can check how many of the stocks are filtered out by preprocessing steps:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "5052 stocks in original dataset.\n", - "1558 stocks remaining after filtering.\n" - ] - } - ], - "source": [ - "print(\"{} stocks in original dataset.\".format(len(gpu_input_cached['asset'].unique())))\n", - "print(\"{} stocks remaining after filtering.\".format(len(gpu_strategy_cached['asset'].unique())))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result can be shown in IPython Rich display by turning on the `formated` flag:" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "db4dbeca0a2d483ea7429119c0448aae", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Tab(children=(Output(), Output(), Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.run(\n", - " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return','preprocess.drop_columns@out'], formated=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This toy strategy gets a Sharpe ratio of 0.338 without considering the transaction cost. Nice! \n", - "\n", - "The [bqplot](https://github.com/bloomberg/bqplot) library is used to visualize the backtesting results in the JupyterLab notebooks. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "182dd2ea51d2426b853916ccf7e8c899", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale(), side='left'), Axis(l…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# define the function to format the plots\n", - "def plot_figures(outputs):\n", - " # format the figures\n", - " figure_width = '1200px'\n", - " figure_height = '400px'\n", - " sharpe_number = outputs[0]\n", - " cum_return = outputs[1]\n", - " cum_return.layout.height = figure_height\n", - " cum_return.layout.width = figure_width\n", - " cum_return.title = 'P & L %.3f' % (sharpe_number)\n", - " return cum_return\n", - "\n", - "plot_figures(o_gpu)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "gQuant TaskGraph can be evaluated by overwritting any of the Node's parameters. E.g. 
we can change the parameters to filter out the stocks:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "# Define some constants for the data filters.\n", - "# If using a GPU of 32G memory, you can safely \n", - "# set the `min_volume` to 5.0\n", - "min_volume = 10.0\n", - "min_rate = -10.0\n", - "max_rate = 10.0" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id:stock_data process time:4.136s\n", - "id:preprocess process time:0.743s\n", - "id:sort_after process time:0.130s\n", - "id:exp_mean_reversion process time:0.076s\n", - "id:backtest process time:0.003s\n", - "id:portfolio_opt process time:0.072s\n", - "id:sharpe_ratio process time:0.001s\n", - "id:cumulative_return process time:0.023s\n", - "5052 stocks in original dataset.\n", - "4405 stocks remaining after filtering.\n", - "CPU times: user 5.17 s, sys: 743 ms, total: 5.91 s\n", - "Wall time: 5.65 s\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b50e20ee9c1b4501b2eb08632a8a0fa7", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale()), Axis(label='Time', …" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%%time\n", - "o_gpu = task_graph.run(\n", - " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return', 'stock_data.cudf_out', 'preprocess.drop_columns@out'], \n", - " replace={'preprocess': {\"conf\": {\n", - " \"subnodes_conf\": {\n", - " \"value_filter\": {\n", - " \"conf\": [{\"column\": \"average_volume\", \"min\": min_volume},\n", - " {\"column\": \"max_return\", \"max\": max_rate},\n", - " {\"column\": \"min_return\", \"min\": min_rate}]\n", - " }\n", - " },\n", - " \"taskgraph\": \"taskgraphs/preprocess.gq.yaml\",\n", - " \"input\": 
[\"sort_node.in\"],\n", - " \"output\": [\"drop_columns.out\"]\n", - " }}\n", - " }, profile=True)\n", - "\n", - "gpu_input_cached = o_gpu['stock_data.cudf_out'] \n", - "gpu_strategy_cached = o_gpu['preprocess.drop_columns@out'] \n", - "print(\"{} stocks in original dataset.\".format(len(gpu_input_cached['asset'].unique())))\n", - "print(\"{} stocks remaining after filtering.\".format(len(gpu_strategy_cached['asset'].unique())))\n", - "plot_figures(o_gpu)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the example above, the `preprocess` node is a composite node that uses a separate TaskGraph as input and output. Any of the nodes inside the composite node's TaskGraph configuration can be overridden as shown in the example. We change the `value_filter` node configuration inside the composite node to filter out the stocks that are not suitable for backtesting. It will discard stocks according to the values stored in the `min_volume`, `min_rate`, and `max_rate` variables.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we are going to compare the performance difference between CPU and GPU. 
The same computation graph can be used to flow the CPU Pandas dataframe with one change that\n", - "the preprocess node need to get input from the Pandas dataframe:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id:stock_data process time:105.435s\n", - "id:preprocess process time:24.096s\n", - "id:sort_after process time:1.382s\n", - "id:exp_mean_reversion process time:10.445s\n", - "id:backtest process time:0.043s\n", - "id:portfolio_opt process time:0.472s\n", - "id:sharpe_ratio process time:0.001s\n", - "id:cumulative_return process time:0.021s\n", - "CPU times: user 2min 11s, sys: 14.5 s, total: 2min 26s\n", - "Wall time: 2min 22s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "o_cpu = task_graph.run(\n", - " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return'], \n", - " replace={'preprocess': {\"inputs\": {\"sort_node@in\": \"stock_data.pandas_out\"}}}, profile=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "89439c03a6cd4b0cbbd2d4d975234d4d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale()), Axis(label='Time', …" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_figures(o_cpu)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It produces the same result as the single GPU version but a lot slower.\n", - "\n", - "While running this notebook, we have obtained the following results:\n", - "\n", - "- 181.00 seconds to run in CPU (Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz).\n", - "- 9.06 seconds to run in GPU (NVIDIA v100).\n", - "\n", - "We get ~20x speed up by using GPU and GPU dataframes, compared to CPU and CPU 
dataframes.\n", - "\n", - "Note, the input nodes load the dataframes from the cache variables to save the disk IO time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The distributed computation is turned on by changing the preprocess node's input dataframe to dask dataframe: " - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id:stock_data process time:0.118s\n", - "id:preprocess process time:9.833s\n", - "id:backtest process time:0.016s\n", - "id:portfolio_opt process time:0.331s\n", - "id:sharpe_ratio process time:0.371s\n", - "id:cumulative_return process time:0.405s\n", - "CPU times: user 6.01 s, sys: 489 ms, total: 6.5 s\n", - "Wall time: 16.5 s\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bedf166c923640839437c5505f30816e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale()), Axis(label='Time', …" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%%time\n", - "o_dask = task_graph.run(\n", - " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return'], \n", - " replace={'preprocess': {\"inputs\": {\"sort_node@in\": \"stock_data.dask_cudf_out\"}}}, profile=True)\n", - "plot_figures(o_dask)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Again, it produces the same results. However, the performance is not better than in the single GPU scenarios.\n", - "\n", - "Distributed computation only makes sense if we have a very large dataset that cannot be fit into one GPU.\n", - "\n", - "In this example, the dataset is small enough to be loaded into a single GPU. The between-GPU communication overhead dominates in the computation." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Strategy parameter search\n", - "Quantitative analysts often need to explore different parameters for their trading strategy.\n", - "\n", - "gQuant speeds up this iterative exploration process by using cached dataframes and sub-graphs evaluation.\n", - "\n", - "To find the optimal parameters for this toy mean reversion strategy, we only need the dataframe from `sort_2` node, which is cached in the `gpu_strategy_cached` variable.\n", - "\n", - "Because the GPU computation is so fast, we can make the parameter exploration interactive in the JupyterLab notebook:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7a40a498b0f74452b2effa99d0865bd3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(HBox(children=(IntRangeSlider(value=(10, 30), continuous_update=False, description='MA:', max=6…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import ipywidgets as widgets\n", - "\n", - "para_selector = widgets.IntRangeSlider(value=[10, 30],\n", - " min=3,\n", - " max=60,\n", - " step=1,\n", - " description=\"MA:\",\n", - " disabled=False,\n", - " continuous_update=False,\n", - " orientation='horizontal',\n", - " readout=True)\n", - "\n", - "\n", - "def para_selection(*stocks):\n", - " with out:\n", - " para1 = para_selector.value[0]\n", - " para2 = para_selector.value[1]\n", - " o = task_graph.run(\n", - " outputs=['sharpe_ratio.sharpe_out', 'cumulative_return.cum_return'],\n", - " replace={'exp_mean_reversion': {'conf': {'fast': para1,\n", - " 'slow': para2}},\n", - " 'preprocess': {\"load\": {\"drop_columns@out\": gpu_strategy_cached},\n", - " \"conf\": {\n", - " \"subnodes_conf\": {\n", - " \"value_filter\": {\n", - " \"conf\": [{\"column\": \"average_volume\", \"min\": min_volume},\n", - " {\"column\": 
\"max_return\", \"max\": max_rate},\n", - " {\"column\": \"min_return\", \"min\": min_rate}]\n", - " }\n", - " },\n", - " \"taskgraph\": \"taskgraphs/preprocess.gq.yaml\",\n", - " \"input\": [\"sort_node.in\"],\n", - " \"output\": [\"drop_columns.out\"]\n", - " }}})\n", - "\n", - " figure_combo = plot_figures(o)\n", - " w.children = (w.children[0], figure_combo,)\n", - "\n", - "\n", - "out = widgets.Output(layout={'border': '1px solid black'})\n", - "para_selector.observe(para_selection, 'value')\n", - "selectors = widgets.HBox([para_selector])\n", - "w = widgets.VBox([selectors])\n", - "w" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d692885694be475da5cd47e6acf94417", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output(layout=Layout(border='1px solid black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "out" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'ok', 'restart': True}" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import IPython\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/05_customize_nodes_with_ports.ipynb 
b/notebooks/05_customize_nodes_with_ports.ipynb deleted file mode 100644 index 10ecfeab..00000000 --- a/notebooks/05_customize_nodes_with_ports.ipynb +++ /dev/null @@ -1,1977 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Customize your own GPU Kernels in gQuant\n", - "\n", - "gQuant is designed to accelerate quantitative finance workflows on the GPU. The acceleration on GPU is facilitated by using cuDF dataframes organized into a computation graph. The cuDF project is a continuously evolving library that provides a pandas-like API. Sometimes data scientists face a few challenges that cannot be easily solved:\n", - "\n", - " 1. The quantitative work needs customized logic to manipulate the data, and there are no direct methods within cuDF to support this logic.\n", - " 2. Each cuDF dataframe method call launches the GPU kernel once. For performance-critical tasks, it is sometimes required to wrap lots of computation steps together in a single GPU kernel to reduce the kernel launch overheads.\n", - "\n", - "The solution is to build customized GPU kernels to implement them. The code and examples below illustrate a variety of approaches to implement customized GPU kernels in Python." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys; sys.path.insert(0, '..')\n", - "# Load necessary Python modules\n", - "import sys\n", - "from gquant.dataframe_flow import TaskSpecSchema, TaskGraph, MetaData\n", - "from gquant.dataframe_flow import Node, NodePorts, PortsSpecSchema\n", - "from gquant.dataframe_flow import ConfSchema\n", - "import cudf\n", - "import numpy as np\n", - "from numba import cuda\n", - "import cupy\n", - "import math\n", - "import dask\n", - "import dask_cudf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define a utility function to verify the results:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def verify(ground_truth, computed):\n", - " max_difference = (ground_truth - computed).abs().max()\n", - " # print('Max Difference: {}'.format(max_difference))\n", - " assert(max_difference < 1e-8)\n", - " return max_difference" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "task_graph = TaskGraph()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Example Problem: Calculating the distance of points to the origin\n", - "\n", - "The sample problem is to take a list of points in 2-D space and compute their distance to the origin.\n", - "We start by creating a source `Node` in the graph that generates a cuDF dataframe containing some configurable number of random points. A custom node is defined by inheriting from the `Node` class and overriding methods `init`, `meta_setup`, `ports_setup`, `conf_schema`, `process`.\n", - "\n", - "The `ports_setup` must return an instance of `NodePorts` which encapsulates the ports specs. 
Ports specs are dictionaries with port attributes/options per `PortsSpecSchema`.\n", - "\n", - "In the case of the `PointNode` below the input port is an empty dictionary, since no inputs are required, and the output has two ports `points_df_out` and `points_ddf_out`. It can output two types of dataframes, depending on which output port is connected.\n", - "\n", - "The `process` method receives an input dictionary where keys are input ports and values are input data. It returns a dictionary where the keys correspond to the output ports. \n", - "\n", - "The `meta_setup` is used to compute the output meta information. It returns a `MetaData` object whose `outports` dictionary is keyed by the output ports.\n", - "\n", - "The `conf_schema` is used to define the Node configuration [JSON schema](https://json-schema.org/). The gQuantlab UI uses the [RJSF](https://github.com/rjsf-team/react-jsonschema-form) project to generate HTML form elements based on the JSON schema. The [RJSF playground](https://rjsf-team.github.io/react-jsonschema-form/) is a good place to learn how to write JSON schema and visualize it. `conf_schema` returns a `ConfSchema`, which encapsulates the JSON schema and UI schema together.\n", - "\n", - "The `column` and `port_types` information is sometimes determined dynamically. gQuant provides a few utility functions to help get dynamic graph information. `self.get_connected_inports()` will return a dictionary where keys are connected inport names and values are inport types. \n", - "`self.get_input_meta()` will return a dictionary where keys are connected inport names and values are column name/type pairs from the parent node. The `self.outport_connected(port_name)` method returns a boolean indicating whether the output port `port_name` is connected. The `PointNode` uses it to determine what kind of computation it needs to do depending on the connection." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "class PointNode(Node):\n", - "\n", - " def ports_setup(self):\n", - " input_ports = {}\n", - " output_ports = {\n", - " 'points_df_out': {\n", - " PortsSpecSchema.port_type: cudf.DataFrame\n", - " },\n", - " 'points_ddf_out': {\n", - " PortsSpecSchema.port_type: dask_cudf.DataFrame\n", - " },\n", - " }\n", - " return NodePorts(inports=input_ports, outports=output_ports)\n", - "\n", - " def conf_schema(self):\n", - " json = {\n", - " \"title\": \"PointNode configure\",\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"npts\": {\n", - " \"type\": \"number\",\n", - " \"description\": \"number of data points\",\n", - " \"minimum\": 10\n", - " },\n", - " \"npartitions\": {\n", - " \"type\": \"number\",\n", - " \"description\": \"num of partitions in the Dask dataframe\",\n", - " \"minimum\": 1\n", - " }\n", - "\n", - " },\n", - " \"required\": [\"npts\", \"npartitions\"],\n", - " }\n", - "\n", - " ui = {\n", - " \"npts\": {\"ui:widget\": \"updown\"},\n", - " \"npartitions\": {\"ui:widget\": \"updown\"}\n", - " }\n", - " return ConfSchema(json=json, ui=ui)\n", - "\n", - " def init(self):\n", - " pass\n", - " \n", - " def meta_setup(self):\n", - " columns_out = {\n", - " 'points_df_out': {\n", - " 'x': 'float64',\n", - " 'y': 'float64'\n", - " },\n", - " 'points_ddf_out': {\n", - " 'x': 'float64',\n", - " 'y': 'float64'\n", - " }\n", - " }\n", - " return MetaData(inports={}, outports=columns_out)\n", - "\n", - " def process(self, inputs):\n", - " npts = self.conf['npts']\n", - " df = cudf.DataFrame()\n", - " df['x'] = np.random.rand(npts)\n", - " df['y'] = np.random.rand(npts)\n", - " output = {}\n", - " if self.outport_connected('points_df_out'):\n", - " output.update({'points_df_out': df})\n", - " if self.outport_connected('points_ddf_out'):\n", - " npartitions = self.conf['npartitions']\n", - " ddf = dask_cudf.from_cudf(df, 
npartitions=npartitions)\n", - " output.update({'points_ddf_out': ddf})\n", - " return output" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The distance can be computed via cuDF methods. We define the `DistanceNode` to calculate the euclidean distance and add a `distance_cudf` column to the output dataframe. We will use that as the ground truth to compare and verify results later. Additionally, the distance node calculates absolute distance (Manhattan distance) in another output port. The compuation is done depending which output is connected.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "class DistanceNode(Node):\n", - "\n", - " def ports_setup(self):\n", - " port_type = PortsSpecSchema.port_type\n", - " input_ports = {\n", - " 'points_df_in': {\n", - " port_type: [cudf.DataFrame, dask_cudf.DataFrame]\n", - " }\n", - " }\n", - "\n", - " output_ports = {\n", - " 'distance_df': {\n", - " port_type: [cudf.DataFrame, dask_cudf.DataFrame]\n", - " },\n", - " 'distance_abs_df': {\n", - " PortsSpecSchema.port_type: [cudf.DataFrame, dask_cudf.DataFrame]\n", - " }\n", - " }\n", - " input_connections = self.get_connected_inports()\n", - " if 'points_df_in' in input_connections:\n", - " types = input_connections['points_df_in']\n", - " # connected, use the types passed in from parent\n", - " return NodePorts(inports={'points_df_in': {port_type: types}},\n", - " outports={'distance_df': {port_type: types},\n", - " 'distance_abs_df': {port_type: types},\n", - " })\n", - " else:\n", - " return NodePorts(inports=input_ports, outports=output_ports)\n", - "\n", - " def conf_schema(self):\n", - " return ConfSchema()\n", - "\n", - " def init(self):\n", - " self.delayed_process = True\n", - "\n", - "\n", - " def meta_setup(self):\n", - " req_cols = {\n", - " 'x': 'float64',\n", - " 'y': 'float64'\n", - " }\n", - " required = {\n", - " 'points_df_in': req_cols,\n", - " }\n", - " 
input_meta = self.get_input_meta()\n", - " output_cols = ({\n", - " 'distance_df': {\n", - " 'distance_cudf': 'float64',\n", - " 'x': 'float64',\n", - " 'y': 'float64'\n", - " },\n", - " 'distance_abs_df': {\n", - " 'distance_abs_cudf': 'float64',\n", - " 'x': 'float64',\n", - " 'y': 'float64'\n", - " }\n", - " })\n", - " if 'points_df_in' in input_meta:\n", - " col_from_inport = input_meta['points_df_in']\n", - " # additional ports\n", - " output_cols['distance_df'].update(col_from_inport)\n", - " output_cols['distance_abs_df'].update(col_from_inport)\n", - " return MetaData(inports=required, outports=output_cols)\n", - "\n", - " def process(self, inputs):\n", - " df = inputs['points_df_in']\n", - " output = {}\n", - " if self.outport_connected('distance_df'):\n", - " copy_df = df.copy()\n", - " copy_df['distance_cudf'] = (df['x'] ** 2 + df['y'] ** 2).sqrt()\n", - " output.update({'distance_df': copy_df})\n", - " if self.outport_connected('distance_abs_df'):\n", - " copy_df = df.copy()\n", - " copy_df['distance_abs_cudf'] = df['x'].abs() + df['y'].abs()\n", - " output.update({'distance_abs_df': copy_df})\n", - " return output" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Having these two nodes, we can construct a simple task graph to compute the distance." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Task specifications.\n", - "module_name = 'custom_nodes'\n", - "\n", - "points_tspec = {\n", - " TaskSpecSchema.task_id: 'points_task',\n", - " TaskSpecSchema.node_type: PointNode,\n", - " TaskSpecSchema.conf: {'npts': 1000},\n", - " TaskSpecSchema.module: module_name,\n", - " TaskSpecSchema.inputs: {},\n", - "}\n", - "\n", - "cudf_distance_tspec = {\n", - " TaskSpecSchema.task_id: 'distance_by_cudf',\n", - " TaskSpecSchema.node_type: DistanceNode,\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.module: module_name,\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'points_task.points_df_out'\n", - " }\n", - "}\n", - "\n", - "out_spec = {\n", - " TaskSpecSchema.task_id: '',\n", - " TaskSpecSchema.node_type: \"Output_Collector\",\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.inputs: {\n", - " 'in0': 'points_task.points_df_out',\n", - " 'in1': 'distance_by_cudf.distance_df',\n", - " 'in2': 'distance_by_cudf.distance_abs_df'\n", - " }\n", - "}\n", - "\n", - "task_list = [points_tspec, cudf_distance_tspec, out_spec]\n", - "task_graph = TaskGraph(task_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can visualize the task graph with and without ports." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WITHOUT PORTS\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print('WITHOUT PORTS')\n", - "task_graph.draw(show='ipynb')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WITH PORTS\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print('WITH PORTS')\n", - "task_graph.draw(show='ipynb', show_ports=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can draw the graph in an interactive widget. First, let's register the dynamically defined gQuant nodes so the client knows about them. Note, this step is only needed if we would like to interact with gQuant by Jupyterlab UI. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "87cab73c7fff4b89bec965231b9f291d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox())" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "TaskGraph.register_lab_node(module_name, PointNode)\n", - "TaskGraph.register_lab_node(module_name, DistanceNode)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Draw the widget:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8ebd29d5191d40d4b21d90cda855c411", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next step is to run the task graph to obtain the distances. 
The output is identified by the `id` of the distance node:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a4684abb3a8d412bac5203f57d966cf0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Tab(children=(Output(), Output(), Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "r = task_graph.run()\n", - "points_df = r['points_task.points_df_out']\n", - "task_graph.run(formated=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Customized Kernel with Numba library\n", - "\n", - "Numba is an excellent Python library used for accelerating numerical computations. Numba supports CUDA GPU programming by directly compiling a restricted subset of Python code into CUDA kernels and device functions. The Numba GPU kernel is written in Python and translated (just-in-time, or JIT, compiled) into GPU code at runtime. This is achieved by decorating a Python function with `@cuda.jit`. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Just like a C/C++ CUDA GPU kernel, the `distance_kernel` function is called by thousands of threads in the GPU. The thread id is computed from the `threadIdx.x`, `blockIdx.x` and `blockDim.x` built-in variables. Please check the [CUDA programming guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-hierarchy) for details." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A cuDF series can be converted to GPU arrays compatible with the Numba library via the `.to_gpu_array` API. 
The next step is to define a Node that calls this Numba kernel to compute the distance and save the result into `distance_numba` column in the output dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "import rmm\n", - "@cuda.jit\n", - "def distance_kernel(x, y, distance, array_len):\n", - " # ii - overall thread index\n", - " ii = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n", - " if ii < array_len:\n", - " distance[ii] = math.sqrt(x[ii] ** 2 + y[ii] ** 2)\n", - "\n", - "\n", - "class NumbaDistanceNode(Node):\n", - "\n", - " def ports_setup(self):\n", - " port_type = PortsSpecSchema.port_type\n", - " input_ports = {\n", - " 'points_df_in': {\n", - " port_type: [cudf.DataFrame,\n", - " dask_cudf.DataFrame]\n", - " }\n", - " }\n", - "\n", - " output_ports = {\n", - " 'distance_df': {\n", - " port_type: [cudf.DataFrame,\n", - " dask_cudf.DataFrame]\n", - " }\n", - " }\n", - "\n", - " input_connections = self.get_connected_inports()\n", - " if 'points_df_in' in input_connections:\n", - " types = input_connections['points_df_in']\n", - " # connected\n", - " return NodePorts(inports={'points_df_in': {port_type: types}},\n", - " outports={'distance_df': {port_type: types}})\n", - " else:\n", - " return NodePorts(inports=input_ports, outports=output_ports)\n", - " \n", - " def init(self):\n", - " self.delayed_process = True\n", - "\n", - "\n", - " def meta_setup(self,):\n", - " required_cols = {'x': 'float64',\n", - " 'y': 'float64'}\n", - " required = {\n", - " 'points_df_in': required_cols,\n", - " 'distance_df': required_cols\n", - " }\n", - " input_meta = self.get_input_meta()\n", - " output_cols = ({\n", - " 'distance_df': {\n", - " 'distance_numba': 'float64',\n", - " 'x': 'float64',\n", - " 'y': 'float64'\n", - " }\n", - " })\n", - " if 'points_df_in' in input_meta:\n", - " col_from_inport = input_meta['points_df_in']\n", - " # additional ports\n", - " 
output_cols['distance_df'].update(col_from_inport)\n", - " return MetaData(inports=required, outports=output_cols)\n", - "\n", - " def conf_schema(self):\n", - " return ConfSchema()\n", - "\n", - " def process(self, inputs):\n", - " df = inputs['points_df_in']\n", - " number_of_threads = 16\n", - " number_of_blocks = ((len(df) - 1) // number_of_threads) + 1\n", - " # Inits device array by setting 0 for each index.\n", - " # df['distance_numba'] = 0.0\n", - " darr = rmm.device_array(len(df))\n", - " distance_kernel[(number_of_blocks,), (number_of_threads,)](\n", - " df['x'],\n", - " df['y'],\n", - " darr,\n", - " len(df))\n", - " df['distance_numba'] = darr\n", - " return {'distance_df': df}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `self.delayed_process = True` flag in the `meta_setup` is necesary to enable the logic in the `Node` class for handling `dask_cudf` dataframes in order to use Dask (for distributed computation i.e. multi-gpu in examples later on). The `dask_cudf` dataframe does not support GPU customized kernels directly. The `to_delayed` and `from_delayed` low level interfaces of `dask_cudf` enable this support. The gQuant framework handles `dask_cudf` dataframes automatically under the hood when we set this flag." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Customized Kernel by CuPy library\n", - "\n", - "CuPy is an alternative to Numba. Numba JIT compiles Python code into GPU device code at runtime. There are some limitations in how Numba can be used as well as JIT compilation latency overhead. When a Python process calls a Numba GPU kernel for the first time Numba has to compile the Python code, and each time a new Python process is started the GPU kernel has to be recompiled. If advanced features of CUDA are needed and latency is important, CuPy is an alternative library that can be used to compile C/C++ CUDA code. 
CuPy caches the GPU device code on disk (default location `$(HOME)/.cupy/kernel_cache` which can be changed via `CUPY_CACHE_DIR` environment variable) thus eliminating compilation latency for subsequent Python processes.\n", - "\n", - "`CuPy` GPU kernel is esentially a C/C++ GPU kernel. Below we define the `compute_distance` kernel using `CuPy`:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using gQuant we can now define a Node that calls this CuPy kernel to compute the distance and save the results into `distance_cupy` column of a `cudf` dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "kernel_string = r'''\n", - " extern \"C\" __global__\n", - " void compute_distance(const double* x, const double* y,\n", - " double* distance, int arr_len) {\n", - " int tid = blockDim.x * blockIdx.x + threadIdx.x;\n", - " if (tid < arr_len){\n", - " distance[tid] = sqrt(x[tid]*x[tid] + y[tid]*y[tid]);\n", - " }\n", - " }\n", - "'''\n", - "\n", - "\n", - "class CupyDistanceNode(Node):\n", - "\n", - " def ports_setup(self):\n", - " port_type = PortsSpecSchema.port_type\n", - " input_ports = {\n", - " 'points_df_in': {\n", - " port_type: [cudf.DataFrame,\n", - " dask_cudf.DataFrame]\n", - " }\n", - " }\n", - "\n", - " output_ports = {\n", - " 'distance_df': {\n", - " port_type: [cudf.DataFrame,\n", - " dask_cudf.DataFrame]\n", - " }\n", - " }\n", - "\n", - " input_connections = self.get_connected_inports()\n", - " if 'points_df_in' in input_connections:\n", - " types = input_connections['points_df_in']\n", - " # connected\n", - " return NodePorts(inports={'points_df_in': {port_type: types}},\n", - " outports={'distance_df': {port_type: types}})\n", - " else:\n", - " return NodePorts(inports=input_ports, outports=output_ports)\n", - "\n", - " def init(self):\n", - " self.delayed_process = True\n", - "\n", - "\n", - " def meta_setup(self,):\n", - " cols_required = {'x': 
'float64',\n", - " 'y': 'float64'}\n", - " required = {\n", - " 'points_df_in': cols_required,\n", - " 'distance_df': cols_required\n", - " }\n", - " input_meta = self.get_input_meta()\n", - " output_cols = ({\n", - " 'distance_df': {\n", - " 'distance_cupy': 'float64',\n", - " 'x': 'float64',\n", - " 'y': 'float64'\n", - " }\n", - " })\n", - " if 'points_df_in' in input_meta:\n", - " col_from_inport = input_meta['points_df_in']\n", - " # additional ports\n", - " output_cols['distance_df'].update(col_from_inport)\n", - " return MetaData(inports=required, outports=output_cols)\n", - "\n", - " def conf_schema(self):\n", - " return ConfSchema()\n", - "\n", - " def get_kernel(self):\n", - " raw_kernel = cupy.RawKernel(kernel_string, 'compute_distance')\n", - " return raw_kernel\n", - "\n", - " def process(self, inputs):\n", - " df = inputs['points_df_in']\n", - " cupy_x = cupy.asarray(df['x'])\n", - " cupy_y = cupy.asarray(df['y'])\n", - " number_of_threads = 16\n", - " number_of_blocks = (len(df) - 1) // number_of_threads + 1\n", - " dis = cupy.ndarray(len(df), dtype=cupy.float64)\n", - " self.get_kernel()((number_of_blocks,), (number_of_threads,),\n", - " (cupy_x, cupy_y, dis, len(df)))\n", - " df['distance_cupy'] = dis\n", - "\n", - " return {'distance_df': df}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `self.delayed_process = True` flag is added for the same reason as with `DistanceNumbaNode` i.e. to support `dask_cudf` data frames.\n", - "\n", - "Let's register these two added new nodes" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "TaskGraph.register_lab_node(module_name, NumbaDistanceNode)\n", - "TaskGraph.register_lab_node(module_name, CupyDistanceNode)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Computing using the Nodes with customized GPU kernels\n", - "\n", - "First we construct the computation graph for gQuant." 
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# For comparison to above re-use points dataframe instead\n", - "# of rand generating each time when running the task-graph.\n", - "\n", - "numba_distance_tspec = {\n", - " TaskSpecSchema.task_id: 'distance_by_numba',\n", - " TaskSpecSchema.node_type: NumbaDistanceNode,\n", - " TaskSpecSchema.conf: {}, \n", - " TaskSpecSchema.module: module_name,\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'points_task.points_df_out'\n", - " },\n", - "}\n", - "\n", - "cupy_distance_tspec = {\n", - " TaskSpecSchema.task_id: 'distance_by_cupy',\n", - " TaskSpecSchema.node_type: CupyDistanceNode,\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.module: module_name,\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'points_task.points_df_out'\n", - " },\n", - "}\n", - "\n", - "out_spec = {\n", - " TaskSpecSchema.task_id: '',\n", - " TaskSpecSchema.node_type: \"Output_Collector\",\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.inputs: {\n", - " 'in0': 'distance_by_cudf.distance_df',\n", - " 'in1': 'distance_by_numba.distance_df',\n", - " 'in2': 'distance_by_cupy.distance_df'\n", - " }\n", - "}\n", - "\n", - "task_list = [\n", - " points_tspec,\n", - " cudf_distance_tspec,\n", - " numba_distance_tspec,\n", - " cupy_distance_tspec,\n", - " out_spec\n", - "]\n", - "task_graph = TaskGraph(task_list)\n", - "\n", - "task_graph.draw(show='ipynb', show_ports=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we run the task graph interatively by using the gQuantlab widget" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": 
"d78b9a4495bb4e36b8131724eaefa0cc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Or, run it programmatically" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "469f3ad4c6444e95a70106eb1fc5861f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Tab(children=(Output(), Output(), Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "out_list = [\n", - " 'distance_by_cudf.distance_df',\n", - " 'distance_by_numba.distance_df',\n", - " 'distance_by_cupy.distance_df'\n", - "]\n", - "cache_load = {\"points_task\": {\"load\": {'points_df_out': points_df}}}\n", - "(df_w_cudf, df_w_numba, df_w_cupy) = task_graph.run(out_list, replace=cache_load)\n", - "task_graph.run(out_list, replace=cache_load, formated=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use `verify` function defined above to verify the results:" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Max Difference cudf to numba: 2.220446049250313e-16\n", - "Max Difference cudf to cupy: 2.220446049250313e-16\n" - ] - } - ], - "source": [ - "mdiff = verify(df_w_cudf['distance_cudf'], df_w_numba['distance_numba'])\n", - "print('Max Difference cudf to numba: {}'.format(mdiff))\n", - "mdiff = verify(df_w_cudf['distance_cudf'], 
df_w_cupy['distance_cupy'])\n", - "print('Max Difference cudf to cupy: {}'.format(mdiff))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To illustrate multi-input nodes let's create a verify node." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "class VerifyNode(Node):\n", - "\n", - " def ports_setup(self):\n", - " input_ports = {\n", - " 'df1': {\n", - " PortsSpecSchema.port_type: [cudf.DataFrame,\n", - " dask_cudf.DataFrame]\n", - " },\n", - " 'df2': {\n", - " PortsSpecSchema.port_type: [cudf.DataFrame,\n", - " dask_cudf.DataFrame]\n", - " }\n", - " }\n", - " output_ports = {\n", - " 'max_diff': {\n", - " PortsSpecSchema.port_type: float\n", - " }\n", - " }\n", - "\n", - " connections = self.get_connected_inports() \n", - " for key in input_ports:\n", - " if key in connections:\n", - " # connected\n", - " types = connections[key]\n", - " input_ports[key].update({PortsSpecSchema.port_type: types})\n", - " return NodePorts(inports=input_ports, outports=output_ports)\n", - "\n", - " def meta_setup(self):\n", - " required ={\n", - " \"df1\": {},\n", - " \"df2\": {}\n", - " }\n", - " return MetaData(inports=required, outports={'max_diff': {}})\n", - "\n", - " def conf_schema(self):\n", - " json = {\n", - " \"title\": \"VerifyNode configure\",\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"df1_col\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"dataframe1 column name\"\n", - " },\n", - " \"df2_col\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"dataframe2 column name\"\n", - " }\n", - " },\n", - " \"required\": [\"df1_col\", \"df2_col\"],\n", - " }\n", - "\n", - " ui = {\n", - " \"df1_col\": {\"ui:widget\": \"text\"},\n", - " \"df2_col\": {\"ui:widget\": \"text\"}\n", - " }\n", - " return ConfSchema(json=json, ui=ui)\n", - "\n", - " def process(self, inputs):\n", - " df1 = inputs['df1']\n", - " df2 = inputs['df2']\n", - " col_df1 
= self.conf['df1_col']\n", - " col_df2 = self.conf['df2_col']\n", - "\n", - " df1_col = df1[col_df1]\n", - " if isinstance(df1, dask_cudf.DataFrame):\n", - " # df1_col = df1_col.compute()\n", - " pass\n", - "\n", - " df2_col = df2[col_df2]\n", - " if isinstance(df2, dask_cudf.DataFrame):\n", - " # df2_col = df2_col.compute()\n", - " pass\n", - "\n", - " max_difference = (df1_col - df2_col).abs().max()\n", - "\n", - " if isinstance(max_difference, dask.dataframe.core.Scalar):\n", - " max_difference = float(max_difference.compute())\n", - " max_difference = float(max_difference)\n", - " return {'max_diff': max_difference}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Register the `VerifyNode`:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "TaskGraph.register_lab_node(module_name, VerifyNode)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define the full Taskgraph:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "verify_tspec = {\n", - " TaskSpecSchema.task_id: 'verify_cudf_to_numba',\n", - " TaskSpecSchema.node_type: VerifyNode,\n", - " TaskSpecSchema.conf: {\n", - " 'df1_col': 'distance_cudf',\n", - " 'df2_col': 'distance_numba'\n", - " }, \n", - " TaskSpecSchema.module: module_name,\n", - " TaskSpecSchema.inputs: {\n", - " 'df1': 'distance_by_cudf.distance_df',\n", - " 'df2': 'distance_by_numba.distance_df'\n", - " }\n", - "}\n", - "\n", - "verify_tspec2 = {\n", - " TaskSpecSchema.task_id: 'verify_cudf_to_cupy',\n", - " TaskSpecSchema.node_type: VerifyNode,\n", - " TaskSpecSchema.conf: {\n", - " 'df1_col': 'distance_cudf',\n", - " 'df2_col': 'distance_cupy'\n", - " },\n", - " TaskSpecSchema.module: module_name,\n", - " 
TaskSpecSchema.inputs: {\n", - " 'df1': 'distance_by_cudf.distance_df',\n", - " 'df2': 'distance_by_cupy.distance_df'\n", - " }\n", - "}\n", - "out_spec = {\n", - " TaskSpecSchema.task_id: '',\n", - " TaskSpecSchema.node_type: \"Output_Collector\",\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.inputs: {\n", - " 'in0': 'verify_cudf_to_numba.max_diff',\n", - " 'in1': 'verify_cudf_to_cupy.max_diff'\n", - " }\n", - "}\n", - "\n", - "task_list = [\n", - " points_tspec,\n", - " cudf_distance_tspec,\n", - " numba_distance_tspec,\n", - " cupy_distance_tspec,\n", - " out_spec,\n", - " verify_tspec, \n", - " verify_tspec2\n", - "]\n", - "task_graph = TaskGraph(task_list)\n", - "task_graph.draw(show='ipynb', show_ports=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2662f2c4647b4c029eb99776b56cc290", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Max Difference cudf to numba: 2.220446049250313e-16\n", - "Max Difference cudf to cupy: 2.220446049250313e-16\n" - ] - } - ], - "source": [ - "(max_cudf_to_numba_diff, max_cudf_to_cupy_diff) = task_graph.run([\n", - " 'verify_cudf_to_numba.max_diff',\n", - " 'verify_cudf_to_cupy.max_diff'\n", - "])\n", - "print('Max Difference cudf to numba: {}'.format(max_cudf_to_numba_diff))\n", - "print('Max Difference cudf to cupy: {}'.format(max_cudf_to_cupy_diff))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dask distributed computation\n", - "\n", - "Using Dask and `dask-cudf` 
we can run the Nodes with customized GPU kernels on distributed dataframes. Under the hood of the `Node` class the Dask delayed processing API is handled for cudf dataframes when the `self.delayed_process = True` flag is set.\n", - "\n", - "We first start a distributed Dask environment. When a dask client is instantiated it registers itself as the default Dask scheduler (). Therefore all subsequent Dask distibuted dataframe operations will run in distributed fashion." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

Client

\n", - "\n", - "
\n", - "

Cluster

\n", - "
    \n", - "
  • Workers: 2
  • \n", - "
  • Cores: 2
  • \n", - "
  • Memory: 100.00 GB
  • \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from dask_cuda import LocalCUDACluster\n", - "from dask.distributed import Client\n", - "\n", - "cluster = LocalCUDACluster()\n", - "client = Client(cluster)\n", - "client" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Dask status page can be displayed in a web browser at `:8787`. The ip-address corresponds to the machine where the dask cluster (scheduler) was launched. Most likely same ip-address as where this jupyter notebook is running. Using the Dask status page is convenient for monitoring dask distributed processing. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next step is to partition the `cudf` dataframe into a `dask_cudf` dataframe. Here we make the number of partitions corresponding to the number of workers:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "class DistributedNode(Node):\n", - "\n", - " def ports_setup(self):\n", - " input_ports = {\n", - " 'points_df_in': {\n", - " PortsSpecSchema.port_type: cudf.DataFrame\n", - " }\n", - " }\n", - "\n", - " output_ports = {\n", - " 'points_ddf_out': {\n", - " PortsSpecSchema.port_type: dask_cudf.DataFrame\n", - " }\n", - " }\n", - "\n", - " return NodePorts(inports=input_ports, outports=output_ports)\n", - "\n", - " def init(self):\n", - " pass\n", - "\n", - " def meta_setup(self,):\n", - " cols_required = {\n", - " 'x': 'float64',\n", - " 'y': 'float64'\n", - " }\n", - " required = {\n", - " 'points_df_in': cols_required,\n", - " 'points_ddf_out': cols_required\n", - " }\n", - " input_meta = self.get_input_meta()\n", - " output_cols = ({\n", - " 'points_ddf_out': {\n", - " 'x': 'float64',\n", - " 'y': 'float64'\n", - " }\n", - " })\n", - " if 'points_df_in' in input_meta:\n", - " col_from_inport = input_meta['points_df_in']\n", - " # 
additional ports\n", - " output_cols['points_ddf_out'].update(col_from_inport)\n", - " return MetaData(inports=required, outports=output_cols)\n", - "\n", - " def conf_schema(self):\n", - " json = {\n", - " \"title\": \"DistributedNode configure\",\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"npartitions\": {\n", - " \"type\": \"number\",\n", - " \"description\": \"num of partitions in the Dask dataframe\",\n", - " \"minimum\": 1\n", - " }\n", - " },\n", - " \"required\": [\"npartitions\"],\n", - " }\n", - "\n", - " ui = {\n", - " \"npartitions\": {\"ui:widget\": \"updown\"}\n", - " }\n", - " return ConfSchema(json=json, ui=ui)\n", - "\n", - " def process(self, inputs):\n", - " npartitions = self.conf['npartitions']\n", - " df = inputs['points_df_in']\n", - " ddf = dask_cudf.from_cudf(df, npartitions=npartitions)\n", - " return {'points_ddf_out': ddf}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Register it:" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "TaskGraph.register_lab_node(module_name, DistributedNode)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We add this distribution node to the computation graph to convert `cudf` dataframes into `dask-cudf` dataframes. The `dask-cudf` dataframes are handled automatically in gQuant when `self.delayed_process=True` within a `Node` implementation (setup in `meta_setup`). When using nodes with ports with `self.delayed_process=True` setting, it is required that all input and output ports be of type `cudf.DataFrame`. Otherwise don't set `self.delayed_process` and one can write custom logic to handle distributed dataframes (refer to `VerifyNode` abover for an example where `dask_cudf` dataframes are handled directly within the process method)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "npartitions = len(client.scheduler_info()['workers'])\n", - "\n", - "\n", - "distribute_tspec = {\n", - " TaskSpecSchema.task_id: 'distributed_points',\n", - " TaskSpecSchema.node_type: DistributedNode,\n", - " TaskSpecSchema.conf: {'npartitions': npartitions},\n", - " TaskSpecSchema.module: module_name,\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'points_task.points_df_out'\n", - " }\n", - "}\n", - "\n", - "dask_cudf_distance_tspec = {\n", - " TaskSpecSchema.task_id: 'distance_by_cudf',\n", - " TaskSpecSchema.node_type: DistanceNode,\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.module: module_name,\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'distributed_points.points_ddf_out'\n", - " }\n", - "}\n", - "\n", - "dask_numba_distance_tspec = {\n", - " TaskSpecSchema.task_id: 'distance_by_numba',\n", - " TaskSpecSchema.node_type: NumbaDistanceNode,\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.module: module_name,\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'distributed_points.points_ddf_out'\n", - " }\n", - "}\n", - "\n", - "dask_cupy_distance_tspec = {\n", - " TaskSpecSchema.task_id: 'distance_by_cupy',\n", - " TaskSpecSchema.node_type: CupyDistanceNode,\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.module: module_name,\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'distributed_points.points_ddf_out'\n", - " }\n", - "}\n", - "\n", - "out_spec = {\n", - " TaskSpecSchema.task_id: '',\n", - " TaskSpecSchema.node_type: \"Output_Collector\",\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.inputs: {\n", - " 'in0': 'distributed_points.points_ddf_out',\n", - " 'in1': 'distance_by_cudf.distance_df',\n", - " 'in2': 
'distance_by_numba.distance_df',\n", - " 'in3': 'distance_by_cupy.distance_df'\n", - " }\n", - "}\n", - "\n", - "\n", - "task_list = [\n", - " points_tspec,\n", - " distribute_tspec,\n", - " dask_cudf_distance_tspec,\n", - " dask_numba_distance_tspec,\n", - " dask_cupy_distance_tspec,\n", - " out_spec\n", - "]\n", - "\n", - "task_graph = TaskGraph(task_list)\n", - "task_graph.draw(show='ipynb', show_ports=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c4e4038195864d72803fac1d3971937b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the distributed computation programmatically:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "out_list = [\n", - " 'distributed_points.points_ddf_out',\n", - " 'distance_by_cudf.distance_df',\n", - " 'distance_by_numba.distance_df',\n", - " 'distance_by_cupy.distance_df'\n", - "]\n", - "\n", - "(points_ddf, ddf_w_cudf, ddf_w_numba, ddf_w_cupy) = task_graph.run(out_list)\n", - "df_w_cudf = ddf_w_cudf.compute()\n", - "df_w_numba = ddf_w_numba.compute()\n", - "df_w_cupy = ddf_w_cupy.compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Verify the results:" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "HEAD points_ddf:\n", - " x y\n", - "0 0.017881 0.460670\n", - "1 0.184132 0.864031\n", - "2 0.241513 0.956813\n", - "3 0.336786 0.975579\n", - "4 0.767782 0.330257\n", - "\n", - "HEAD 
df_w_cudf:\n", - " x y distance_cudf\n", - "0 0.017881 0.460670 0.461017\n", - "1 0.184132 0.864031 0.883433\n", - "2 0.241513 0.956813 0.986823\n", - "3 0.336786 0.975579 1.032075\n", - "4 0.767782 0.330257 0.835799\n", - "\n", - "HEAD df_w_numba:\n", - " x y distance_numba\n", - "0 0.017881 0.460670 0.461017\n", - "1 0.184132 0.864031 0.883433\n", - "2 0.241513 0.956813 0.986823\n", - "3 0.336786 0.975579 1.032075\n", - "4 0.767782 0.330257 0.835799\n", - "\n", - "HEAD df_w_cupy:\n", - " x y distance_cupy\n", - "0 0.017881 0.460670 0.461017\n", - "1 0.184132 0.864031 0.883433\n", - "2 0.241513 0.956813 0.986823\n", - "3 0.336786 0.975579 1.032075\n", - "4 0.767782 0.330257 0.835799\n", - "\n", - "Max Difference cudf to numba: 2.220446049250313e-16\n", - "Max Difference cudf to cupy: 2.220446049250313e-16\n" - ] - } - ], - "source": [ - "verify_cudf_numba_tspec = verify_tspec.copy()\n", - "verify_cudf_cupy_tspec = verify_tspec2.copy()\n", - "\n", - "task_graph.extend(\n", - " [verify_cudf_numba_tspec,\n", - " verify_cudf_cupy_tspec],\n", - " replace=True)\n", - "task_graph.draw(show='ipynb', show_ports=True)\n", - "\n", - "# Use results above and avoid re-running dask\n", - "replace_spec = {\n", - " 'distance_by_cudf': {\n", - " TaskSpecSchema.load: {\n", - " 'distance_df': ddf_w_cudf\n", - " }\n", - " },\n", - " 'distance_by_numba': {\n", - " TaskSpecSchema.load: {\n", - " 'distance_df': ddf_w_numba\n", - " }\n", - " },\n", - " 'distance_by_cupy': {\n", - " TaskSpecSchema.load: {\n", - " 'distance_df': ddf_w_cupy\n", - " }\n", - " }\n", - "}\n", - "\n", - "(max_cudf_to_numba_diff, max_cudf_to_cupy_diff) = task_graph.run(\n", - " ['verify_cudf_to_numba.max_diff',\n", - " 'verify_cudf_to_cupy.max_diff'],\n", - " replace=replace_spec\n", - ")\n", - "\n", - "print('HEAD points_ddf:\\n{}\\n'.format(points_ddf.head()))\n", - "print('HEAD df_w_cudf:\\n{}\\n'.format(ddf_w_cudf.head()))\n", - "print('HEAD df_w_numba:\\n{}\\n'.format(ddf_w_numba.head()))\n", - "print('HEAD 
df_w_cupy:\\n{}\\n'.format(ddf_w_cupy.head()))\n", - "print('Max Difference cudf to numba: {}'.format(max_cudf_to_numba_diff))\n", - "print('Max Difference cudf to cupy: {}'.format(max_cudf_to_cupy_diff))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One limitation to be aware of when using customized kernels within Nodes in the Dask environment, is that each GPU kernel works on one partition of the dataframe. Therefore if the computation depends on other partitions of the dataframe the approach above does not work." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Saving Custom Nodes and Kernels\n", - "\n", - "The gQuant examples already implement a number of `Nodes`. These can be found in `gquant.plugin_nodes` submodules.\n", - "\n", - "The customized kernels and nodes can be saved to your own python modules for future re-use instead of having to re-define them at runtime. The nodes we defined above were to a written to a python module \"custom_port_nodes.py\" (the `DistanceNode` was simplified to ommit the absolute distance calculation). We will re-run our workflow importing the Nodes from the custom module we wrote out.\n", - "\n", - "When defining the tasks we specify `filepath` for the path to the python module that has the Node definition. Notice, that the `node_type` is specified as a string instead of class. The string is the class name of the node that will be imported for running a task." 
- ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "npartitions = len(client.scheduler_info()['workers'])\n", - "\n", - "points_tspec = {\n", - " TaskSpecSchema.task_id: 'points_task',\n", - " TaskSpecSchema.node_type: 'PointNode',\n", - " TaskSpecSchema.filepath: '/home/quant/gQuant/notebooks/custom_port_nodes.py',\n", - " TaskSpecSchema.conf: {'npts': 1000},\n", - " TaskSpecSchema.inputs: {},\n", - "}\n", - "\n", - "distribute_tspec = {\n", - " TaskSpecSchema.task_id: 'distributed_points',\n", - " TaskSpecSchema.node_type: 'DistributedNode',\n", - " TaskSpecSchema.filepath: '/home/quant/gQuant/notebooks/custom_port_nodes.py',\n", - " TaskSpecSchema.conf: {'npartitions': npartitions},\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'points_task.points_df_out'\n", - " }\n", - "}\n", - "\n", - "dask_cudf_distance_tspec = {\n", - " TaskSpecSchema.task_id: 'distance_by_cudf',\n", - " TaskSpecSchema.node_type: 'DistanceNode',\n", - " TaskSpecSchema.filepath: '/home/quant/gQuant/notebooks/custom_port_nodes.py',\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'distributed_points.points_ddf_out'\n", - " }\n", - "}\n", - "\n", - "dask_numba_distance_tspec = {\n", - " TaskSpecSchema.task_id: 'distance_by_numba',\n", - " TaskSpecSchema.node_type: 'NumbaDistanceNode',\n", - " TaskSpecSchema.filepath: '/home/quant/gQuant/notebooks/custom_port_nodes.py',\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'distributed_points.points_ddf_out'\n", - " }\n", - "}\n", - "\n", - "dask_cupy_distance_tspec = {\n", - " TaskSpecSchema.task_id: 'distance_by_cupy',\n", - " TaskSpecSchema.node_type: 'CupyDistanceNode',\n", - " TaskSpecSchema.filepath: 
'/home/quant/gQuant/notebooks/custom_port_nodes.py',\n", - " TaskSpecSchema.conf: {},\n", - " TaskSpecSchema.inputs: {\n", - " 'points_df_in': 'distributed_points.points_ddf_out'\n", - " }\n", - "}\n", - "\n", - "verify_cudf_to_numba_tspec = {\n", - " TaskSpecSchema.task_id: 'verify_cudf_to_numba',\n", - " TaskSpecSchema.node_type: 'VerifyNode',\n", - " TaskSpecSchema.filepath: '/home/quant/gQuant/notebooks/custom_port_nodes.py',\n", - " TaskSpecSchema.conf: {\n", - " 'df1_col': 'distance_cudf',\n", - " 'df2_col': 'distance_numba'\n", - " }, \n", - " TaskSpecSchema.inputs: {\n", - " 'df1': 'distance_by_cudf.distance_df',\n", - " 'df2': 'distance_by_numba.distance_df'\n", - " }\n", - "}\n", - "\n", - "verify_cudf_to_cupy_tspec = {\n", - " TaskSpecSchema.task_id: 'verify_cudf_to_cupy',\n", - " TaskSpecSchema.node_type: 'VerifyNode',\n", - " TaskSpecSchema.filepath: '/home/quant/gQuant/notebooks/custom_port_nodes.py',\n", - " TaskSpecSchema.conf: {\n", - " 'df1_col': 'distance_cudf',\n", - " 'df2_col': 'distance_cupy'\n", - " }, \n", - " TaskSpecSchema.inputs: {\n", - " 'df1': 'distance_by_cudf.distance_df',\n", - " 'df2': 'distance_by_cupy.distance_df'\n", - " }\n", - "}\n", - "\n", - "task_list = [\n", - " points_tspec,\n", - " distribute_tspec,\n", - " dask_cudf_distance_tspec,\n", - " dask_numba_distance_tspec,\n", - " dask_cupy_distance_tspec,\n", - " verify_cudf_to_numba_tspec,\n", - " verify_cudf_to_cupy_tspec\n", - "]\n", - "\n", - "task_graph = TaskGraph(task_list)\n", - "task_graph.draw(show='ipynb', show_ports=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "75d07ac3c6274bf19dc453c8f0407eda", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" - ] - }, - "metadata": {}, - "output_type": "display_data" 
- } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "HEAD df_w_cudf:\n", - " x y distance_cudf\n", - "0 0.438593 0.505563 0.669296\n", - "1 0.714780 0.531261 0.890589\n", - "2 0.174467 0.734742 0.755171\n", - "3 0.964969 0.986593 1.380048\n", - "4 0.631274 0.819636 1.034558\n", - "\n", - "HEAD df_w_numba:\n", - " x y distance_numba\n", - "0 0.438593 0.505563 0.669296\n", - "1 0.714780 0.531261 0.890589\n", - "2 0.174467 0.734742 0.755171\n", - "3 0.964969 0.986593 1.380048\n", - "4 0.631274 0.819636 1.034558\n", - "\n", - "HEAD df_w_cupy:\n", - " x y distance_cupy\n", - "0 0.438593 0.505563 0.669296\n", - "1 0.714780 0.531261 0.890589\n", - "2 0.174467 0.734742 0.755171\n", - "3 0.964969 0.986593 1.380048\n", - "4 0.631274 0.819636 1.034558\n", - "\n", - "Max Difference cudf to numba: 2.220446049250313e-16\n", - "Max Difference cudf to cupy: 2.220446049250313e-16\n" - ] - } - ], - "source": [ - "out_list = [\n", - " 'distance_by_cudf.distance_df',\n", - " 'distance_by_numba.distance_df',\n", - " 'distance_by_cupy.distance_df',\n", - " 'verify_cudf_to_numba.max_diff',\n", - " 'verify_cudf_to_cupy.max_diff'\n", - "]\n", - "\n", - "(ddf_w_cudf, ddf_w_numba, ddf_w_cupy,\n", - " mdiff_cudf_to_numba, mdiff_cudf_to_cupy) = task_graph.run(out_list)\n", - "\n", - "print('HEAD df_w_cudf:\\n{}\\n'.format(ddf_w_cudf.head()))\n", - "print('HEAD df_w_numba:\\n{}\\n'.format(ddf_w_numba.head()))\n", - "print('HEAD df_w_cupy:\\n{}\\n'.format(ddf_w_cupy.head()))\n", - "print('Max Difference cudf to numba: {}'.format(mdiff_cudf_to_numba))\n", - "print('Max Difference cudf to cupy: {}'.format(mdiff_cudf_to_cupy))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The final illustration is how to save and load a task graph to a file for re-use." 
- ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "task_graph.save_taskgraph('custom_wflow.gq.yaml')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The gQuant TaskGraph file is created and saved. You can double-click on it to open and edit it in JupyterLab.\n", - "\n", - "Or you can display it with the gQuant widget and play with it interactively." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5a0fa6ed517045268a59f0049393b31a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'points_task'), ('type', 'PointNode'), ('conf', {'npts': 1…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph = TaskGraph.load_taskgraph('custom_wflow.gq.yaml')\n", - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Of course, you can run it by calling the `run` method." 
- ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3bffa81276594ac2a655de9e7ee62fb3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(layout=Layout(border='1px solid black')…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "# update npartitions in case the scheduler is running with\n", - "# different number of workers than what was saved.\n", - "npartitions = len(client.scheduler_info()['workers'])\n", - "replace_spec = {\n", - " 'distributed_points': {\n", - " TaskSpecSchema.conf: {'npartitions': npartitions},\n", - " }\n", - "}\n", - "\n", - "out_list = [\n", - " 'distance_by_cudf.distance_df',\n", - " 'distance_by_numba.distance_df',\n", - " 'distance_by_cupy.distance_df',\n", - " 'verify_cudf_to_numba.max_diff',\n", - " 'verify_cudf_to_cupy.max_diff'\n", - "]\n", - "\n", - "task_graph.run(out_list, replace=replace_spec, formated=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "HEAD df_w_cudf:\n", - " x y distance_cudf\n", - "0 0.438593 0.505563 0.669296\n", - "1 0.714780 0.531261 0.890589\n", - "2 0.174467 0.734742 0.755171\n", - "3 0.964969 0.986593 1.380048\n", - "4 0.631274 0.819636 1.034558\n", - "\n", - "HEAD df_w_numba:\n", - " x y distance_numba\n", - "0 0.438593 0.505563 0.669296\n", - "1 0.714780 0.531261 0.890589\n", - "2 0.174467 0.734742 0.755171\n", - "3 0.964969 0.986593 1.380048\n", - "4 0.631274 0.819636 1.034558\n", - "\n", - "HEAD df_w_cupy:\n", - " x y distance_cupy\n", - "0 0.438593 0.505563 0.669296\n", - "1 0.714780 0.531261 0.890589\n", - "2 0.174467 0.734742 0.755171\n", - "3 0.964969 
0.986593 1.380048\n", - "4 0.631274 0.819636 1.034558\n", - "\n" - ] - } - ], - "source": [ - "\n", - "print('HEAD df_w_cudf:\\n{}\\n'.format(ddf_w_cudf.head()))\n", - "print('HEAD df_w_numba:\\n{}\\n'.format(ddf_w_numba.head()))\n", - "print('HEAD df_w_cupy:\\n{}\\n'.format(ddf_w_cupy.head()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Conclusion\n", - "\n", - "Using customized GPU kernels allows data scientists to implement and incorporate advanced algorithms. We demonstrated implementations using Numba and CuPy.\n", - "\n", - "The Numba approach enables data scientists to write GPU kernels directly in the Python language. Numba is easy to use for implementing and accelerating computations. However there is some overhead incurred for compiling the kernels whenever the Numba GPU kernels are used for the first time in a Python process. Currently Numba library only supports primitive data types. Some advanced CUDA programming features, such as function pointers and function recursions are not supported. \n", - "\n", - "The Cupy method is very flexible, because data scientists are writing C/C++ GPU kernels with CUDA directly. All the CUDA programming features are supported. CuPy compiles the kernel and caches the device code to the filesystem. The launch overhead is low. Also, the GPU kernel is built statically resulting in runtime efficiency. However it might be harder for data scientists to use, because C/C++ programming is more complicated. \n", - "\n", - "Below is a brief summary comparison table:\n", - "\n", - "| Methods | Development Difficulty | Flexibility | Efficiency | Latency |\n", - "|---|---|---|---|---|\n", - "| Numba method | medium | medium | low | high |\n", - "| CuPy method | hard | high | high | low |\n", - "\n", - "We recommend that the data scientists select the approach appropriate for their task taking into consideration the efficiency, latency, difficulty and flexibility of their workflow. 
\n", - "\n", - "In this blog, we showed how to wrap the customized GPU kernels in gQuant nodes. Also, by taking advantage of having the gQuant handle the low-level Dask interfaces for the developer, we demonstrated how to use the gQuant workflow with Dask distributed computations." - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "# Clean up\n", - "\n", - "# Shutdown the Dask cluster\n", - "client.close()\n", - "cluster.close()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'ok', 'restart': True}" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import IPython\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/06_xgboost_trade.ipynb b/notebooks/06_xgboost_trade.ipynb deleted file mode 100644 index 30529fd0..00000000 --- a/notebooks/06_xgboost_trade.ipynb +++ /dev/null @@ -1,1277 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Trade with XGBoost algorithm\n", - "## Background\n", - "In the [portfolio trade example](https://github.com/rapidsai/gQuant/blob/master/notebooks/04_portfolio_trade.ipynb), we use gQuant to backtest a simple mean reversion trading strategy on 5000 stocks.\n", - "It shows decent performance by tweaking 
the moving average window size. Searching for alpha signal is the ultimate goal for the trading companies. A lot of different methods are used to do so. The machine learning approach\n", - "is one of those. It has the benefit of extracting important information in the data automatically given enough computation. There are a few popular machine learning algorithms, including SVM, random forests, etc. Among those, XGBoost is known to be a very powerful machine \n", - "learning method that is winning a lot of [ML competitions](https://medium.com/syncedreview/tree-boosting-with-xgboost-why-does-xgboost-win-every-machine-learning-competition-ca8034c0b283). Luckily, the [RAPIDS library](https://github.com/rapidsai) accelerates the XGBoost ML algorithm in the GPU so that we can easily take advantage of it in gQuant. \n", - "\n", - "In this notebook, we are going to demo how to use gQuant to backtest an XGBoost based trading strategy.\n", - "\n", - "\n", - "## Environment Preparation\n", - "\n", - "### Download the example Datasets\n", - "Before getting started, let's download the example datasets if not present." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset is already present. No need to re-download it.\n" - ] - } - ], - "source": [ - "! ((test ! -f './data/stock_price_hist.csv.gz' || test ! -f './data/security_master.csv.gz') && \\\n", - " cd .. && bash download_data.sh) || echo \"Dataset is already present. No need to re-download it.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare for running in Dask environment\n", - "\n", - "Let's start the Dask local cluster environment for distributed computation.\n", - "\n", - "Dask provides a web-based dashboard to help to track progress, identify performance issues, and debug failures. 
To learn more about Dask dashboard, just follow this [link](https://distributed.dask.org/en/latest/web.html).\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

Client

\n", - "\n", - "
\n", - "

Cluster

\n", - "
    \n", - "
  • Workers: 2
  • \n", - "
  • Cores: 2
  • \n", - "
  • Memory: 100.00 GB
  • \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Start the Dask local cluster environment for distrubuted computation\n", - "from dask_cuda import LocalCUDACluster\n", - "from dask.distributed import Client\n", - "\n", - "cluster = LocalCUDACluster()\n", - "client = Client(cluster)\n", - "client\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Though our stock dataset is small enough to fit in a single 16G GPU, to show how to do distributed computation, we will split the dataframe into small pieces to be loaded by different workers in the cluster.\n", - "\n", - "Notice this step is needed only if the dataset is not split into multiple files yet.\n", - "\n", - "First use this simple taskgraph to load data then sort it by the asset id and datetime:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4d38e117bbd3413590030532513d321b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import sys; sys.path.insert(0, '..')\n", - "import warnings\n", - "from gquant.dataframe_flow import TaskGraph\n", - "import ipywidgets as widgets\n", - "import os\n", - "warnings.simplefilter(\"ignore\")\n", - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/sort_stocks.gq.yaml')\n", - "input_cached, = task_graph.run()\n", - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Convert the sorted stock data into partitions and save it into csv files. 
Note, the data is split in a way that the same asset belongs to the same partition" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['/home/quant/gQuant/notebooks/many-small/0.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/1.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/2.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/3.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/4.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/5.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/6.csv',\n", - " '/home/quant/gQuant/notebooks/many-small/7.csv']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import dask.dataframe as dd\n", - "import os\n", - "num_partitions = 8\n", - "\n", - "os.makedirs('many-small', exist_ok=True)\n", - "dd.from_pandas(input_cached.set_index('asset'), npartitions=num_partitions).reset_index().to_csv('many-small/*.csv', index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note, this notebook requires `cudf` of version >=0.8.0. It can be checked with the following command:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.14.0\n" - ] - } - ], - "source": [ - "import cudf\n", - "print(cudf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## The toy example\n", - "To mimic the end-to-end quantitative analyst task, we are going to backtest an XGBoost trading strategy. \n", - "\n", - "We will reuse the preprocessing steps as shown in the portfolio trade notebook example. \n", - "\n", - "The workflow includes the following steps:\n", - "\n", - "1. Preprocess the datasets.\n", - "\n", - "4. Compute the features based on different technical indicators \n", - "\n", - "5. 
Split the data into training and testing and build an XGBoost model based on the training data. From the XGBoost model, compute the trading signals for all the data points.\n", - "\n", - "5. Run backtesting and compute the returns from this strategy for each of the days and stock symbols \n", - "\n", - "6. Run a simple portfolio optimization by averaging the stocks together for each of the trading days.\n", - "\n", - "7. Compute the sharpe ratio and cumulative return results for both training and testing datasets\n", - "\n", - "The whole workflow can be organized into a TaskGraph, which is fully described in a `.gq.yaml` file.\n", - "\n", - "Each node has a unique id, a node type, configuration parameters and input node ids. gQuant takes this yaml file and wires it into a graph to visualize it.\n", - "\n", - "First let's load the preprocess TaskGraph:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/preprocess.gq.yaml')\n", - "task_graph.draw(show='ipynb')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It can be shown in the interactive gQuant widget. 
Try to run it and see the preprocess results:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d75c661b3bbb429d80bf407571d4d0d4", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load the whole TaskGraph with the `load_taskgraph` command. Note the preprocess TaskGraph is included inside the `preprocess` `Composite Node`." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/xgboost_trade.gq.yaml')\n", - "task_graph.draw(show='ipynb')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see the input/output ports of the nodes, turn the `show_ports` flag on" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task_graph.draw(show='ipynb', show_ports=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Similarly, it can be shown in the interactive gQuant widget. 
Try to run it, change the parameters and play with it" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "dfcd077b6b4c4f3ea348138d6310d947", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The features used for the XGBoost algorithm are prepared in the `xgboost` Task node, where the `cuIndicator` module is used to compute the technical indicators in the GPU for all the stock symbols. `xgboost` is the Task node that is used to compute the trading signals from the stock technical indicators. Each gQuant Task node is implemented by overriding the `meta_setup`, `process`, `ports_setup`, and `conf_schema` methods of the Node base class. Please refer to the [customize nodes notebook](https://github.com/rapidsai/gQuant/blob/master/notebooks/05_customize_nodes.ipynb) for details. The following is the source code for \"XGBoostStrategyNode\":" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "class XGBoostStrategyNode(Node):\n", - " \"\"\"\n", - " This is the Node used to compute trading signal from XGBoost Strategy.\n", - " It requires the following conf fields:\n", - " \"train_date\": a date string of \"Y-m-d\" format. All the data points\n", - " before this date is considered as training, otherwise as testing. If\n", - " not provided, all the data points are considered as training.\n", - " \"xgboost_parameters\": a dictionary of any legal parameters for XGBoost\n", - " models. 
It overwrites the default parameters used in the process method\n", - " \"no_feature\": specifying a list of columns in the input dataframe that\n", - " should NOT be considered as training features.\n", - " \"target\": the column that is considered as \"target\" in machine learning\n", - " algorithm\n", - " It requires the \"datetime\" column for spliting the data points and adds a\n", - " new column \"signal\" to be used for backtesting.\n", - " The detailed computation steps are listed in the process method's docstring\n", - " \"\"\"\n", - "\n", - " def init(self):\n", - " _PortTypesMixin.init(self)\n", - " self.INPUT_PORT_NAME = 'stock_in'\n", - " self.OUTPUT_PORT_NAME = 'stock_out'\n", - "\n", - " def meta_setup(self):\n", - " # if 'no_feature' in self.conf:\n", - " # retention = self.conf['no_feature']\n", - " # else:\n", - " cols_required = {'datetime': 'date',\n", - " \"asset\": \"int64\"}\n", - " # self.delayed_process = True\n", - " required = {\n", - " self.INPUT_PORT_NAME: cols_required\n", - " }\n", - " retention = {}\n", - " retention['signal'] = 'float64'\n", - " # _PortTypesMixin.retention_meta_setup(self, retention)\n", - "\n", - " input_meta = self.get_input_meta()\n", - " if self.INPUT_PORT_NAME not in input_meta:\n", - " col_from_inport = required[self.INPUT_PORT_NAME]\n", - " else:\n", - " col_from_inport = input_meta[self.INPUT_PORT_NAME]\n", - " # delete the columns from the inputs\n", - " if 'no_feature' in self.conf:\n", - " for key in self.conf['no_feature']:\n", - " if key in col_from_inport:\n", - " retention[key] = col_from_inport[key]\n", - " metadata = MetaData(inports=required,\n", - " outports={self.OUTPUT_PORT_NAME: retention})\n", - " return metadata\n", - "\n", - " def ports_setup(self):\n", - " types = [cudf.DataFrame,\n", - " dask_cudf.DataFrame]\n", - " return _PortTypesMixin.ports_setup_from_types(self, types)\n", - "\n", - " def conf_schema(self):\n", - " json = {\n", - " \"title\": \"XGBoost Node configure\",\n", - " 
\"type\": \"object\",\n", - " \"description\": \"\"\"Split the data into training and testing based on\n", - " 'train_data', train a XGBoost model based on the training data, \n", - " make predictions for all the data points, compute the trading.\n", - " \"\"\",\n", - " \"properties\": {\n", - " \"num_of_rounds\": {\n", - " \"type\": \"number\",\n", - " \"description\": \"\"\"The number of rounds for boosting\"\"\",\n", - " \"default\": 100\n", - " },\n", - " \"train_date\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"\"\"the date to splite train and validation\n", - " dataset\"\"\"\n", - " },\n", - " \"target\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"the column used as dependent variable\"\n", - " },\n", - " \"no_feature\": {\n", - " \"type\": \"array\",\n", - " \"items\": {\n", - " \"type\": \"string\",\n", - " },\n", - " \"description\": \"\"\"columns in the input dataframe that\n", - " should NOT be considered as training features.\"\"\"\n", - " },\n", - " \"xgboost_parameters\": {\n", - " \"type\": \"object\",\n", - " \"description\": \"xgoobst parameters\",\n", - " \"properties\": {\n", - " 'max_depth': {\n", - " \"type\": \"number\",\n", - " \"description\": \"Maximum depth of a tree.\",\n", - " \"default\": 8\n", - " }, \n", - " \"max_leaves\": {\n", - " \"type\": \"number\",\n", - " \"description\": \"maximum number of tree leaves\",\n", - " \"default\": 2**8\n", - " },\n", - " \"gamma\": {\n", - " \"type\": \"number\",\n", - " \"description\": \"\"\"Minimum loss reduction required\n", - " to make a further partition on a leaf node of the\n", - " tree.\"\"\",\n", - " \"default\": 0\n", - " },\n", - " \"objective\": {\n", - " \"type\": \"string\",\n", - " \"enum\": [\"reg:squarederror\", \"reg:squaredlogerror\",\n", - " \"reg:logistic\", \"reg:pseudohubererror\"],\n", - " \"description\": \"\"\"Specify the learning task and\n", - " the corresponding learning objective.\"\"\",\n", - " \"default\": \"reg:squarederror\"\n", - 
" }\n", - " }\n", - " }\n", - " },\n", - " \"required\": [\"target\", \"num_of_rounds\"],\n", - " }\n", - " ui = {\n", - " \"train_date\": {\n", - " \"ui:widget\": \"alt-date\",\n", - " \"ui:options\": {\n", - " \"yearsRange\": [1985, 2025],\n", - " \"hideNowButton\": True,\n", - " \"hideClearButton\": True,\n", - " },\n", - " },\n", - " }\n", - " input_meta = self.get_input_meta()\n", - " if self.INPUT_PORT_NAME in input_meta:\n", - " col_from_inport = input_meta[self.INPUT_PORT_NAME]\n", - " enums = [col for col in col_from_inport.keys()]\n", - " json['properties']['no_feature']['items']['enum'] = enums\n", - " json['properties']['target']['enum'] = enums\n", - " return ConfSchema(json=json, ui=ui)\n", - " else:\n", - " return ConfSchema(json=json, ui=ui)\n", - "\n", - " def process(self, inputs):\n", - " \"\"\"\n", - " The process is doing following things:\n", - " 1. split the data into training and testing based on provided\n", - " conf['train_date']. If it is not provided, all the data is\n", - " treated as training data.\n", - " 2. train a XGBoost model based on the training data\n", - " 3. Make predictions for all the data points including training and\n", - " testing.\n", - " 4. 
From the prediction of returns, compute the trading signals that\n", - " can be used in the backtesting.\n", - " Arguments\n", - " -------\n", - " inputs: list\n", - " list of input dataframes.\n", - " Returns\n", - " -------\n", - " dataframe\n", - " \"\"\"\n", - " dxgb_params = {\n", - " 'max_depth': 8,\n", - " 'max_leaves': 2 ** 8,\n", - " 'tree_method': 'gpu_hist',\n", - " 'objective': 'reg:squarederror',\n", - " 'grow_policy': 'lossguide',\n", - " }\n", - " # num_of_rounds = 100\n", - " if 'xgboost_parameters' in self.conf:\n", - " dxgb_params.update(self.conf['xgboost_parameters'])\n", - " input_df = inputs[self.INPUT_PORT_NAME]\n", - " model_df = input_df\n", - " train_cols = set(model_df.columns) - set(\n", - " self.conf['no_feature'])\n", - " train_cols = list(train_cols - set([self.conf['target']]))\n", - "\n", - " if isinstance(input_df, dask_cudf.DataFrame):\n", - " # get the client\n", - " client = dask.distributed.client.default_client()\n", - " if 'train_date' in self.conf:\n", - " train_date = datetime.datetime.strptime(self.conf['train_date'], # noqa: F841, E501\n", - " '%Y-%m-%d')\n", - " model_df = model_df[model_df.datetime < train_date]\n", - " train = model_df[train_cols]\n", - " target = model_df[self.conf['target']]\n", - " dmatrix = xgb.dask.DaskDMatrix(client, train, label=target)\n", - " bst = xgb.dask.train(client, dxgb_params, dmatrix,\n", - " num_boost_round=self.conf[\"num_of_rounds\"])\n", - "\n", - " dtrain = xgb.dask.DaskDMatrix(client, input_df[train_cols])\n", - " prediction = xgb.dask.predict(client, bst, dtrain).persist()\n", - " pred_df = dask_cudf.from_dask_dataframe(\n", - " prediction.to_dask_dataframe())\n", - " pred_df.index = input_df.index\n", - " input_df['signal'] = pred_df\n", - " elif isinstance(input_df, cudf.DataFrame):\n", - " if 'train_date' in self.conf:\n", - " train_date = datetime.datetime.strptime(self.conf['train_date'], # noqa: F841, E501\n", - " '%Y-%m-%d')\n", - " model_df = 
model_df.query('datetime<@train_date')\n", - " train = model_df[train_cols]\n", - " target = model_df[self.conf['target']]\n", - " dmatrix = xgb.DMatrix(train, label=target)\n", - " bst = xgb.train(dxgb_params, dmatrix,\n", - " num_boost_round=self.conf[\"num_of_rounds\"])\n", - " infer_dmatrix = xgb.DMatrix(input_df[train_cols])\n", - " prediction = cudf.Series(bst.predict(infer_dmatrix),\n", - " nan_as_null=False,\n", - " index=input_df.index\n", - " ).astype('float64')\n", - " input_df['signal'] = prediction\n", - "\n", - " input_df['tmp'] = (input_df['asset'] -\n", - " input_df['asset'].shift(1)).fillna(1)\n", - " input_df['tmp'] = (input_df['tmp'] != 0).astype('int32')\n", - " tmp = input_df['tmp']\n", - " input_df['tmp'] = tmp.where(tmp != 1, None)\n", - " input_df = input_df.dropna(subset=['tmp'])\n", - " input_df = input_df.drop('tmp', axis=1)\n", - "\n", - " # convert the signal to trading action\n", - " # 1 is buy and -1 is sell\n", - " # It predicts the tomorrow's return (shift -1)\n", - " # We shift 1 for trading actions so that it acts on the second day\n", - " input_df['signal'] = ((\n", - " input_df['signal'] >= 0).astype('float') * 2 - 1).shift(1)\n", - "\n", - " # remove the bad datapints\n", - " input_df = input_df.dropna()\n", - " remaining = list(self.conf['no_feature']) + ['signal']\n", - " return {self.OUTPUT_PORT_NAME: input_df[remaining]}\n", - "\n" - ] - } - ], - "source": [ - "import inspect\n", - "from rapids_modules import XGBoostStrategyNode\n", - "\n", - "print(inspect.getsource(XGBoostStrategyNode))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### XGBoost Trading Strategy Performance\n", - "Similar to tensorflow, gQuant graph is evaluated by specifying the output nodes and input nodes replacement. We first look at the column result from data preparation node." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "output meta of node node_technical_indicator:\n", - "MetaData(inports={'stock_in': {'indicator': 'int32', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'float64', 'returns': 'float64'}}, outports={'stock_out': {'CH_OS_10_20': 'float64', 'BO_BA_b1_10': 'float64', 'BO_BA_b2_10': 'float64', 'SHIFT_-1': 'float64', 'indicator': 'int32', 'returns': 'float64', 'datetime': 'date', 'asset': 'int64', 'volume': 'float64', 'close': 'float64', 'open': 'float64', 'high': 'float64', 'low': 'float64'}})\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "print('output meta of node node_technical_indicator:')\n", - "task_graph.build()\n", - "pprint(task_graph['technical_indicator'].meta_setup())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It adds the columns \"BO_BA_b1_10\", \"BO_BA_b2_10\", \"CH_OS_10_20\" as features and \"SHIFT_-1\" as the target, which is the next day's return. A good feature should be one that provides the most information about the next day's return. In case we have no prior information about it,\n", - "we can compute as many features as we like and leave it to XGBoost to find the right combination of those features. \n", - "\n", - "Evaluate the leaf nodes of the backtesting graph with the gQuant `run` method." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id:stock_data process time:4.398s\n", - "id:preprocess process time:0.568s\n", - "id:sort_after process time:0.054s\n", - "id:technical_indicator process time:0.116s\n", - "id:xgboost process time:2.268s\n", - "id:backtest process time:0.002s\n", - "id:train_df process time:0.007s\n", - "id:portfolio_opt_train process time:0.024s\n", - "id:sharpe_ratio_trn process time:0.001s\n", - "id:cumulative_return_trn process time:0.023s\n", - "id:validation_df process time:0.006s\n", - "id:portfolio_opt_validation process time:0.019s\n", - "id:sharpe_ratio_val process time:0.001s\n", - "id:cumulative_return_val process time:0.022s\n", - "CPU times: user 7.36 s, sys: 1.33 s, total: 8.7 s\n", - "Wall time: 8.31 s\n" - ] - } - ], - "source": [ - "%%time\n", - "output_list = ['sharpe_ratio_trn.sharpe_out',\n", - " 'cumulative_return_trn.cum_return',\n", - " 'sharpe_ratio_val.sharpe_out',\n", - " 'cumulative_return_val.cum_return',\n", - " 'sort_after.out']\n", - "o_gpu = task_graph.run(output_list, profile=True)\n", - "cached_sort = o_gpu['sort_after.out']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define a function to organize the plot results. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "da9ca5b730cd4cb58b7c84ba0a0b558b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale(), side=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# define the function to format the plots\n", - "def plot_figures(o):\n", - " # format the figures\n", - " figure_width = '1200px'\n", - " figure_height = '400px'\n", - " sharpe_number = o['sharpe_ratio_trn.sharpe_out']\n", - " cum_return_train = o['cumulative_return_trn.cum_return']\n", - " cum_return_train.layout.height = figure_height\n", - " cum_return_train.layout.width = figure_width\n", - " cum_return_train.title = 'Training P & L %.3f' % (sharpe_number)\n", - " sharpe_number = o['sharpe_ratio_val.sharpe_out']\n", - " cum_return_test = o['cumulative_return_val.cum_return']\n", - " cum_return_test.layout.height = figure_height\n", - " cum_return_test.layout.width = figure_width\n", - " cum_return_test.title = 'Testing P & L %.3f' % (sharpe_number)\n", - "\n", - " return widgets.VBox([cum_return_train, cum_return_test])\n", - "plot_figures(o_gpu)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The XGBoost model does a good job to predict the next day of return. It overfits in the training dataset and gets Sharpe Ratio of 5 as shown in the figure above. In the testing period, it gets Sharpe Ratio of 1.\n", - "\n", - "The example model runs in a single GPU because of the small dataset. But in real world, the dataset usually is so large that it doesn't fit in a single GPU. Luckily, the XGBoost library natively supports multiple nodes and multiple GPU training by using Dask. 
You can scale out the computation using Dask dataframe.\n", - "\n", - "To show how easy it is to do distributed computation, let's run the above exmaple in the Dask environment for educational purpose. \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To run the whole workflow, simply change the `preprocess` node to get Dask Dataframe and run the graph again. Here we look at the testing results:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id:stock_data process time:0.012s\n", - "id:preprocess process time:10.520s\n", - "id:xgboost process time:4.284s\n", - "id:backtest process time:0.013s\n", - "id:train_df process time:0.160s\n", - "id:portfolio_opt_train process time:0.040s\n", - "id:sharpe_ratio_trn process time:1.833s\n", - "id:cumulative_return_trn process time:1.462s\n", - "id:validation_df process time:0.007s\n", - "id:portfolio_opt_validation process time:0.042s\n", - "id:sharpe_ratio_val process time:1.401s\n", - "id:cumulative_return_val process time:1.400s\n", - "CPU times: user 12 s, sys: 690 ms, total: 12.7 s\n", - "Wall time: 35.8 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "replace_spec = {'preprocess': {\"inputs\": {\"sort_node@in\": \"stock_data.dask_cudf_out\"}}}\n", - "o_gpu = task_graph.run(replace=replace_spec, profile=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5bce35d115d64011bfa699b9404d32c9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale(), side=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_figures(o_gpu)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - 
"\n", - "Clearly, 3 feautres is way too little here. gQuant implmented 36 technical indicators. We can change the configuration of node_technical_indicator node to include more features." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "chaikin_para0 = 10\n", - "chaikin_para1 = 20\n", - "bollinger_para = 10\n", - "macd_para0 = 2\n", - "macd_para1 = 3\n", - "rsi_para0 = 5\n", - "atr_para0 = 10\n", - "sod_para = 2\n", - "mflow_para = 3\n", - "findex_para = 5\n", - "adis_para = 5\n", - "ccindex_para = 5\n", - "bvol_para = 3\n", - "vindex_para = 3\n", - "mindex_para0 = 10\n", - "mindex_para1 = 15\n", - "tindex_para0 = 5\n", - "tindex_para1 = 10\n", - "emove_para = 5\n", - "cc_para = 15\n", - "kchannel_para = 10\n", - "indicator_conf = {\n", - " \"indicators\": [\n", - " {\"function\": \"port_chaikin_oscillator\",\n", - " \"columns\": [\"high\", \"low\", \"close\", \"volume\"],\n", - " \"args\": [chaikin_para0, chaikin_para1]\n", - " },\n", - " {\"function\": \"port_bollinger_bands\",\n", - " \"columns\": [\"close\"],\n", - " \"args\": [bollinger_para],\n", - " \"outputs\": [\"b1\", \"b2\"]\n", - " },\n", - " {\"function\": \"port_macd\",\n", - " \"columns\": [\"close\"],\n", - " \"args\": [macd_para0, macd_para1],\n", - " \"outputs\": [\"MACDsign\", \"MACDdiff\"]\n", - " },\n", - " {\"function\": \"port_relative_strength_index\",\n", - " \"columns\": [\"high\", \"low\"],\n", - " \"args\": [rsi_para0],\n", - " },\n", - " {\"function\": \"port_average_true_range\",\n", - " \"columns\": [\"high\", \"low\", \"close\"],\n", - " \"args\": [atr_para0],\n", - " },\n", - " {\"function\": \"port_stochastic_oscillator_k\",\n", - " \"columns\": [\"high\", \"low\", \"close\"],\n", - " \"args\": [],\n", - " },\n", - " {\"function\": \"port_stochastic_oscillator_d\",\n", - " \"columns\": [\"high\", \"low\", \"close\"],\n", - " \"args\": [sod_para],\n", - " },\n", - " {\"function\": \"port_money_flow_index\",\n", - " 
\"columns\": [\"high\", \"low\", \"close\", \"volume\"],\n", - " \"args\": [mflow_para],\n", - " },\n", - " {\"function\": \"port_force_index\",\n", - " \"columns\": [\"close\", \"volume\"],\n", - " \"args\": [findex_para],\n", - " },\n", - " {\"function\": \"port_ultimate_oscillator\",\n", - " \"columns\": [\"high\",\"low\",\"close\"],\n", - " \"args\": [],\n", - " },\n", - " {\"function\": \"port_accumulation_distribution\",\n", - " \"columns\": [\"high\",\"low\",\"close\",\"volume\"],\n", - " \"args\": [adis_para],\n", - " },\n", - " {\"function\": \"port_commodity_channel_index\",\n", - " \"columns\": [\"high\",\"low\",\"close\"],\n", - " \"args\": [ccindex_para],\n", - " },\n", - " {\"function\": \"port_on_balance_volume\",\n", - " \"columns\": [\"close\", \"volume\"],\n", - " \"args\": [bvol_para],\n", - " },\n", - " {\"function\": \"port_vortex_indicator\",\n", - " \"columns\": [\"high\", \"low\", \"close\"],\n", - " \"args\": [vindex_para],\n", - " },\n", - " {\"function\": \"port_kst_oscillator\",\n", - " \"columns\": [\"close\"],\n", - " \"args\": [3, 4, 5, 6, 7, 8, 9, 10],\n", - " },\n", - " {\"function\": \"port_mass_index\",\n", - " \"columns\": [\"high\", \"low\"],\n", - " \"args\": [mindex_para0, mindex_para1],\n", - " },\n", - " {\"function\": \"port_true_strength_index\",\n", - " \"columns\": [\"close\"],\n", - " \"args\": [tindex_para0, tindex_para1],\n", - " },\n", - " {\"function\": \"port_ease_of_movement\",\n", - " \"columns\": [\"high\", \"low\", \"volume\"],\n", - " \"args\": [emove_para],\n", - " },\n", - " {\"function\": \"port_coppock_curve\",\n", - " \"columns\": [\"close\"],\n", - " \"args\": [cc_para],\n", - " },\n", - " {\"function\": \"port_keltner_channel\",\n", - " \"columns\": [\"high\", \"low\", \"close\"],\n", - " \"args\": [kchannel_para],\n", - " \"outputs\": [\"KelChD\", \"KelChM\", \"KelChU\"]\n", - " },\n", - " {\"function\": \"port_ppsr\",\n", - " \"columns\": [\"high\", \"low\", \"close\"],\n", - " \"args\": [],\n", - " 
\"outputs\": [\"PP\", \"R1\", \"S1\", \"R2\", \"S2\", \"R3\", \"S3\"]\n", - " },\n", - " {\"function\": \"port_shift\",\n", - " \"columns\": [\"returns\"],\n", - " \"args\": [-1]\n", - " } \n", - " ],\n", - " \"remove_na\": True\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "dfcd077b6b4c4f3ea348138d6310d947", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(cache={'height': 410.7, 'width': 1369, 'nodes': [{'width': 140, 'id': 'stock_data', 'type': 'CsvS…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the backtesting again" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id:technical_indicator process time:5.200s\n", - "id:xgboost process time:6.102s\n", - "id:backtest process time:0.003s\n", - "id:train_df process time:0.007s\n", - "id:portfolio_opt_train process time:0.024s\n", - "id:sharpe_ratio_trn process time:0.001s\n", - "id:cumulative_return_trn process time:0.023s\n", - "id:validation_df process time:0.005s\n", - "id:portfolio_opt_validation process time:0.018s\n", - "id:sharpe_ratio_val process time:0.001s\n", - "id:cumulative_return_val process time:0.023s\n", - "CPU times: user 10.4 s, sys: 2.37 s, total: 12.8 s\n", - "Wall time: 12.2 s\n" - ] - } - ], - "source": [ - "%%time\n", - "replace_spec = {}\n", - "replace_spec['technical_indicator'] = {\"conf\": indicator_conf}\n", - "\n", - "replace_spec['sort_after'] = {\"load\": {'out': cached_sort}}\n", - "\n", - "o_gpu = task_graph.run(replace=replace_spec, profile=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - 
"application/vnd.jupyter.widget-view+json": { - "model_id": "4e70cea236204b8a9033b190f4d7cd94", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale(), side=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_figures(o_gpu)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We get Sharpe Ratio of `1.93` in the testing dataset, not bad!\n", - "\n", - "Using `min_volume=400.0`, it selects 1558 stocks. Setting a lower threshhold, it can include more stocks for the backtesting and hence increase the Sharpe Ratio. But it runs out of memory of single GPU. We have shown Dask can help to break down the large task into small tasks and schedule them a distributed environment. So we can handle dataset of any sizes in this way:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id:stock_data process time:0.011s\n", - "id:preprocess process time:8.173s\n", - "id:xgboost process time:8.112s\n", - "id:backtest process time:0.010s\n", - "id:train_df process time:0.008s\n", - "id:portfolio_opt_train process time:0.041s\n", - "id:sharpe_ratio_trn process time:2.313s\n", - "id:cumulative_return_trn process time:2.392s\n", - "id:validation_df process time:0.008s\n", - "id:portfolio_opt_validation process time:0.042s\n", - "id:sharpe_ratio_val process time:2.314s\n", - "id:cumulative_return_val process time:2.251s\n", - "CPU times: user 12.5 s, sys: 1.47 s, total: 14 s\n", - "Wall time: 47.7 s\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d3f9031bdd434290a8692e1156378e19", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale()), Axis…" - 
] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%%time\n", - "min_volume = 4.0\n", - "min_rate = -10.0\n", - "max_rate = 10.0\n", - "replace_spec={}\n", - "replace_spec['technical_indicator'] = {\"conf\": indicator_conf}\n", - "\n", - "replace_spec['node_filterValue']={\"conf\": [{\"column\": \"volume_mean\", \"min\": min_volume},\n", - " {\"column\": \"returns_max\", \"max\": max_rate},\n", - " {\"column\": \"returns_min\", \"min\": min_rate}]}\n", - "replace_spec['preprocess'] = {\"conf\": {\"subnodes_conf\": {\n", - " \"value_filter\": {\n", - " \"conf\": [{\"column\": \"average_volume\", \"min\": min_volume},\n", - " {\"column\": \"max_return\", \"max\": max_rate},\n", - " {\"column\": \"min_return\", \"min\": min_rate}]\n", - " },\n", - " \"drop_columns\": {\n", - " \"conf\": {\n", - " \"columns\": [\"average_volume\", \"min_return\", \"max_return\"]\n", - " }\n", - " }\n", - " },\n", - " \"taskgraph\": \"taskgraphs/preprocess.gq.yaml\",\n", - " \"input\": [\"sort_node.in\"],\n", - " \"output\": [\"drop_columns.out\"]\n", - " },\n", - " \"inputs\": {\"sort_node@in\": \"stock_data.dask_cudf_out\"}}\n", - "\n", - "o_gpu = task_graph.run(replace=replace_spec, profile=True)\n", - "plot_figures(o_gpu)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8640f861899747c5995a54dd8e2fe68b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale(), side=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_figures(o_gpu)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We get Sharpe Ratio of `4.7` in the testing dataset. This is a great improvement!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Strategy parameter search\n", - "Quantitative analyst usually need to explore different parameters for their trading strategy. The exploration process is an iterative process. gQuant help to speed up this by allowing using cached dataframe and evaluating the sub-graphs.\n", - "\n", - "To find the optimal technical indicator parameters for this XGBoost strategy, we build a wiget to search the parameter interactively. " - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e4c2a1671bd54563b66c26e87864017a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(VBox(children=(IntRangeSlider(value=(10, 20), continuous_update=False, description='Chaikin', m…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotutils\n", - "replace_spec={}\n", - "replace_spec['technical_indicator'] = {\"conf\": indicator_conf}\n", - "replace_spec['sort_after'] = {\"load\": {'out': cached_sort}}\n", - "plotutils.getXGBoostWidget(replace_spec, task_graph, plot_figures)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Conclusions\n", - "In this notebook, we demoed how to use gQuant to backtest XGBoost trading strategy. It is convenient and efficient to use indicator node from the gQuant to compute features for all the stocks in the dataset in the GPU. The XGBoost training are computed in the GPU, so we can get the results quickly. This example shows the XGBoost algorithm's power in finding trading signals. We can achieve close to 2 raw Sharpe ratio in the testing time period." 
- ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'ok', 'restart': True}" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import IPython\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/07_fractional_differencing.ipynb b/notebooks/07_fractional_differencing.ipynb deleted file mode 100644 index 3c6c22c9..00000000 --- a/notebooks/07_fractional_differencing.ipynb +++ /dev/null @@ -1,825 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fractional Differencing\n", - "\n", - "### Background\n", - "Fractional Differencing is a signal processing technique that is used to remove the non-stationarity from the time series while maintaining as much memory as possible. It is widely used in FSI to prepare training data for machine learning algorithms. In this [open-source project](https://github.com/ritchieng/fractional_differencing_gpu/blob/master/notebooks/gpu_fractional_differencing.ipynb) done by Ensemble Capital, fractional differencing computation is accelerated via `cudf.appy_chunk` method in the GPU. 
It achieves hundreds of times acceleration compared with CPU implementation in their [report](https://www.researchgate.net/publication/335159299_GFD_GPU_Fractional_Differencing_for_Rapid_Large-scale_Stationarizing_of_Time_Series_Data_while_Minimizing_Memory_Loss). \n", - "Using `apply_rows` and `apply_chunks` method from the cudf library is the easiest way of customizing GPU computations as covered in this [blog](https://medium.com/rapids-ai/user-defined-functions-in-rapids-cudf-2d7c3fc2728d). However, it is not the most efficient way.\n", - "\n", - "In this notebook, we are going to show how to use Numba to do fractional differencing computation efficiently. As gQuant wrap the fractional differencing function in the computation node, we are going to show it is easy for data scientists to compute fractional differencing signals and use them to generate alpha signals. \n", - "\n", - "### Environment Preparation" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys; sys.path.insert(0, '..')\n", - "\n", - "import warnings\n", - "import gquant\n", - "import ipywidgets as widgets\n", - "import os\n", - "import time\n", - "import numpy as np\n", - "from numba import cuda\n", - "import cudf\n", - "import inspect\n", - "from numba import njit\n", - "from numba import prange\n", - "from gquant.dataframe_flow.task import load_modules\n", - "load_modules(os.environ['MODULEPATH']+'/rapids_modules')\n", - "from rapids_modules.cuindicator import get_weights_floored, fractional_diff\n", - "warnings.simplefilter(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copy the fractional differencing code from the [open-source project](https://github.com/ritchieng/fractional_differencing_gpu/blob/master/notebooks/gpu_fractional_differencing.ipynb). 
We will use this as our benchmark reference" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def moving_dot_product_kernel(in_data, out, window_size, weights):\n", - " # Set the first window_size-1 rows in each chunk to np.nan due \n", - " # insufficient history\n", - " for i in range(cuda.threadIdx.x, window_size - 1, cuda.blockDim.x):\n", - " out[i] = np.nan\n", - " \n", - " # Compute dot product of preceding window_size rows\n", - " for i in range(cuda.threadIdx.x + window_size - 1, in_data.size, cuda.blockDim.x):\n", - " rolling_dot_product = 0.0\n", - " \n", - " k = 0\n", - " for j in range(i - window_size + 1, i + 1):\n", - " rolling_dot_product += in_data[j] * weights[k][0]\n", - " k += 1\n", - " \n", - " out[i] = rolling_dot_product \n", - " \n", - "def frac_diff_gpu(df, d, floor=1e-3):\n", - " r\"\"\"Fractionally difference time series via GPU.\n", - " \n", - " Args:\n", - " df (pd.DataFrame): dataframe of raw time series values.\n", - " d (float): differencing value from 0 to 1 where > 1 has no FD.\n", - " floor (float): minimum value of weights, ignoring anything smaller.\n", - " \"\"\"\n", - " \n", - " # Bring dataframe to GPU, reset index for GPU dot product kernel\n", - " # gdf_raw = cudf.from_pandas(df).reset_index(drop=True)\n", - " gdf_raw = df\n", - " gdf_raw.columns = ['in_data']\n", - "\n", - " # Get weights window\n", - " weights = get_weights_floored(d=d, num_k=len(gdf_raw), floor=floor)\n", - " weights_window_size = len(weights)\n", - " \n", - " # Reverse weights and as contiguous\n", - " weights = np.ascontiguousarray(weights[::-1])\n", - " \n", - " # Bring weights to GPU\n", - " gdf_weights = cudf.DataFrame()\n", - " gdf_weights[gdf_raw.columns[0]] = weights.reshape(-1)\n", - "\n", - " # Length of data\n", - " data_length = len(gdf_raw)\n", - "\n", - " # T4: max of 518 threads per block.\n", - " # V100: max 1024 threads per block\n", - " threads_per_block = 518\n", - "\n", - " 
# Chunk size split\n", - " # This has to be improved, but as a v0.1, it's sufficient to show speed-up\n", - " # Up to easily 100 million data points\n", - " trunk_size = data_length\n", - "\n", - " # Get fractionally differenced time series through GPU function\n", - " gdf_raw_fd = gdf_raw.apply_chunks(moving_dot_product_kernel,\n", - " incols=['in_data'],\n", - " outcols=dict(out=np.float64),\n", - " kwargs=dict(window_size=weights_window_size, weights=weights),\n", - " chunks=list(range(0, data_length, trunk_size)) + [data_length],\n", - " tpb=threads_per_block)\n", - " \n", - " # Bring to CPU for normal manipulation\n", - " # df_raw_fd = gdf_raw_fd.to_pandas().dropna().iloc[:-1, 1]\n", - " \n", - " return gdf_raw_fd, weights" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Following is the gQuant's fractional differencing implementation via Numba library" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def fractional_diff(input_arr, d=0.5, floor=1e-3, min_periods=None,\n", - " thread_tile=2, number_of_threads=512):\n", - " \"\"\"\n", - " The fractional difference computation method.\n", - "\n", - " Arguments:\n", - " -------\n", - " input_arr: numba.cuda.DeviceNDArray or cudf.Series\n", - " the input array to compute the fractional difference\n", - " d: float\n", - " the differencing value. range from 0 to 1\n", - " floor: float\n", - " minimum value for the weights for computational efficiency.\n", - " min_periods: int\n", - " default the lengths of the weights. 
Need at least min_periods of\n", - " non-na elements to get fractional difference value\n", - " thread_tile: int\n", - " each thread will be responsible for `thread_tile` number of\n", - " elements in window computation\n", - " number_of_threads: int\n", - " number of threads in a block for CUDA computation\n", - "\n", - " Returns\n", - " -------\n", - " (numba.cuda.DeviceNDArray, np.array)\n", - " the computed fractional difference array and the weight array tuple\n", - "\n", - " \"\"\"\n", - " if isinstance(input_arr, numba.cuda.cudadrv.devicearray.DeviceNDArray):\n", - " gpu_in = input_arr\n", - " else:\n", - " gpu_in = input_arr.to_gpu_array()\n", - "\n", - " # compute the weights for the fractional difference\n", - " weights = get_weights_floored(d=d,\n", - " num_k=len(input_arr),\n", - " floor=floor)[::-1, 0]\n", - " weights_out = np.ascontiguousarray(weights)\n", - " weights = numba.cuda.to_device(weights_out)\n", - "\n", - " window = len(weights)\n", - "\n", - " if min_periods is None:\n", - " min_periods = window\n", - " else:\n", - " min_periods = min_periods\n", - "\n", - " number_of_threads = number_of_threads\n", - " array_len = len(gpu_in)\n", - "\n", - " # allocate the output array\n", - " gpu_out = numba.cuda.device_array_like(gpu_in)\n", - "\n", - " number_of_blocks = \\\n", - " (array_len + (number_of_threads * thread_tile - 1)) // \\\n", - " (number_of_threads * thread_tile)\n", - "\n", - " shared_buffer_size = (number_of_threads * thread_tile +\n", - " window - 1 + window)\n", - "\n", - " # call the conv kernel\n", - " kernel[(number_of_blocks,),\n", - " (number_of_threads,),\n", - " 0,\n", - " shared_buffer_size * 8](gpu_in,\n", - " weights,\n", - " gpu_out,\n", - " window,\n", - " array_len,\n", - " thread_tile,\n", - " min_periods)\n", - " return gpu_out, weights_out\n", - "\n" - ] - } - ], - "source": [ - "print(inspect.getsource(fractional_diff))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It launches the 
Numba kernel, which defined as:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "@cuda.jit(device=True)\n", - "def conv_window(shared, history_len, out_arr, window_size,\n", - " arr_len, offset, offset2, min_size):\n", - " \"\"\"\n", - " This function is to do convolution for one thread\n", - "\n", - " Arguments:\n", - " ------\n", - " shared: numba.cuda.DeviceNDArray\n", - " 3 chunks of data are stored in the shared memory\n", - " the first [0, window_size) elements is the chunk of data that is\n", - " necessary to compute the first convolution element.\n", - " then [window_size, window_size + thread_tile * blockDim) elements\n", - " are the inputs allocated for this block of threads\n", - " the last [window_size + thread_tile,\n", - " window_size + thread_tile + window_size) is to store the kernel values\n", - " history_len: int\n", - " total number of historical elements available for this chunk of data\n", - " out_arr: numba.cuda.DeviceNDArray\n", - " output gpu_array of size of `thread_tile`\n", - " window_size: int\n", - " the number of elements in the kernel\n", - " arr_len: int\n", - " the chunk array length, same as `thread_tile`\n", - " offset: int\n", - " indicate the starting index of the chunk array in the shared for\n", - " this thread.\n", - " offset: int\n", - " indicate the starting position of the weights/kernel array\n", - " min_size: int\n", - " the minimum number of non-na elements\n", - " \"\"\"\n", - " for i in range(arr_len):\n", - " if i + history_len < window_size-1:\n", - " out_arr[i] = np.nan\n", - " else:\n", - " s = 0.0\n", - " average_size = 0\n", - " for j in range(0, window_size):\n", - " if not (cmath.isnan(\n", - " shared[offset + i - j])):\n", - " s += (shared[offset + i - j] *\n", - " shared[offset2 + window_size - 1 - j])\n", - " average_size += 1\n", - " if average_size >= min_size:\n", - " out_arr[i] = s\n", - " else:\n", - " out_arr[i] = np.nan\n", - "\n", - " 
\n", - "@cuda.jit\n", - "def kernel(in_arr, weight_arr, out_arr, window,\n", - " arr_len, thread_tile, min_size):\n", - " \"\"\"\n", - " This kernel is to do 1D convlution on `in_arr` array with `weight_arr`\n", - " as kernel. The results is saved on `out_arr`.\n", - "\n", - " Arguments:\n", - " ------\n", - " in_arr: numba.cuda.DeviceNDArray\n", - " input gpu array\n", - " weight_arr: numba.cuda.DeviceNDArray\n", - " convolution kernel gpu array\n", - " out_arr: numba.cuda.DeviceNDArray\n", - " output gpu_array\n", - " window: int\n", - " the number of elements in the weight_arr\n", - " arr_len: int\n", - " the input/output array length\n", - " thread_tile: int\n", - " each thread is responsible for `thread_tile` number of elements\n", - " min_size: int\n", - " the minimum number of non-na elements\n", - " \"\"\"\n", - " shared = cuda.shared.array(shape=0,\n", - " dtype=numba.float64)\n", - " block_size = cuda.blockDim.x # total number of threads\n", - " tx = cuda.threadIdx.x\n", - " # Block id in a 1D grid\n", - " bid = cuda.blockIdx.x\n", - " starting_id = bid * block_size * thread_tile\n", - "\n", - " # copy the thread_tile * number_of_thread_per_block into the shared\n", - " for j in range(thread_tile):\n", - " offset = tx + j * block_size\n", - " if (starting_id + offset) < arr_len:\n", - " shared[offset + window - 1] = in_arr[\n", - " starting_id + offset]\n", - " cuda.syncthreads()\n", - "\n", - " # copy the window - 1 into the shared\n", - " for j in range(0, window - 1, block_size):\n", - " if (((tx + j) <\n", - " window - 1) and (\n", - " starting_id - window + 1 + tx + j >= 0)):\n", - " shared[tx + j] = \\\n", - " in_arr[starting_id - window + 1 + tx + j]\n", - " cuda.syncthreads()\n", - " # copy the weights into the shared\n", - " for j in range(0, window, block_size):\n", - " element_id = tx + j\n", - " if (((tx + j) < window) and (element_id < window)):\n", - " shared[thread_tile * block_size + window - 1 + tx +\n", - " j] = weight_arr[tx + j]\n", - 
" cuda.syncthreads()\n", - " # slice the shared memory for each threads\n", - " start_shared = tx * thread_tile\n", - " his_len = min(window - 1,\n", - " starting_id + tx * thread_tile)\n", - " # slice the global memory for each threads\n", - " start = starting_id + tx * thread_tile\n", - " end = min(starting_id + (tx + 1) * thread_tile, arr_len)\n", - " sub_outarr = out_arr[start:end]\n", - " sub_len = end - start\n", - " conv_window(shared, his_len, sub_outarr,\n", - " window, sub_len,\n", - " window - 1 + start_shared,\n", - " thread_tile * block_size + window - 1,\n", - " min_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fractional differencing is essentially doing 1D convolution computation with the kernel values set to be the weights computed from get_weights_floored. Check the original notebook for the details of the meanings of the weights. To make convolution computation faster, we divide the long input array into small chunks and send to different thread blocks. All the array chunks and the weights are loaded into the GPU shared memory for fast IO. The device function conv_window is doing the convolution computation for one thread.\n", - "\n", - "To make a fair comparsion with CPU implementation, we implemented an efficient CPU version of the fractional differencing calculation. It is accelerated by numba.njit that take advantage of multiple cores of the CPU and fastmath compiler optimization." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "@njit(fastmath=True, parallel=True)\n", - "def moving_dot_product_cpu(in_data, out, window_size, weights):\n", - " # Set the first window_size-1 rows in each chunk to np.nan due \n", - " # insufficient history\n", - " for i in prange(0, window_size - 1):\n", - " out[i] = np.nan\n", - " \n", - " # Compute dot product of preceding window_size rows\n", - " for i in prange(window_size - 1, len(in_data)):\n", - " rolling_dot_product = 0.0\n", - " \n", - " k = 0\n", - " for j in range(i - window_size + 1, i + 1):\n", - " rolling_dot_product += in_data[j] * weights[k]\n", - " k += 1\n", - " \n", - " out[i] = rolling_dot_product \n", - "\n", - "def cpu_fractional_diff(input_arr, d=0.5, floor=1e-3):\n", - "\n", - " # compute the weights for the fractional difference\n", - " weights = get_weights_floored(d=d,\n", - " num_k=len(input_arr),\n", - " floor=floor)[::-1, 0]\n", - " weights_out = np.ascontiguousarray(weights)\n", - " weights = weights_out\n", - " weights_window_size = len(weights)\n", - " window = len(weights)\n", - " out = np.zeros_like(input_arr)\n", - " moving_dot_product_cpu(input_arr, out, weights_window_size, weights)\n", - " return out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fractional differencing is essentially doing 1D convolution computation with the kernel values set to be the weights computed from `get_weights_floored`. Check the original [notebook](https://github.com/ritchieng/fractional_differencing_gpu/blob/master/notebooks/gpu_fractional_differencing.ipynb) for the details of the meanings of the weights. To make convolution computation faster, we divide the long input array into small chunks and send to different thread blocks. All the array chunks and the weights are loaded into the GPU shared memory for fast IO. 
The device function `conv_window` is doing the convolution computation for one thread.\n", - "\n", - "We can compare the performance of gQuant GPU implementation vs the original one and CPU implementation:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "array size 100000, Ensemble: time 0.503 s, gQuant GPU Time 0.537 s, gQuant CPU Time 0.942, speed up 0.94, speed up vs CPU 1.75, error 0.0000 \n", - "array size 1000000, Ensemble: time 0.138 s, gQuant GPU Time 0.018 s, gQuant CPU Time 0.044, speed up 7.74, speed up vs CPU 2.48, error 0.0000 \n", - "array size 10000000, Ensemble: time 0.994 s, gQuant GPU Time 0.032 s, gQuant CPU Time 0.134, speed up 31.11, speed up vs CPU 4.19, error 0.0000 \n", - "array size 100000000, Ensemble: time 9.350 s, gQuant GPU Time 0.263 s, gQuant CPU Time 1.162, speed up 35.54, speed up vs CPU 4.42, error 0.0000 \n" - ] - } - ], - "source": [ - "for i in range(5, 9):\n", - " df_raw = cudf.DataFrame()\n", - " ran_array = np.random.rand(10**int(i))\n", - " df_raw['in'] = ran_array\n", - " df_raw2 = cudf.DataFrame()\n", - " df_raw2['in'] = ran_array\n", - "\n", - " # Start timer\n", - " start = time.time()\n", - " df_raw_fd_from_gpu, weights = frac_diff_gpu(df_raw, d=0.5, floor=5e-5)\n", - " # End timer\n", - " end = time.time()\n", - " duration = end - start\n", - "\n", - " start = time.time()\n", - " gquant_gpu, weights = fractional_diff(df_raw2['in'], d=0.5, floor=5e-5)\n", - " cuda.synchronize()\n", - " end = time.time()\n", - " optimized_duration = end - start\n", - " #(df_raw_fd_from_gpu.values)\n", - " \n", - " \n", - " start = time.time()\n", - " cpu_result = cpu_fractional_diff(ran_array, d=0.5, floor=5e-5)\n", - " end = time.time()\n", - " cpu_duration = end - start\n", - " \n", - " err = np.abs(df_raw_fd_from_gpu['out'].to_array()[weights.size-1:] - np.array(gquant_gpu)[weights.size-1:]).max()\n", - " err = 
max(np.abs(df_raw_fd_from_gpu['out'].to_array()[weights.size-1:] - cpu_result[weights.size-1:]).max(), err)\n", - " print('array size %d, Ensemble: time %.3f s, gQuant GPU Time %.3f s, gQuant CPU Time %.3f, speed up %.2f, speed up vs CPU %.2f, error %.4f ' % (10**int(i), duration, optimized_duration, cpu_duration, duration / optimized_duration, cpu_duration/optimized_duration, err))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For the array of length 100m, gQuant can achieve 100x speedup compare with the Ensemble Capitial's GPU implementatoin and 30x speed up compared with multiple core CPU." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Use the fractional differencing signal to trade stocks\n", - "\n", - "We will use the same [XGBoost example](https://github.com/rapidsai/gQuant/blob/master/notebooks/06_xgboost_trade.ipynbx) to do backtest with fractional differencing signals. The workflow includes the following steps:\n", - "\n", - "1. Preprocess the datasets.\n", - "\n", - "2. Compute the features based on different fractional differencing signals of the closing prices of the stocks \n", - "\n", - "3. Split the data in training and testing and build a XGBoost model based on the training data. From the XGBoost model, compute the trading signals for all the data points.\n", - "\n", - "4. Run backtesting and compute the returns from this strategy for each of the days and stock symbols \n", - "\n", - "5. Run a simple portfolio optimization by averaging the stocks together for each of the trading days.\n", - "\n", - "6. Compute the Sharpe ratio and cumulative return results for both training and testing datasets\n", - "\n", - "The whole workflow can be organized into a computation graph, which are fully described in a yaml file. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Each nodes has a unique id, a node type, configuration parameters and input nodes ids. 
gQuant takes this yaml file, wires it into a graph to visualize it. " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%reset -s -f\n", - "import sys\n", - "import os\n", - "sys.path.append('..')\n", - "import gquant\n", - "from gquant.dataframe_flow import TaskGraph\n", - "import ipywidgets as widgets\n", - "import warnings\n", - "warnings.simplefilter(\"ignore\")\n", - "\n", - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/xgboost_trade.gq.yaml')\n", - "task_graph.draw(show='ipynb')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The features used for XGBoost algorithm are prepared in the `xgboost` Task node, where `cuIndicator` module is used to compute the technical indicators in the GPU for all the stock symbols. `xgboost` is the Task node that is used to compute the trading signals from the stock technical indicators. Each of the gQuant Task node is implemented by overwriting `meta_setup`, `process`, `ports_setup`, `conf_chema` methods of the Node base class. Please refer to [customize nodes notebook](https://github.com/rapidsai/gQuant/blob/master/notebooks/05_customize_nodes.ipynb) for details. 
Following is the source code for \"XGBoostStrategyNode\":" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# define the function to format the plots\n", - "def plot_figures(o):\n", - " # format the figures\n", - " figure_width = '1200px'\n", - " figure_height = '400px'\n", - " sharpe_number = o['sharpe_ratio_trn.sharpe_out']\n", - " cum_return_train = o['cumulative_return_trn.cum_return']\n", - " cum_return_train.layout.height = figure_height\n", - " cum_return_train.layout.width = figure_width\n", - " cum_return_train.title = 'Training P & L %.3f' % (sharpe_number)\n", - " sharpe_number = o['sharpe_ratio_val.sharpe_out']\n", - " cum_return_test = o['cumulative_return_val.cum_return']\n", - " cum_return_test.layout.height = figure_height\n", - " cum_return_test.layout.width = figure_width\n", - " cum_return_test.title = 'Testing P & L %.3f' % (sharpe_number)\n", - " return widgets.VBox([cum_return_train, cum_return_test])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this example, we are going to add 5 fractional differencing signals from the closing prices " - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "indicator_conf = {\n", - " \"indicators\": [\n", - " {\"function\": \"port_fractional_diff\",\n", - " \"columns\": [\"close\"],\n", - " \"args\": [0.5]\n", - " },\n", - " {\"function\": \"port_fractional_diff\",\n", - " \"columns\": [\"close\"],\n", - " \"args\": [0.3]\n", - " },\n", - " {\"function\": \"port_fractional_diff\",\n", - " \"columns\": [\"close\"],\n", - " \"args\": [0.1]\n", - " },\n", - " {\"function\": \"port_fractional_diff\",\n", - " \"columns\": [\"close\"],\n", - " \"args\": [0.7]\n", - " },\n", - " {\"function\": \"port_fractional_diff\",\n", - " \"columns\": [\"close\"],\n", - " \"args\": [0.9]\n", - " },\n", - " {\"function\": \"port_shift\",\n", - " \"columns\": [\"returns\"],\n", 
- " \"args\": [-1]\n", - " } \n", - " ],\n", - " \"remove_na\": True\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the backtest" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id:stock_data process time:4.121s\n", - "id:preprocess process time:1.224s\n", - "id:sort_after process time:0.049s\n", - "id:technical_indicator process time:0.906s\n", - "id:xgboost process time:2.382s\n", - "id:backtest process time:0.002s\n", - "id:train_df process time:0.152s\n", - "id:portfolio_opt_train process time:0.024s\n", - "id:sharpe_ratio_trn process time:0.001s\n", - "id:cumulative_return_trn process time:0.023s\n", - "id:validation_df process time:0.005s\n", - "id:portfolio_opt_validation process time:0.017s\n", - "id:sharpe_ratio_val process time:0.001s\n", - "id:cumulative_return_val process time:0.021s\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f8d5c36efa0d416d813680cd601e0eaf", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Figure(axes=[Axis(label='Cumulative return', orientation='vertical', scale=LinearScale()), Axis…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "replace_spec = {}\n", - "replace_spec['technical_indicator'] = {\"conf\": indicator_conf}\n", - "\n", - "o_gpu = task_graph.run(replace=replace_spec, profile=True)\n", - "\n", - "plot_figures(o_gpu)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We get Sharpe Ratio of `1.01` just from the fractional differencing signals of the closing prices\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To visualize the computed fractional differencing signals, we can make a TaskGraph to visualize it. We put the XGboost trade TaskGraph into a composite node. 
We select the asset with id `22123` and plot 4 fractional differencing signals with different `d` values. Check the updated graph below. Note, there are 2 layers of composite nodes in the following graph. As you can see, composite node is a powerful way of organizing the TaskGraphs. " - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task_graph = TaskGraph.load_taskgraph('../taskgraphs/visualize_frac_diff.gq.yaml')\n", - "task_graph.draw(show='ipynb')" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "62f1268c0a7d40b9b486ee91867c3372", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "GQuantWidget(sub=HBox(), value=[OrderedDict([('id', 'stock_data'), ('type', 'CsvStockLoader'), ('conf', {'file…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.draw()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7c637075d90e4b4ba525d75bede99d83", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Tab(children=(Output(), Output(layout=Layout(border='1px solid black'), outputs=({'output_type': 'stream', 'na…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "task_graph.run(formated=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can run the sub-graph just for plotting the signals." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note, smaller `d` value signal has more memory information but not as stationary as the high `d` value signals. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Conclusions\n", - "In this notebook, we demoed how to use Numba to implemement fractional differencing calculation in GPU. It achieves 100x speed up compared with the method done by Ensemble Capital. We also showed it is easy to use gQuant to compute fractional difference and run backtests" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'ok', 'restart': True}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import IPython\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/task_example/cpu_port_trade.yaml b/task_example/cpu_port_trade.yaml deleted file mode 100644 index 4e81a3d2..00000000 --- a/task_example/cpu_port_trade.yaml +++ /dev/null @@ -1,143 +0,0 @@ -- id: node_cpu_csvdata - type: PandasCsvStockLoader - conf: - path: ./data/stock_price_hist.csv.gz - inputs: [] -- id: node_sort - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - node_cpu_csvdata -- id: node_addReturn - type: CpuReturnFeatureNode - conf: {} - inputs: - - node_sort -- id: 
node_addIndicator - type: CpuAssetIndicatorNode - conf: {} - inputs: - - node_addReturn -- id: node_volumeMean - type: AverageNode - conf: - column: volume - inputs: - - node_addIndicator -- id: node_renameMeanVolume - type: RenameNode - conf: - old: volume - new: volume_mean - inputs: - - node_volumeMean -- id: node_leftMergeMeanVolume - type: LeftMergeNode - conf: - column: asset - inputs: - - node_addIndicator - - node_renameMeanVolume -- id: node_maxReturns - type: MaxNode - conf: - column: returns - inputs: - - node_addIndicator -- id: node_renameMaxReturn - type: RenameNode - conf: - old: returns - new: returns_max - inputs: - - node_maxReturns -- id: node_leftMergeMaxReturn - type: LeftMergeNode - conf: - column: asset - inputs: - - node_leftMergeMeanVolume - - node_renameMaxReturn -- id: node_minReturns - type: MinNode - conf: - column: returns - inputs: - - node_addIndicator -- id: node_renameMinReturn - type: RenameNode - conf: - old: returns - new: returns_min - inputs: - - node_minReturns -- id: node_leftMergeMinReturn - type: LeftMergeNode - conf: - column: asset - inputs: - - node_leftMergeMaxReturn - - node_renameMinReturn -- id: node_filterValue - type: ValueFilterNode - conf: - - column: volume_mean - min: 10.0 - - column: returns_max - max: 10.0 - - column: returns_min - min: -10.0 - inputs: - - node_leftMergeMinReturn -- id: node_dropColumns - type: DropNode - conf: - columns: - - volume_mean - - returns_min - - returns_max - - open - - high - - low - - volume - inputs: - - node_filterValue -- id: node_sort2 - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - node_dropColumns -- id: node_exp_strategy - type: CpuPortExpMovingAverageStrategyNode - conf: - fast: 5 - slow: 20 - inputs: - - node_sort2 -- id: node_backtest - type: SimpleBackTestNode - conf: {} - inputs: - - node_exp_strategy -- id: node_portfolioOpt - type: SimpleAveragePortOpt - conf: {} - inputs: - - node_backtest -- id: node_sharpeRatio - type: SharpeRatioNode - 
conf: {} - inputs: - - node_portfolioOpt -- id: node_cumlativeReturn - type: CumReturnNode - conf: {'points': 300} - inputs: - - node_portfolioOpt - diff --git a/task_example/dask_port_trade.yaml b/task_example/dask_port_trade.yaml deleted file mode 100644 index b424efd0..00000000 --- a/task_example/dask_port_trade.yaml +++ /dev/null @@ -1,143 +0,0 @@ -- id: node_dask_csvdata - type: DaskCsvStockLoader - conf: - path: many-small - inputs: [] -- id: node_sort - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - node_dask_csvdata -- id: node_addReturn - type: ReturnFeatureNode - conf: {} - inputs: - - node_sort -- id: node_addIndicator - type: AssetIndicatorNode - conf: {} - inputs: - - node_addReturn -- id: node_volumeMean - type: AverageNode - conf: - column: volume - inputs: - - node_addIndicator -- id: node_renameMeanVolume - type: RenameNode - conf: - old: volume - new: volume_mean - inputs: - - node_volumeMean -- id: node_leftMergeMeanVolume - type: LeftMergeNode - conf: - column: asset - inputs: - - node_addIndicator - - node_renameMeanVolume -- id: node_maxReturns - type: MaxNode - conf: - column: returns - inputs: - - node_addIndicator -- id: node_renameMaxReturn - type: RenameNode - conf: - old: returns - new: returns_max - inputs: - - node_maxReturns -- id: node_leftMergeMaxReturn - type: LeftMergeNode - conf: - column: asset - inputs: - - node_leftMergeMeanVolume - - node_renameMaxReturn -- id: node_minReturns - type: MinNode - conf: - column: returns - inputs: - - node_addIndicator -- id: node_renameMinReturn - type: RenameNode - conf: - old: returns - new: returns_min - inputs: - - node_minReturns -- id: node_leftMergeMinReturn - type: LeftMergeNode - conf: - column: asset - inputs: - - node_leftMergeMaxReturn - - node_renameMinReturn -- id: node_filterValue - type: ValueFilterNode - conf: - - column: volume_mean - min: 200.0 - - column: returns_max - max: 10.0 - - column: returns_min - min: -10.0 - inputs: - - node_leftMergeMinReturn 
-- id: node_dropColumns - type: DropNode - conf: - columns: - - volume_mean - - returns_min - - returns_max - - open - - high - - low - - volume - inputs: - - node_filterValue -- id: node_sort2 - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - node_dropColumns -- id: node_exp_strategy - type: PortExpMovingAverageStrategyNode - conf: - fast: 5 - slow: 20 - inputs: - - node_sort2 -- id: node_backtest - type: SimpleBackTestNode - conf: {} - inputs: - - node_exp_strategy -- id: node_portfolioOpt - type: SimpleAveragePortOpt - conf: {} - inputs: - - node_backtest -- id: node_sharpeRatio - type: SharpeRatioNode - conf: {} - inputs: - - node_portfolioOpt -- id: node_cumlativeReturn - type: CumReturnNode - conf: {'points': 300} - inputs: - - node_portfolioOpt - diff --git a/task_example/dask_task.yaml b/task_example/dask_task.yaml deleted file mode 100644 index 7a2494a1..00000000 --- a/task_example/dask_task.yaml +++ /dev/null @@ -1,31 +0,0 @@ -- id: node_csvdata_dask - type: DaskCsvStockLoader - conf: - path: many-small - inputs: [] -- id: node_minVolume - type: VolumeFilterNode - conf: - min: 50.0 - inputs: - - node_csvdata_dask -- id: node_sort - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - node_minVolume -- id: node_volumeMean - type: AverageNode - conf: - column: volume - inputs: - - node_sort -- id: node_outputCsv - type: OutCsvNode - conf: - path: symbol_volume.csv - inputs: - - node_volumeMean diff --git a/task_example/port_trade.yaml b/task_example/port_trade.yaml deleted file mode 100644 index b27a81e5..00000000 --- a/task_example/port_trade.yaml +++ /dev/null @@ -1,142 +0,0 @@ -- id: load_csv_data - type: CsvStockLoader - conf: - path: ./data/stock_price_hist.csv.gz - inputs: [] -- id: sort - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - load_csv_data -- id: add_return - type: ReturnFeatureNode - conf: {} - inputs: - - sort -- id: add_indicator - type: AssetIndicatorNode - conf: {} - inputs: - - 
add_return -- id: volume_mean - type: AverageNode - conf: - column: volume - inputs: - - add_indicator -- id: rename_mean_volume - type: RenameNode - conf: - old: volume - new: volume_mean - inputs: - - volume_mean -- id: left_merge_mean_volume - type: LeftMergeNode - conf: - column: asset - inputs: - - add_indicator - - rename_mean_volume -- id: max_returns - type: MaxNode - conf: - column: returns - inputs: - - add_indicator -- id: rename_max_return - type: RenameNode - conf: - old: returns - new: returns_max - inputs: - - max_returns -- id: left_merge_max_return - type: LeftMergeNode - conf: - column: asset - inputs: - - left_merge_mean_volume - - rename_max_return -- id: min_returns - type: MinNode - conf: - column: returns - inputs: - - add_indicator -- id: rename_min_return - type: RenameNode - conf: - old: returns - new: returns_min - inputs: - - min_returns -- id: left_merge_min_return - type: LeftMergeNode - conf: - column: asset - inputs: - - left_merge_max_return - - rename_min_return -- id: filter_value - type: ValueFilterNode - conf: - - column: volume_mean - min: 10.0 - - column: returns_max - max: 10.0 - - column: returns_min - min: -10.0 - inputs: - - left_merge_min_return -- id: drop_columns - type: DropNode - conf: - columns: - - volume_mean - - returns_min - - returns_max - - open - - high - - low - - volume - inputs: - - filter_value -- id: sort_2 - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - drop_columns -- id: exp_strategy - type: PortExpMovingAverageStrategyNode - conf: - fast: 5 - slow: 20 - inputs: - - sort_2 -- id: backtest - type: SimpleBackTestNode - conf: {} - inputs: - - exp_strategy -- id: portfolio_opt - type: SimpleAveragePortOpt - conf: {} - inputs: - - backtest -- id: sharpe_ratio - type: SharpeRatioNode - conf: {} - inputs: - - portfolio_opt -- id: cumlative_return - type: CumReturnNode - conf: {'points': 300} - inputs: - - portfolio_opt \ No newline at end of file diff --git 
a/task_example/simple_task.yaml b/task_example/simple_task.yaml deleted file mode 100644 index 6e5f7ae3..00000000 --- a/task_example/simple_task.yaml +++ /dev/null @@ -1,67 +0,0 @@ -- id: node_csvdata - type: CsvStockLoader - conf: - path: ./data/stock_price_hist.csv.gz - inputs: [] -- id: node_minVolume - type: VolumeFilterNode - conf: - min: 50.0 - inputs: - - node_csvdata -- id: node_sort - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - node_minVolume -- id: node_addReturn - type: ReturnFeatureNode - conf: {} - inputs: - - node_sort -- id: node_stockSymbol - type: StockNameLoader - conf: - path: ./data/security_master.csv.gz - inputs: [] -- id: node_volumeMean - type: AverageNode - conf: - column: volume - inputs: - - node_addReturn -- id: node_leftMerge1 - type: LeftMergeNode - conf: - column: asset - inputs: - - node_volumeMean - - node_stockSymbol -- id: node_returnMean - type: AverageNode - conf: - column: returns - inputs: - - node_addReturn -- id: node_leftMerge2 - type: LeftMergeNode - conf: - column: asset - inputs: - - node_returnMean - - node_stockSymbol -- id: node_outputCsv1 - type: OutCsvNode - conf: - path: symbol_volume.csv - inputs: - - node_leftMerge1 -- id: node_outputCsv2 - type: OutCsvNode - conf: - path: symbol_returns.csv - inputs: - - node_leftMerge2 diff --git a/task_example/simple_trade.yaml b/task_example/simple_trade.yaml deleted file mode 100644 index 32e63581..00000000 --- a/task_example/simple_trade.yaml +++ /dev/null @@ -1,65 +0,0 @@ -- id: load_csv_data - type: CsvStockLoader - conf: - path: ./data/stock_price_hist.csv.gz - inputs: [] -- id: node_assetFilter - type: AssetFilterNode - conf: - asset: 22123 - inputs: - - load_csv_data -- id: node_sort - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - node_assetFilter -- id: node_addReturn - type: ReturnFeatureNode - conf: {} - inputs: - - node_sort -- id: node_ma_strategy - type: MovingAverageStrategyNode - conf: - fast: 5 - slow: 10 - inputs: 
- - node_addReturn -- id: node_backtest - type: SimpleBackTestNode - conf: {} - inputs: - - node_ma_strategy -- id: node_sharpeRatio - type: SharpeRatioNode - conf: {} - inputs: - - node_backtest -- id: node_cumlativeReturn - type: CumReturnNode - conf: {'points': 300} - inputs: - - node_backtest -- id: node_barplot - type: BarPlotNode - conf: {'points': 300} - inputs: - - node_backtest -- id: node_lineplot - type: LinePlotNode - conf: - points: 300 - lines: - - column: ma_slow - label: Slow - color: blue - - column: ma_fast - label: Fast - color: green - title: Signals - inputs: - - node_backtest diff --git a/task_example/xgboost_trade.yaml b/task_example/xgboost_trade.yaml deleted file mode 100644 index 1f1212e5..00000000 --- a/task_example/xgboost_trade.yaml +++ /dev/null @@ -1,206 +0,0 @@ -- id: node_csvdata - type: CsvStockLoader - conf: - path: ./data/stock_price_hist.csv.gz - inputs: [] -- id: node_sort - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - node_csvdata -- id: node_addReturn - type: ReturnFeatureNode - conf: {} - inputs: - - node_sort -- id: node_addIndicator - type: AssetIndicatorNode - conf: {} - inputs: - - node_addReturn -- id: node_volumeMean - type: AverageNode - conf: - column: volume - inputs: - - node_addIndicator -- id: node_renameMeanVolume - type: RenameNode - conf: - old: volume - new: volume_mean - inputs: - - node_volumeMean -- id: node_leftMergeMeanVolume - type: LeftMergeNode - conf: - column: asset - inputs: - - node_addIndicator - - node_renameMeanVolume -- id: node_maxReturns - type: MaxNode - conf: - column: returns - inputs: - - node_addIndicator -- id: node_renameMaxReturn - type: RenameNode - conf: - old: returns - new: returns_max - inputs: - - node_maxReturns -- id: node_leftMergeMaxReturn - type: LeftMergeNode - conf: - column: asset - inputs: - - node_leftMergeMeanVolume - - node_renameMaxReturn -- id: node_minReturns - type: MinNode - conf: - column: returns - inputs: - - node_addIndicator -- id: 
node_renameMinReturn - type: RenameNode - conf: - old: returns - new: returns_min - inputs: - - node_minReturns -- id: node_leftMergeMinReturn - type: LeftMergeNode - conf: - column: asset - inputs: - - node_leftMergeMaxReturn - - node_renameMinReturn -- id: node_filterValue - type: ValueFilterNode - conf: - - column: volume_mean - min: 10.0 - - column: returns_max - max: 10.0 - - column: returns_min - min: -10.0 - inputs: - - node_leftMergeMinReturn -- id: node_dropColumns - type: DropNode - conf: - columns: - - volume_mean - - returns_min - - returns_max - inputs: - - node_filterValue -- id: node_sort2 - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - - node_dropColumns -- id: node_technical_indicator - type: IndicatorNode - conf: - indicators: - - function: port_chaikin_oscillator - columns: - - high - - low - - close - - volume - args: - - 10 - - 20 - - function: port_bollinger_bands - columns: - - close - args: - - 10 - outputs: - - b1 - - b2 - - function: port_shift - columns: - - returns - args: - - -1 - remove_na: true - inputs: - - node_sort2 -- id: node_xgboost_strategy - type: XGBoostStrategyNode - conf: - train_date: 2010-1-1 - target: SHIFT_-1 - no_feature: - asset: int64 - datetime: date - volume: float64 - close: float64 - open: float64 - high: float64 - low: float64 - returns: float64 - indicator: int32 - inputs: - - node_technical_indicator -- id: node_backtest - type: SimpleBackTestNode - conf: {} - inputs: - - node_xgboost_strategy -- id: node_training_df - type: DatetimeFilterNode - conf: - beg: 1900-1-1 - end: 2010-1-1 - inputs: - - node_backtest -- id: node_portOpt2 - type: SimpleAveragePortOpt - conf: {} - inputs: - - node_training_df -- id: node_sharpe_training - type: SharpeRatioNode - conf: {} - inputs: - - node_portOpt2 -- id: node_testing_df - type: DatetimeFilterNode - conf: - beg: 2010-1-1 - end: 2020-1-1 - inputs: - - node_backtest -- id: node_portOpt1 - type: SimpleAveragePortOpt - conf: {} - inputs: - - 
node_testing_df -- id: node_sharpe_testing - type: SharpeRatioNode - conf: {} - inputs: - - node_portOpt1 -- id: node_cumlativeReturn_testing - type: CumReturnNode - conf: {'points': 300} - inputs: - - node_portOpt1 -- id: node_cumlativeReturn_training - type: CumReturnNode - conf: {'points': 300} - inputs: - - node_portOpt2 diff --git a/taskgraphs/dask_tutorial.gq.yaml b/taskgraphs/dask_tutorial.gq.yaml deleted file mode 100644 index 30648726..00000000 --- a/taskgraphs/dask_tutorial.gq.yaml +++ /dev/null @@ -1,35 +0,0 @@ -- id: stock_data - type: CsvStockLoader - conf: - file: /home/quant/gQuant/notebooks/data/stock_price_hist.csv.gz - path: /home/quant/gQuant/notebooks/many-small - inputs: {} - module: rapids_modules -- id: sort_node - type: SortNode - conf: - keys: - - asset - - datetime - inputs: - in: stock_data.dask_cudf_out - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} - inputs: - in1: output_csv.df_out -- id: average_volume - type: AverageNode - conf: - column: volume - inputs: - stock_in: sort_node.out - module: rapids_modules -- id: output_csv - type: OutCsvNode - conf: - path: /home/quant/gQuant/notebooks/dask_average_volume.csv - inputs: - df_in: average_volume.stock_out - module: rapids_modules diff --git a/taskgraphs/visualize_frac_diff.gq.yaml b/taskgraphs/visualize_frac_diff.gq.yaml deleted file mode 100644 index ac58e2df..00000000 --- a/taskgraphs/visualize_frac_diff.gq.yaml +++ /dev/null @@ -1,113 +0,0 @@ -- id: stock_data - type: CsvStockLoader - conf: - file: notebooks/data/stock_price_hist.csv.gz - path: notebooks/many-small - inputs: {} - module: rapids_modules -- id: asset_filter - type: AssetFilterNode - conf: - asset: 22123 - inputs: - stock_in: xgboost_graph.technical_indicator@stock_out - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} - inputs: - in1: lineplot.lineplot -- id: lineplot - type: LinePlotNode - conf: - lines: - - column: FR_DI_0.1 - label: d 0.1 - color: blue - - column: 
FR_DI_0.3 - label: d 0.3 - color: orange - - column: FR_DI_0.5 - label: d 0.5 - color: green - - column: FR_DI_0.7 - label: d 0.7 - color: black - points: 300 - title: signals - inputs: - in: asset_filter.stock_out - module: rapids_modules -- id: xgboost_graph - type: CompositeNode - conf: - input: - - preprocess.sort_node@in - output: - - technical_indicator.stock_out - subnode_ids: - - preprocess - - technical_indicator - subnodes_conf: - preprocess: - conf: - input: - - sort_node.in - output: - - drop_columns.out - subnode_ids: - - value_filter - - drop_columns - subnodes_conf: - value_filter: - conf: - - column: average_volume - min: 400 - - column: min_return - min: -10 - - column: max_return - max: 10 - drop_columns: - conf: - columns: - - average_volume - - min_return - - max_return - taskgraph: taskgraphs/preprocess.gq.yaml - technical_indicator: - conf: - indicators: - - function: port_fractional_diff - args: - - 0.9 - columns: - - close - - function: port_fractional_diff - args: - - 0.7 - columns: - - close - - function: port_fractional_diff - args: - - 0.5 - columns: - - close - - function: port_fractional_diff - args: - - 0.3 - columns: - - close - - function: port_fractional_diff - args: - - 0.1 - columns: - - close - - function: port_shift - args: - - -1 - columns: - - returns - remove_na: true - taskgraph: taskgraphs/xgboost_trade.gq.yaml - inputs: - preprocess@sort_node@in: stock_data.cudf_out diff --git a/taskgraphs/xgboost_example/stock_data.gq.yaml b/taskgraphs/xgboost_example/stock_data.gq.yaml deleted file mode 100644 index 521361b2..00000000 --- a/taskgraphs/xgboost_example/stock_data.gq.yaml +++ /dev/null @@ -1,229 +0,0 @@ -- id: stock_data - type: CsvStockLoader - conf: - file: notebooks/data/stock_price_hist.csv.gz - path: notebooks/many-small - inputs: {} - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} - inputs: - in1: drop_col.out -- id: stock_feature - type: CompositeNode - conf: - input: - - 
preprocess.sort_node@in - output: - - technical_indicator.stock_out - subnode_ids: - - technical_indicator - subnodes_conf: - technical_indicator: - conf: - indicators: - - function: port_bollinger_bands - args: - - 10 - columns: - - close - - function: port_chaikin_oscillator - args: - - 2 - - 3 - columns: - - high - - low - - close - - volume - - function: port_macd - args: - - 2 - - 3 - columns: - - close - - function: port_relative_strength_index - args: - - 2 - columns: - - high - - low - - function: port_average_true_range - args: - - 2 - columns: - - high - - low - - close - - function: port_stochastic_oscillator_k - args: - - 2 - columns: - - high - - low - - close - - function: port_stochastic_oscillator_d - args: - - 2 - columns: - - high - - low - - close - - function: port_money_flow_index - args: - - 2 - columns: - - high - - low - - close - - volume - - function: port_force_index - args: - - 2 - columns: - - close - - volume - - function: port_ultimate_oscillator - args: - - 2 - columns: - - high - - low - - close - - function: port_accumulation_distribution - args: - - 2 - columns: - - high - - low - - close - - volume - - function: port_commodity_channel_index - args: - - 2 - columns: - - high - - low - - close - - function: port_on_balance_volume - args: - - 2 - columns: - - close - - volume - - function: port_vortex_indicator - args: - - 2 - columns: - - high - - low - - close - - function: port_kst_oscillator - args: - - 3 - - 4 - - 5 - - 6 - - 7 - - 8 - - 9 - - 10 - columns: - - close - - function: port_mass_index - args: - - 2 - - 3 - columns: - - high - - low - - function: port_true_strength_index - args: - - 2 - - 3 - columns: - - close - - function: port_ease_of_movement - args: - - 2 - columns: - - high - - low - - volume - - function: port_coppock_curve - args: - - 2 - columns: - - close - - function: port_keltner_channel - args: - - 2 - columns: - - high - - low - - close - - function: port_ppsr - args: - - 2 - columns: - - high - - low - 
- close - - function: port_fractional_diff - args: - - 0.9 - columns: - - close - - function: port_fractional_diff - args: - - 0.7 - columns: - - close - - function: port_fractional_diff - args: - - 0.5 - columns: - - close - - function: port_fractional_diff - args: - - 0.3 - columns: - - close - - function: port_fractional_diff - args: - - 0.1 - columns: - - close - - function: port_shift - args: - - -1 - columns: - - returns - remove_na: true - taskgraph: taskgraphs/xgboost_trade.gq.yaml - inputs: - preprocess@sort_node@in: stock_data.cudf_out -- id: pos_neg_return - type: AddSignIndicatorNode - conf: - sign: sign - column: SHIFT_-1 - inputs: - in: stock_feature.technical_indicator@stock_out - module: rapids_modules -- id: drop_col - type: DropNode - conf: - columns: - - indicator - - datetime - - asset - - SHIFT_-1 - - open - - high - - low - - close - inputs: - in: pos_neg_return.out - module: rapids_modules diff --git a/taskgraphs/xgboost_example/xgboost_stock.gq.yaml b/taskgraphs/xgboost_example/xgboost_stock.gq.yaml deleted file mode 100644 index 788ff956..00000000 --- a/taskgraphs/xgboost_example/xgboost_stock.gq.yaml +++ /dev/null @@ -1,300 +0,0 @@ -- id: stock_data - type: CsvStockLoader - conf: - file: notebooks/data/stock_price_hist.csv.gz - path: notebooks/many-small - inputs: {} - module: rapids_modules -- id: "" - type: Output_Collector - conf: {} - inputs: - in1: test_roc.roc_curve - in2: train_roc.roc_curve - in3: feature_importance.importance_curve - in4: xgboost_model.train_xgboost@model_out -- id: stock_feature - type: CompositeNode - conf: - input: - - preprocess.sort_node@in - output: - - technical_indicator.stock_out - subnode_ids: - - technical_indicator - subnodes_conf: - technical_indicator: - conf: - indicators: - - function: port_bollinger_bands - args: - - 10 - columns: - - close - - function: port_chaikin_oscillator - args: - - 2 - - 3 - columns: - - high - - low - - close - - volume - - function: port_macd - args: - - 2 - - 3 - 
columns: - - close - - function: port_relative_strength_index - args: - - 2 - columns: - - high - - low - - function: port_average_true_range - args: - - 2 - columns: - - high - - low - - close - - function: port_stochastic_oscillator_k - args: - - 2 - columns: - - high - - low - - close - - function: port_stochastic_oscillator_d - args: - - 2 - columns: - - high - - low - - close - - function: port_money_flow_index - args: - - 2 - columns: - - high - - low - - close - - volume - - function: port_force_index - args: - - 2 - columns: - - close - - volume - - function: port_ultimate_oscillator - args: - - 2 - columns: - - high - - low - - close - - function: port_accumulation_distribution - args: - - 2 - columns: - - high - - low - - close - - volume - - function: port_commodity_channel_index - args: - - 2 - columns: - - high - - low - - close - - function: port_on_balance_volume - args: - - 2 - columns: - - close - - volume - - function: port_vortex_indicator - args: - - 2 - columns: - - high - - low - - close - - function: port_kst_oscillator - args: - - 3 - - 4 - - 5 - - 6 - - 7 - - 8 - - 9 - - 10 - columns: - - close - - function: port_mass_index - args: - - 2 - - 3 - columns: - - high - - low - - function: port_true_strength_index - args: - - 2 - - 3 - columns: - - close - - function: port_ease_of_movement - args: - - 2 - columns: - - high - - low - - volume - - function: port_coppock_curve - args: - - 2 - columns: - - close - - function: port_keltner_channel - args: - - 2 - columns: - - high - - low - - close - - function: port_ppsr - args: - - 2 - columns: - - high - - low - - close - - function: port_fractional_diff - args: - - 0.9 - columns: - - close - - function: port_fractional_diff - args: - - 0.7 - columns: - - close - - function: port_fractional_diff - args: - - 0.5 - columns: - - close - - function: port_fractional_diff - args: - - 0.3 - columns: - - close - - function: port_fractional_diff - args: - - 0.1 - columns: - - close - - function: port_shift 
- args: - - -1 - columns: - - returns - remove_na: true - taskgraph: taskgraphs/xgboost_trade.gq.yaml - inputs: - preprocess@sort_node@in: stock_data.dask_cudf_out -- id: pos_neg_return - type: AddSignIndicatorNode - conf: - sign: sign - column: SHIFT_-1 - inputs: - in: stock_feature.technical_indicator@stock_out - module: rapids_modules -- id: drop_col - type: DropNode - conf: - columns: - - indicator - - datetime - - asset - - SHIFT_-1 - - open - - high - - low - - close - inputs: - in: pos_neg_return.out - module: rapids_modules -- id: split_data - type: DataSplittingNode - conf: - train_size: 0.8 - target: sign - inputs: - in: drop_col.out - module: rapids_modules -- id: xgboost_model - type: CustXGBoostNode - conf: - train_norm: - conf: - columns: - - sign - include: false - train_xgboost: - conf: - num_of_rounds: 100 - columns: - - sign - include: false - xgboost_parameters: - eta: 0.3 - min_child_weight: 1 - subsample: 1 - sampling_method: uniform - colsample_bytree: 1 - colsample_bylevel: 1 - colsample_bynode: 1 - max_depth: 8 - max_leaves: 256 - grow_policy: depthwise - gamma: 0 - lambda: 1 - alpha: 0 - tree_method: gpu_hist - single_precision_histogram: false - deterministic_histogram: false - objective: binary:logistic - target: sign - inputs: - test_norm@df_in: split_data.test - train_norm@df_in: split_data.train - module: my_node -- id: train_roc - type: RocCurveNode - conf: - label: sign - prediction: predict - inputs: - in: xgboost_model.train_infer@out - module: rapids_modules -- id: test_roc - type: RocCurveNode - conf: - label: sign - prediction: predict - inputs: - in: xgboost_model.test_infer@out - module: rapids_modules -- id: feature_importance - type: ImportanceCurveNode - conf: - type: gain - inputs: - in: xgboost_model.train_xgboost@model_out - module: rapids_modules diff --git a/taskgraphs/xgboost_example/xgboost_stock_hpo.gq.yaml b/taskgraphs/xgboost_example/xgboost_stock_hpo.gq.yaml deleted file mode 100644 index 2712ec61..00000000 --- 
a/taskgraphs/xgboost_example/xgboost_stock_hpo.gq.yaml +++ /dev/null @@ -1,382 +0,0 @@ -- id: "" - type: Output_Collector - conf: {} - inputs: - in1: xgboost_model.train_roc@roc_curve - in2: xgboost_model.test_roc@roc_curve - in3: hpo.conf_out - in4: hpo.train_roc@roc_curve - in5: hpo.test_roc@roc_curve -- id: xgboost_model - type: ContextCompositeNode - conf: - input: - - train_norm.df_in - - test_norm.df_in - output: - - train_infer.out - - test_infer.out - - train_roc.roc_curve - - test_roc.roc_curve - context: - target: - type: string - value: sign - map: - - node_id: train_xgboost - xpath: train_xgboost.conf.target - - node_id: train_roc - xpath: train_roc.conf.label - - node_id: test_roc - xpath: test_roc.conf.label - features: - type: array_string - value: - - sign - map: - - node_id: train_norm - xpath: train_norm.conf.columns - - node_id: train_xgboost - xpath: train_xgboost.conf.columns - inclusive: - type: boolean - map: - - node_id: train_norm - xpath: train_norm.conf.include - - node_id: train_xgboost - xpath: train_xgboost.conf.include - value: false - depth: - type: number - value: 1 - map: - - node_id: train_xgboost - xpath: train_xgboost.conf.xgboost_parameters.max_depth - eta: - type: number - value: 0.1 - map: - - node_id: train_xgboost - xpath: train_xgboost.conf.xgboost_parameters.eta - subnodes_conf: {} - taskgraph: taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml - inputs: - train_norm@df_in: split_data.train - test_norm@df_in: split_data.test -- id: hpo - type: GridRandomSearchNode - conf: - parameters: - - search: - function: grid_search - args: - - 1 - - 3 - - 5 - name: depth - - search: - function: uniform - args: - - 0.1 - - 0.8 - name: eta - metrics: - - train_roc.value - - test_roc.value - best: - mode: max - metric: test_roc.value - tune: - local_dir: ./ray - name: stock - num_samples: 1 - resources_per_trial: - cpu: 1 - gpu: 1 - input: - - train_norm.df_in - - test_norm.df_in - output: - - train_infer.out - - 
test_infer.out - - train_roc.roc_curve - - test_roc.roc_curve - context: - target: - type: string - value: sign - map: - - node_id: train_xgboost - xpath: train_xgboost.conf.target - - node_id: train_roc - xpath: train_roc.conf.label - - node_id: test_roc - xpath: test_roc.conf.label - features: - type: array_string - value: - - sign - map: - - node_id: train_norm - xpath: train_norm.conf.columns - - node_id: train_xgboost - xpath: train_xgboost.conf.columns - inclusive: - type: boolean - map: - - node_id: train_norm - xpath: train_norm.conf.include - - node_id: train_xgboost - xpath: train_xgboost.conf.include - value: false - depth: - type: number - value: 1 - map: - - node_id: train_xgboost - xpath: train_xgboost.conf.xgboost_parameters.max_depth - eta: - type: number - value: 0.1 - map: - - node_id: train_xgboost - xpath: train_xgboost.conf.xgboost_parameters.eta - subnodes_conf: {} - taskgraph: taskgraphs/xgboost_example/xgboost_model_with_metrics.gq.yaml - inputs: - conf_in: xgboost_model.conf_out - train_norm@df_in: split_data.train - test_norm@df_in: split_data.test - module: rapids_modules -- id: stock_data - type: CsvStockLoader - conf: - file: notebooks/data/stock_price_hist.csv.gz - path: notebooks/many-small - inputs: {} - module: rapids_modules -- id: stock_feature - type: CompositeNode - conf: - input: - - preprocess.sort_node@in - output: - - technical_indicator.stock_out - subnode_ids: - - technical_indicator - subnodes_conf: - technical_indicator: - conf: - indicators: - - function: port_bollinger_bands - args: - - 10 - columns: - - close - - function: port_chaikin_oscillator - args: - - 2 - - 3 - columns: - - high - - low - - close - - volume - - function: port_macd - args: - - 2 - - 3 - columns: - - close - - function: port_relative_strength_index - args: - - 2 - columns: - - high - - low - - function: port_average_true_range - args: - - 2 - columns: - - high - - low - - close - - function: port_stochastic_oscillator_k - args: - - 2 - columns: - 
- high - - low - - close - - function: port_stochastic_oscillator_d - args: - - 2 - columns: - - high - - low - - close - - function: port_money_flow_index - args: - - 2 - columns: - - high - - low - - close - - volume - - function: port_force_index - args: - - 2 - columns: - - close - - volume - - function: port_ultimate_oscillator - args: - - 2 - columns: - - high - - low - - close - - function: port_accumulation_distribution - args: - - 2 - columns: - - high - - low - - close - - volume - - function: port_commodity_channel_index - args: - - 2 - columns: - - high - - low - - close - - function: port_on_balance_volume - args: - - 2 - columns: - - close - - volume - - function: port_vortex_indicator - args: - - 2 - columns: - - high - - low - - close - - function: port_kst_oscillator - args: - - 3 - - 4 - - 5 - - 6 - - 7 - - 8 - - 9 - - 10 - columns: - - close - - function: port_mass_index - args: - - 2 - - 3 - columns: - - high - - low - - function: port_true_strength_index - args: - - 2 - - 3 - columns: - - close - - function: port_ease_of_movement - args: - - 2 - columns: - - high - - low - - volume - - function: port_coppock_curve - args: - - 2 - columns: - - close - - function: port_keltner_channel - args: - - 2 - columns: - - high - - low - - close - - function: port_ppsr - args: - - 2 - columns: - - high - - low - - close - - function: port_fractional_diff - args: - - 0.9 - columns: - - close - - function: port_fractional_diff - args: - - 0.7 - columns: - - close - - function: port_fractional_diff - args: - - 0.5 - columns: - - close - - function: port_fractional_diff - args: - - 0.3 - columns: - - close - - function: port_fractional_diff - args: - - 0.1 - columns: - - close - - function: port_shift - args: - - -1 - columns: - - returns - remove_na: true - taskgraph: taskgraphs/xgboost_trade.gq.yaml - inputs: - preprocess@sort_node@in: stock_data.cudf_out -- id: pos_neg_return - type: AddSignIndicatorNode - conf: - sign: sign - column: SHIFT_-1 - inputs: - 
in: stock_feature.technical_indicator@stock_out - module: rapids_modules -- id: drop_col - type: DropNode - conf: - columns: - - indicator - - datetime - - asset - - SHIFT_-1 - - open - - high - - low - - close - inputs: - in: pos_neg_return.out - module: rapids_modules -- id: split_data - type: DataSplittingNode - conf: - train_size: 0.8 - target: sign - inputs: - in: drop_col.out - module: rapids_modules diff --git a/tests/unit/custom_port_nodes.py b/tests/unit/custom_port_nodes.py deleted file mode 100644 index d62e7395..00000000 --- a/tests/unit/custom_port_nodes.py +++ /dev/null @@ -1,491 +0,0 @@ -import math -import numpy as np -from numba import cuda -import cupy -import cudf -import dask_cudf -import dask -import rmm -from gquant.dataframe_flow import Node, MetaData -from gquant.dataframe_flow import NodePorts, PortsSpecSchema -from gquant.dataframe_flow import ConfSchema -import copy -import os -from gquant.dataframe_flow.task import load_modules -load_modules(os.getenv('MODULEPATH')+'/rapids_modules/') -from rapids_modules._port_type_node import _PortTypesMixin -import rapids_modules.cuindicator as gi - - -class PointNode(_PortTypesMixin, Node): - - def ports_setup(self): - input_ports = {} - output_ports = { - 'points_df_out': { - PortsSpecSchema.port_type: cudf.DataFrame - }, - 'points_ddf_out': { - PortsSpecSchema.port_type: dask_cudf.DataFrame - }, - } - return NodePorts(inports=input_ports, outports=output_ports) - - def conf_schema(self): - json = { - "title": "PointNode configure", - "type": "object", - "properties": { - "npts": { - "type": "number", - "description": "number of data points", - "minimum": 10 - }, - "npartitions": { - "type": "number", - "description": "num of partitions in the Dask dataframe", - "minimum": 1 - } - - }, - "required": ["npts", "npartitions"], - } - - ui = { - "npts": {"ui:widget": "updown"}, - "npartitions": {"ui:widget": "updown"} - } - return ConfSchema(json=json, ui=ui) - - def init(self): - pass - - def 
meta_setup(self): - columns_out = { - 'points_df_out': { - 'x': 'float64', - 'y': 'float64' - }, - 'points_ddf_out': { - 'x': 'float64', - 'y': 'float64' - } - } - return MetaData(inports={}, outports=columns_out) - - def process(self, inputs): - npts = self.conf['npts'] - seed = self.conf.get('nseed') - if seed is not None: - np.random.seed(seed) - df = cudf.DataFrame() - df['x'] = np.random.rand(npts) - df['y'] = np.random.rand(npts) - output = {} - if self.outport_connected('points_df_out'): - output.update({'points_df_out': df}) - if self.outport_connected('points_ddf_out'): - npartitions = self.conf['npartitions'] - ddf = dask_cudf.from_cudf(df, npartitions=npartitions) - output.update({'points_ddf_out': ddf}) - return output - - -class DistanceNode(_PortTypesMixin, Node): - - def ports_setup(self): - port_type = PortsSpecSchema.port_type - input_ports = { - 'points_df_in': { - port_type: [cudf.DataFrame, dask_cudf.DataFrame] - } - } - - output_ports = { - 'distance_df': { - port_type: [cudf.DataFrame, dask_cudf.DataFrame] - }, - 'distance_abs_df': { - PortsSpecSchema.port_type: [cudf.DataFrame, dask_cudf.DataFrame] - } - } - input_connections = self.get_connected_inports() - if 'points_df_in' in input_connections: - types = input_connections['points_df_in'] - # connected, use the types passed in from parent - return NodePorts(inports={'points_df_in': {port_type: types}}, - outports={'distance_df': {port_type: types}, - 'distance_abs_df': {port_type: types}, - }) - else: - return NodePorts(inports=input_ports, outports=output_ports) - - def conf_schema(self): - return ConfSchema() - - def init(self): - self.delayed_process = True - - def meta_setup(self): - req_cols = { - 'x': 'float64', - 'y': 'float64' - } - required = { - 'points_df_in': req_cols, - } - input_meta = self.get_input_meta() - output_cols = ({ - 'distance_df': { - 'distance_cudf': 'float64', - 'x': 'float64', - 'y': 'float64' - }, - 'distance_abs_df': { - 'distance_abs_cudf': 'float64', - 'x': 
'float64', - 'y': 'float64' - } - }) - if 'points_df_in' in input_meta: - col_from_inport = input_meta['points_df_in'] - # additional ports - output_cols['distance_df'].update(col_from_inport) - output_cols['distance_abs_df'].update(col_from_inport) - return MetaData(inports=required, outports=output_cols) - - def process(self, inputs): - df = inputs['points_df_in'] - output = {} - if self.outport_connected('distance_df'): - copy_df = df.copy() - copy_df['distance_cudf'] = (df['x'] ** 2 + df['y'] ** 2).sqrt() - output.update({'distance_df': copy_df}) - if self.outport_connected('distance_abs_df'): - copy_df = df.copy() - copy_df['distance_abs_cudf'] = df['x'].abs() + df['y'].abs() - output.update({'distance_abs_df': copy_df}) - return output - - -@cuda.jit -def distance_kernel(x, y, distance, array_len): - # ii - overall thread index - ii = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x - if ii < array_len: - distance[ii] = math.sqrt(x[ii] ** 2 + y[ii] ** 2) - - -class NumbaDistanceNode(_PortTypesMixin, Node): - - def ports_setup(self): - port_type = PortsSpecSchema.port_type - input_ports = { - 'points_df_in': { - port_type: [cudf.DataFrame, - dask_cudf.DataFrame] - } - } - - output_ports = { - 'distance_df': { - port_type: [cudf.DataFrame, - dask_cudf.DataFrame] - } - } - - input_connections = self.get_connected_inports() - if 'points_df_in' in input_connections: - types = input_connections['points_df_in'] - # connected - return NodePorts(inports={'points_df_in': {port_type: types}}, - outports={'distance_df': {port_type: types}}) - else: - return NodePorts(inports=input_ports, outports=output_ports) - - def init(self): - self.delayed_process = True - - def meta_setup(self,): - required_cols = {'x': 'float64', 'y': 'float64'} - required = { - 'points_df_in': required_cols, - 'distance_df': required_cols - } - input_meta = self.get_input_meta() - output_cols = ({ - 'distance_df': { - 'distance_numba': 'float64', - 'x': 'float64', - 'y': 'float64' - } - }) - 
if 'points_df_in' in input_meta: - col_from_inport = input_meta['points_df_in'] - # additional ports - output_cols['distance_df'].update(col_from_inport) - return MetaData(inports=required, outports=output_cols) - - def conf_schema(self): - return ConfSchema() - - def process(self, inputs): - df = inputs['points_df_in'] - - # DEBUGGING - # try: - # from dask.distributed import get_worker - # worker = get_worker() - # print('worker{} process NODE "{}" worker: {}'.format( - # worker.name, self.uid, worker)) - # except (ValueError, ImportError): - # pass - - number_of_threads = 16 - number_of_blocks = ((len(df) - 1) // number_of_threads) + 1 - # Inits device array by setting 0 for each index. - # df['distance_numba'] = 0.0 - darr = rmm.device_array(len(df)) - distance_kernel[(number_of_blocks,), (number_of_threads,)]( - df['x'], - df['y'], - darr, - len(df)) - df['distance_numba'] = darr - return {'distance_df': df} - - -kernel_string = r''' - extern "C" __global__ - void compute_distance(const double* x, const double* y, - double* distance, int arr_len) { - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < arr_len){ - distance[tid] = sqrt(x[tid]*x[tid] + y[tid]*y[tid]); - } - } -''' - - -class CupyDistanceNode(_PortTypesMixin, Node): - - def ports_setup(self): - port_type = PortsSpecSchema.port_type - input_ports = { - 'points_df_in': { - port_type: [cudf.DataFrame, - dask_cudf.DataFrame] - } - } - - output_ports = { - 'distance_df': { - port_type: [cudf.DataFrame, - dask_cudf.DataFrame] - } - } - - input_connections = self.get_connected_inports() - if 'points_df_in' in input_connections: - types = input_connections['points_df_in'] - # connected - return NodePorts(inports={'points_df_in': {port_type: types}}, - outports={'distance_df': {port_type: types}}) - else: - return NodePorts(inports=input_ports, outports=output_ports) - - def init(self): - self.delayed_process = True - - def meta_setup(self,): - cols_required = {'x': 'float64', - 'y': 'float64'} - 
required = { - 'points_df_in': cols_required, - 'distance_df': cols_required - } - input_meta = self.get_input_meta() - output_cols = ({ - 'distance_df': { - 'distance_cupy': 'float64', - 'x': 'float64', - 'y': 'float64' - } - }) - if 'points_df_in' in input_meta: - col_from_inport = input_meta['points_df_in'] - # additional ports - output_cols['distance_df'].update(col_from_inport) - return MetaData(inports=required, outports=output_cols) - - def conf_schema(self): - return ConfSchema() - - def get_kernel(self): - raw_kernel = cupy.RawKernel(kernel_string, 'compute_distance') - return raw_kernel - - def process(self, inputs): - df = inputs['points_df_in'] - cupy_x = cupy.asarray(df['x']) - cupy_y = cupy.asarray(df['y']) - number_of_threads = 16 - number_of_blocks = (len(df) - 1) // number_of_threads + 1 - dis = cupy.ndarray(len(df), dtype=cupy.float64) - self.get_kernel()((number_of_blocks,), (number_of_threads,), - (cupy_x, cupy_y, dis, len(df))) - df['distance_cupy'] = dis - - return {'distance_df': df} - - -class DistributedNode(_PortTypesMixin, Node): - - def ports_setup(self): - input_ports = { - 'points_df_in': { - PortsSpecSchema.port_type: cudf.DataFrame - } - } - - output_ports = { - 'points_ddf_out': { - PortsSpecSchema.port_type: dask_cudf.DataFrame - } - } - - return NodePorts(inports=input_ports, outports=output_ports) - - def init(self): - pass - - def meta_setup(self,): - cols_required = { - 'x': 'float64', - 'y': 'float64' - } - required = { - 'points_df_in': cols_required, - 'points_ddf_out': cols_required - } - input_meta = self.get_input_meta() - output_cols = ({ - 'points_ddf_out': { - 'x': 'float64', - 'y': 'float64' - } - }) - if 'points_df_in' in input_meta: - col_from_inport = input_meta['points_df_in'] - # additional ports - output_cols['points_ddf_out'].update(col_from_inport) - return MetaData(inports=required, outports=output_cols) - - def conf_schema(self): - json = { - "title": "DistributedNode configure", - "type": "object", - 
"properties": { - "npartitions": { - "type": "number", - "description": "num of partitions in the Dask dataframe", - "minimum": 1 - } - }, - "required": ["npartitions"], - } - - ui = { - "npartitions": {"ui:widget": "updown"} - } - return ConfSchema(json=json, ui=ui) - - def process(self, inputs): - npartitions = self.conf['npartitions'] - df = inputs['points_df_in'] - ddf = dask_cudf.from_cudf(df, npartitions=npartitions) - return {'points_ddf_out': ddf} - - -class VerifyNode(_PortTypesMixin, Node): - - def ports_setup(self): - input_ports = { - 'df1': { - PortsSpecSchema.port_type: [cudf.DataFrame, - dask_cudf.DataFrame] - }, - 'df2': { - PortsSpecSchema.port_type: [cudf.DataFrame, - dask_cudf.DataFrame] - } - } - output_ports = { - 'max_diff': { - PortsSpecSchema.port_type: float - } - } - - connections = self.get_connected_inports() - for key in input_ports: - if key in connections: - # connected - types = connections[key] - input_ports[key].update({PortsSpecSchema.port_type: types}) - return NodePorts(inports=input_ports, outports=output_ports) - - def meta_setup(self): - required ={ - "df1": {}, - "df2": {} - } - return MetaData(inports=required, outports={'max_diff': {}}) - - def conf_schema(self): - json = { - "title": "VerifyNode configure", - "type": "object", - "properties": { - "df1_col": { - "type": "string", - "description": "dataframe1 column name" - }, - "df2_col": { - "type": "string", - "description": "dataframe2 column name" - } - }, - "required": ["df1_col", "df2_col"], - } - - ui = { - "df1_col": {"ui:widget": "text"}, - "df2_col": {"ui:widget": "text"} - } - return ConfSchema(json=json, ui=ui) - - def process(self, inputs): - df1 = inputs['df1'] - df2 = inputs['df2'] - col_df1 = self.conf['df1_col'] - col_df2 = self.conf['df2_col'] - - df1_col = df1[col_df1] - if isinstance(df1, dask_cudf.DataFrame): - # df1_col = df1_col.compute() - pass - - df2_col = df2[col_df2] - if isinstance(df2, dask_cudf.DataFrame): - # df2_col = df2_col.compute() - 
pass - - max_difference = (df1_col - df2_col).abs().max() - - if isinstance(max_difference, dask.dataframe.core.Scalar): - max_difference = float(max_difference.compute()) - max_difference = float(max_difference) - # print('Max Difference: {}'.format(max_difference)) - # assert(max_difference < 1e-8) - - return {'max_diff': max_difference}