diff --git a/README.md b/README.md index 889fcf22..ebda8ebe 100644 --- a/README.md +++ b/README.md @@ -15,16 +15,22 @@ pip install polars-ds

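For a quick feel of the API right after installing, here is a minimal, self-contained sketch. It is illustrative only and uses nothing beyond functions that already appear elsewhere in this README and in the example notebooks (`pds.frame`, `pds.random`, `pds.query_roc_auc`, `pds.query_log_loss`):

```python
import polars as pl
import polars_ds as pds

# Build a small synthetic frame: a random score column and a binary label.
df = pds.frame(size=1_000).select(
    pds.random().alias("predicted"),
    (pds.random() > 0.5).cast(pl.UInt8).alias("actual"),
)

# PDS functions are ordinary Polars expressions, so they work in a plain select...
print(df.select(pds.query_roc_auc("actual", "predicted").alias("roc_auc")))

# ...and they can be batched into one lazy query, which Polars evaluates in parallel.
print(
    df.lazy()
    .select(
        pds.query_roc_auc("actual", "predicted").alias("roc_auc"),
        pds.query_log_loss("actual", "predicted").alias("log_loss"),
    )
    .collect()
)
```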
-# The Project +# PDS (polars_ds) -PDS is a modern take on data science and traditional tabular machine learning. It is dataframe-centric in design, and provides parallelism for free via **Polars**. It offers Polars syntax that works both in normal and aggregation contexts, and provides these conveniences to the end user without any additional dependency. It includes the most common functions from NumPy, SciPy, edit distances, KNN-related queries, EDA tools, feature engineering queries, etc. Yes, it only depends on Polars (unless you want to use the plotting functionalities and want to interop with NumPy). Most of the code is rewritten in **Rust** and is on par or even faster than existing functions in SciPy and Scikit-learn. The following are some examples: +PDS is a modern data science package that -Parallel evaluations of classification metrics on segments +1. is fast and furious +2. is small and lean, with minimal dependencies +3. has an intuitive and concise API (if you know Polars already) +4. has a dataframe-friendly design +5. and covers a wide variety of data science topics, such as simple statistics, linear regression, string edit distances, tabular data transforms, feature extraction, traditional modelling pipelines, model evaluation metrics, and more + +It stands on the shoulders of the great **Polars** dataframe. You can see [examples](./examples/basics.ipynb). Here are some highlights! ```python import polars as pl import polars_ds as pds - +# Parallel evaluation of multiple ML metrics on different segments of data df.lazy().group_by("segments").agg( pds.query_roc_auc("actual", "predicted").alias("roc_auc"), pds.query_log_loss("actual", "predicted").alias("log_loss"), @@ -41,6 +47,43 @@ shape: (2, 3) └──────────┴──────────┴──────────┘ ``` +Tabular Machine Learning Data Transformation Pipeline + +```Python +import polars as pl +import polars.selectors as cs +from polars_ds.pipeline import Pipeline, Blueprint + +bp = ( + # If we specify a target, then target will be excluded from any transformations. + Blueprint(df, name = "example", target = "approved") + .lowercase() # lowercase all columns + .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"])) + .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") + .impute(["existing_emi"], method = "median") + .append_expr( # generate some features + pl.col("existing_emi").log1p().alias("existing_emi_log1p"), + pl.col("loan_amount").log1p().alias("loan_amount_log1p"), + pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"), + pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1") # any kind of lag transform + ) + .scale( + cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard" + ) # Scale the columns up to this point. The columns below won't be scaled + .append_expr( # Add missing flags + pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing") + ) + .one_hot_encode("gender", drop_first=True) + .woe_encode("city_category") # no need to specify the target; it was set when the Blueprint was created + .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # same as above +) + +pipe: Pipeline = bp.materialize() +# Check out the result in our example notebooks!
(examples/pipeline.ipynb) +df_transformed = pipe.transform(df) +df_transformed.head() +``` + Get all neighbors within radius r, call them best friends, and count the number ```python @@ -71,7 +114,7 @@ shape: (5, 3) └─────┴───────────────────┴────────────────────┘ ``` -Ridge Regression on Categories +Run a linear regression on each category: ```Python @@ -120,9 +163,9 @@ In-dataframe statistical tests ```Python df.group_by("market_id").agg( - pds.query_ttest_ind("var1", "var2", equal_var=False).alias("t-test"), - pds.query_chi2("category_1", "category_2").alias("chi2-test"), - pds.query_f_test("var1", group = "category_1").alias("f-test") + pds.ttest_ind("var1", "var2", equal_var=False).alias("t-test"), + pds.chi2("category_1", "category_2").alias("chi2-test"), + pds.f_test("var1", group = "category_1").alias("f-test") ) shape: (3, 4) @@ -151,46 +194,6 @@ df.select( ).head() ``` -Tabular Machine Learning Data Transformation Pipeline - -```Python -import polars as pl -import polars.selectors as cs -from polars_ds.pipeline import Pipeline, Blueprint - -bp = ( - # If we specify a target, then target will be excluded from any transformations. - Blueprint(df, name = "example", target = "approved") - .lowercase() # lowercase all columns - .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"])) - # Impute loan_period by running a simple linear regression. - # Explicitly put target, since this is not the target for prediction. - .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") - .impute(["existing_emi"], method = "median") - .append_expr( # generate some features - pl.col("existing_emi").log1p().alias("existing_emi_log1p"), - pl.col("loan_amount").log1p().alias("loan_amount_log1p"), - pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"), - pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1") # any kind of lag transform - ) - .scale( # target is numerical, but will be excluded automatically because bp is initialzied with a target - cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard" - ) # Scale the columns up to this point. The columns below won't be scaled - .append_expr( - # Add missing flags - pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing") - ) - .one_hot_encode("gender", drop_first=True) - .woe_encode("city_category") # No need to specify target because we initialized bp with a target - .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # same as above -) - -pipe:Pipeline = bp.materialize() -# Check out the result in our example notebooks! -df_transformed = pipe.transform(df) -df_transformed.head() -``` - And more! ## Getting Started @@ -205,11 +208,15 @@ To make full use of the Diagnosis module, do pip install "polars_ds[plot]" ``` -## More Examples +## How Fast is it? + +Feel free to take a look at our [benchmark notebook](./benchmarks/benchmarks.ipynb)! + +Generally speaking, the more expressions you want to evaluate simultaneously, the faster Polars + PDS will be than Pandas + (SciPy / Sklearn / NumPy). The more CPU cores you have on your machine, the bigger the time difference will be in favor of Polars + PDS. -See this for Polars Extensions: [notebook](./examples/basics.ipynb) +Why does speed matter? -See this for Native Polars DataFrame Explorative tools: [notebook](./examples/diagnosis.ipynb) +If your code already executes under 1s, then maybe it doesn't. But as your data grow, having a 5s run vs. 
a 1s run will make a big difference in how quickly you can iterate on your project. Speed of execution becomes a bigger issue if you are building reports on demand, or if you need to pay extra for additional compute. ## HELP WANTED! @@ -217,9 +224,8 @@ See this for Native Polars DataFrame Explorative tools: [notebook](./examples/di ## Road Map -1. Standalone KNN and linear regression module. -2. K-means, K-medoids clustering as expressions and also standalone modules. -3. Other. +1. K-means, K-medoids clustering as expressions and also standalone modules. +2. Other improvement items. See issues. # Disclaimer @@ -232,8 +238,8 @@ This package is not tested with Polars streaming mode and is not designed to wor 1. Rust Snowball Stemmer is taken from Tsoding's Seroost project (MIT). See [here](https://github.com/tsoding/seroost) 2. Some statistics functions are taken from Statrs (MIT) and internalized. See [here](https://github.com/statrs-dev/statrs/tree/master) 3. Linear algebra routines are powered partly by [faer](https://crates.io/crates/faer) +4. String similarity metrics are soooo fast because of [RapidFuzz](https://github.com/maxbachmann/rapidfuzz-rs) # Other related Projects -1. Take a look at our friendly neighbor [functime](https://github.com/TracecatHQ/functime) -2. String similarity metrics is soooo fast and easy to use because of [RapidFuzz](https://github.com/maxbachmann/rapidfuzz-rs) \ No newline at end of file +1. Take a look at our friendly neighbor [functime](https://github.com/TracecatHQ/functime) \ No newline at end of file diff --git a/benchmarks/benchmarks.ipynb b/benchmarks/benchmarks.ipynb new file mode 100644 index 00000000..383f4891 --- /dev/null +++ b/benchmarks/benchmarks.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook Used to Generate Benchmark Results " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "import polars_ds as pds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parallel ML metric evaluations on segments \n", + "\n", + "Use cases:\n", + "\n", + "1. Evaluate ML model performance in market A, B, C.\n", + "2. The Dataframe contains a column that defines the \"split\" of the dataframe. Then this can simultaneously evaluate the ML model's performance on each of the train, test, recent, or any other split you have.\n", + "3. Evaluate ML model performance over time, e.g. weekly / monthly \n", + "\n", + "Comparison: \n", + "\n", + "Polars + PDS vs Pandas + Sklearn" ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate a synthetic dataset\n", + "from datetime import date\n", + "\n", + "dates = pl.date_range(date(2020, 1, 1), date(2024, 10, 1), \"1d\", eager=True)\n", + "df = pds.frame(size=len(dates)).select(\n", + " pds.random().alias(\"predicted\"),\n", + " (pds.random() > 0.25).cast(pl.UInt8).alias(\"actual_target\"),\n", + " dates = dates,\n", + ")\n", + "df_pd = df.to_pandas()" ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
predictedactual_targetdates
00.41376712020-01-01
10.12578312020-01-02
20.38294312020-01-03
30.69045502020-01-04
40.49248802020-01-05
............
17310.36531812024-09-27
17320.63510512024-09-28
17330.15605412024-09-29
17340.73670412024-09-30
17350.66052512024-10-01
\n", + "

1736 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " predicted actual_target dates\n", + "0 0.413767 1 2020-01-01\n", + "1 0.125783 1 2020-01-02\n", + "2 0.382943 1 2020-01-03\n", + "3 0.690455 0 2020-01-04\n", + "4 0.492488 0 2020-01-05\n", + "... ... ... ...\n", + "1731 0.365318 1 2024-09-27\n", + "1732 0.635105 1 2024-09-28\n", + "1733 0.156054 1 2024-09-29\n", + "1734 0.736704 1 2024-09-30\n", + "1735 0.660525 1 2024-10-01\n", + "\n", + "[1736 rows x 3 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10.3 ms ± 83 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "df_pd[\"year\"] = df['dates'].dt.year()\n", + "df_pd.groupby([\"year\"]).apply(\n", + " lambda df_group: pd.Series({\n", + " \"count\": len(df_group[\"actual_target\"]),\n", + " \"roc_auc\": roc_auc_score(df_group[\"actual_target\"], df_group[\"predicted\"]),\n", + " \"log_loss\": roc_auc_score(df_group[\"actual_target\"], df_group[\"predicted\"])\n", + " })\n", + " , include_groups=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.13 ms ± 67.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "df.group_by(pl.col(\"dates\").dt.year()).agg(\n", + " count = pl.len(),\n", + " roc_auc = pds.query_roc_auc(\"actual_target\", \"predicted\"),\n", + " log_loss = pds.query_log_loss(\"actual_target\", \"predicted\")\n", + ").sort(\"dates\")\n", + "# 1/5 of the time, less lines of code + easier to understand syntax" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/index.md b/docs/index.md index eb52d8d6..28ca7992 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,19 +1,25 @@ -# Polars Extension for General Data Science Use +# Polars for Data Science (PDS) A comprehensive [walkthrough](https://github.com/abstractqqq/polars_ds_extension/blob/knn_entropy/examples/basics.ipynb). Read the [Docs](https://polars-ds-extension.readthedocs.io/en/latest/). 
-# The Project +# Introduction -PDS is a modern take on data science and traditional tabular machine learning. It is dataframe-centric in design, and provides parallelism for free via **Polars**. It offers Polars syntax that works both in normal and aggregation contexts, and provides these conveniences to the end user without any additional dependency. It includes the most common functions from NumPy, SciPy, edit distances, KNN-related queries, EDA tools, feature engineering queries, etc. Yes, it only depends on Polars (unless you want to use the plotting functionalities and want to interop with NumPy). Most of the code is rewritten in **Rust** and is on par or even faster than existing functions in SciPy and Scikit-learn. The following are some examples: +PDS is a modern data science package that -Parallel evaluations of classification metrics on segments +1. is fast and furious +2. is small and lean, with minimal dependencies +3. has an intuitive and concise API (if you know Polars already) +4. has a dataframe-friendly design +5. and covers a wide variety of data science topics, such as simple statistics, linear regression, string edit distances, tabular data transforms, feature extraction, traditional modelling pipelines, model evaluation metrics, and more + +It stands on the shoulders of the great **Polars** dataframe. You can see [examples](./examples/basics.ipynb). Here are some highlights! ```python import polars as pl import polars_ds as pds - +# Parallel evaluation of multiple ML metrics on different segments of data df.lazy().group_by("segments").agg( pds.query_roc_auc("actual", "predicted").alias("roc_auc"), pds.query_log_loss("actual", "predicted").alias("log_loss"), @@ -30,6 +36,43 @@ shape: (2, 3) └──────────┴──────────┴──────────┘ ``` +Tabular Machine Learning Data Transformation Pipeline + +```Python +import polars as pl +import polars.selectors as cs +from polars_ds.pipeline import Pipeline, Blueprint + +bp = ( + # If we specify a target, then target will be excluded from any transformations. + Blueprint(df, name = "example", target = "approved") + .lowercase() # lowercase all columns + .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"])) + .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") + .impute(["existing_emi"], method = "median") + .append_expr( # generate some features + pl.col("existing_emi").log1p().alias("existing_emi_log1p"), + pl.col("loan_amount").log1p().alias("loan_amount_log1p"), + pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"), + pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1") # any kind of lag transform + ) + .scale( + cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard" + ) # Scale the columns up to this point. The columns below won't be scaled + .append_expr( # Add missing flags + pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing") + ) + .one_hot_encode("gender", drop_first=True) + .woe_encode("city_category") # no need to specify the target; it was set when the Blueprint was created + .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # same as above +) + +pipe: Pipeline = bp.materialize() +# Check out the result in our example notebooks!
(examples/pipeline.ipynb) +df_transformed = pipe.transform(df) +df_transformed.head() +``` + Get all neighbors within radius r, call them best friends, and count the number ```python @@ -60,7 +103,7 @@ shape: (5, 3) └─────┴───────────────────┴────────────────────┘ ``` -Ridge Regression on Categories +Run a linear regression on each category: ```Python @@ -109,9 +152,9 @@ In-dataframe statistical tests ```Python df.group_by("market_id").agg( - pds.query_ttest_ind("var1", "var2", equal_var=False).alias("t-test"), - pds.query_chi2("category_1", "category_2").alias("chi2-test"), - pds.query_f_test("var1", group = "category_1").alias("f-test") + pds.ttest_ind("var1", "var2", equal_var=False).alias("t-test"), + pds.chi2("category_1", "category_2").alias("chi2-test"), + pds.f_test("var1", group = "category_1").alias("f-test") ) shape: (3, 4) @@ -140,46 +183,6 @@ df.select( ).head() ``` -Tabular Machine Learning Data Transformation Pipeline - -```Python -import polars as pl -import polars.selectors as cs -from polars_ds.pipeline import Pipeline, Blueprint - -bp = ( - # If we specify a target, then target will be excluded from any transformations. - Blueprint(df, name = "example", target = "approved") - .lowercase() # lowercase all columns - .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"])) - # Impute loan_period by running a simple linear regression. - # Explicitly put target, since this is not the target for prediction. - .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") - .impute(["existing_emi"], method = "median") - .append_expr( # generate some features - pl.col("existing_emi").log1p().alias("existing_emi_log1p"), - pl.col("loan_amount").log1p().alias("loan_amount_log1p"), - pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"), - pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1") # any kind of lag transform - ) - .scale( # target is numerical, but will be excluded automatically because bp is initialzied with a target - cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard" - ) # Scale the columns up to this point. The columns below won't be scaled - .append_expr( - # Add missing flags - pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing") - ) - .one_hot_encode("gender", drop_first=True) - .woe_encode("city_category") # No need to specify target because we initialized bp with a target - .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # same as above -) - -pipe:Pipeline = bp.materialize() -# Check out the result in our example notebooks! -df_transformed = pipe.transform(df) -df_transformed.head() -``` - And more! ## Getting Started @@ -194,11 +197,15 @@ To make full use of the Diagnosis module, do pip install "polars_ds[plot]" ``` -## More Examples +## How Fast is it? + +Feel free to take a look at our [benchmark notebook](./benchmarks/benchmarks.ipynb)! + +Generally speaking, the more expressions you want to evaluate simultaneously, the faster Polars + PDS will be than Pandas + (SciPy / Sklearn / NumPy). The more CPU cores you have on your machine, the bigger the time difference will be in favor of Polars + PDS. -See this for Polars Extensions: [notebook](./examples/basics.ipynb) +Why does speed matter? -See this for Native Polars DataFrame Explorative tools: [notebook](./examples/diagnosis.ipynb) +If your code already executes under 1s, then maybe it doesn't. But as your data grow, having a 5s run vs. 
a 1s run will make a lot of difference in your iterations for your project. Speed of execution becomes a bigger issues if you are building reports on demand, or if you need to pay extra for additional compute. ## HELP WANTED! @@ -206,9 +213,8 @@ See this for Native Polars DataFrame Explorative tools: [notebook](./examples/di ## Road Map -1. Standalone KNN and linear regression module. -2. K-means, K-medoids clustering as expressions and also standalone modules. -3. Other. +1. K-means, K-medoids clustering as expressions and also standalone modules. +2. Other improvement items. See issues. # Disclaimer @@ -220,10 +226,9 @@ This package is not tested with Polars streaming mode and is not designed to wor 1. Rust Snowball Stemmer is taken from Tsoding's Seroost project (MIT). See [here](https://github.com/tsoding/seroost) 2. Some statistics functions are taken from Statrs (MIT) and internalized. See [here](https://github.com/statrs-dev/statrs/tree/master) -3. Graph functionalities are powered by the petgragh crate. See [here](https://crates.io/crates/petgraph) -4. Linear algebra routines are powered partly by [faer](https://crates.io/crates/faer) +3. Linear algebra routines are powered partly by [faer](https://crates.io/crates/faer) +4. String similarity metrics are soooo fast because of [RapidFuzz](https://github.com/maxbachmann/rapidfuzz-rs) # Other related Projects -1. Take a look at our friendly neighbor [functime](https://github.com/TracecatHQ/functime) -2. String similarity metrics is soooo fast and easy to use because of [RapidFuzz](https://github.com/maxbachmann/rapidfuzz-rs) \ No newline at end of file +1. Take a look at our friendly neighbor [functime](https://github.com/TracecatHQ/functime) \ No newline at end of file diff --git a/examples/basics.ipynb b/examples/basics.ipynb index fa62f047..7b42d389 100644 --- a/examples/basics.ipynb +++ b/examples/basics.ipynb @@ -46,7 +46,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 12)
ftime_idxdummyactualpredicteddummy_groupsx1x2x3aby
f64i64stri32f64strf64f64f64f64f64f64
0.00"a"10.373234"a"0.5501060.6630550.6494280.0524510.331711-0.69266
0.8414711"a"10.807054"a"0.0046390.8882840.1611270.0075140.5007070.025568
0.9092972"a"00.672976"a"0.2751450.29290.7513390.7919960.645909-0.997863
0.141123"a"00.805052"a"0.5565640.8733220.3195290.5666440.960852-0.133722
-0.7568024"a"00.153986"a"0.2909040.6554520.1707090.94490.640427-0.01572
" + "shape: (5, 12)
ftime_idxdummyactualpredicteddummy_groupsx1x2x3aby
f64i64stri32f64strf64f64f64f64f64f64
0.00"a"10.651933"a"0.9725640.2994320.6005710.9016760.481886-0.665053
0.8414711"a"10.599058"a"0.0559680.5475830.4489080.7950430.100833-0.500674
0.9092972"a"10.264728"a"0.8667580.73820.2461480.9162530.868888-0.017706
0.141123"a"10.599733"a"0.1541840.7301040.4065640.0129620.736355-0.367598
-0.7568024"a"10.292312"a"0.2610950.9639280.3926110.9823070.879509-0.260554
" ], "text/plain": [ "shape: (5, 12)\n", @@ -55,11 +55,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ i64 ┆ str ┆ i32 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪══════════╪═══════╪════════╪═══╪══════════╪══════════╪══════════╪═══════════╡\n", - "│ 0.0 ┆ 0 ┆ a ┆ 1 ┆ … ┆ 0.649428 ┆ 0.052451 ┆ 0.331711 ┆ -0.69266 │\n", - "│ 0.841471 ┆ 1 ┆ a ┆ 1 ┆ … ┆ 0.161127 ┆ 0.007514 ┆ 0.500707 ┆ 0.025568 │\n", - "│ 0.909297 ┆ 2 ┆ a ┆ 0 ┆ … ┆ 0.751339 ┆ 0.791996 ┆ 0.645909 ┆ -0.997863 │\n", - "│ 0.14112 ┆ 3 ┆ a ┆ 0 ┆ … ┆ 0.319529 ┆ 0.566644 ┆ 0.960852 ┆ -0.133722 │\n", - "│ -0.756802 ┆ 4 ┆ a ┆ 0 ┆ … ┆ 0.170709 ┆ 0.9449 ┆ 0.640427 ┆ -0.01572 │\n", + "│ 0.0 ┆ 0 ┆ a ┆ 1 ┆ … ┆ 0.600571 ┆ 0.901676 ┆ 0.481886 ┆ -0.665053 │\n", + "│ 0.841471 ┆ 1 ┆ a ┆ 1 ┆ … ┆ 0.448908 ┆ 0.795043 ┆ 0.100833 ┆ -0.500674 │\n", + "│ 0.909297 ┆ 2 ┆ a ┆ 1 ┆ … ┆ 0.246148 ┆ 0.916253 ┆ 0.868888 ┆ -0.017706 │\n", + "│ 0.14112 ┆ 3 ┆ a ┆ 1 ┆ … ┆ 0.406564 ┆ 0.012962 ┆ 0.736355 ┆ -0.367598 │\n", + "│ -0.756802 ┆ 4 ┆ a ┆ 1 ┆ … ┆ 0.392611 ┆ 0.982307 ┆ 0.879509 ┆ -0.260554 │\n", "└───────────┴──────────┴───────┴────────┴───┴──────────┴──────────┴──────────┴───────────┘" ] }, @@ -217,7 +217,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
fab
f64f64f64
1.3944e-15-0.052451-0.331711
-0.841471-0.007514-0.500707
-0.909297-0.791996-0.645909
-0.14112-0.566644-0.960852
0.756802-0.892449-0.308716
" + "shape: (5, 3)
fab
f64f64f64
1.3944e-15-0.901676-0.481886
-0.841471-0.795043-0.100833
-0.909297-0.916253-0.868888
-0.14112-0.012962-0.736355
0.756802-0.08063-0.397623
" ], "text/plain": [ "shape: (5, 3)\n", @@ -226,11 +226,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 │\n", "╞════════════╪═══════════╪═══════════╡\n", - "│ 1.3944e-15 ┆ -0.052451 ┆ -0.331711 │\n", - "│ -0.841471 ┆ -0.007514 ┆ -0.500707 │\n", - "│ -0.909297 ┆ -0.791996 ┆ -0.645909 │\n", - "│ -0.14112 ┆ -0.566644 ┆ -0.960852 │\n", - "│ 0.756802 ┆ -0.892449 ┆ -0.308716 │\n", + "│ 1.3944e-15 ┆ -0.901676 ┆ -0.481886 │\n", + "│ -0.841471 ┆ -0.795043 ┆ -0.100833 │\n", + "│ -0.909297 ┆ -0.916253 ┆ -0.868888 │\n", + "│ -0.14112 ┆ -0.012962 ┆ -0.736355 │\n", + "│ 0.756802 ┆ -0.08063 ┆ -0.397623 │\n", "└────────────┴───────────┴───────────┘" ] }, @@ -267,7 +267,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 1)
lstsq_coeffs
list[f64]
[-0.484661, -0.352412]
" + "shape: (1, 1)
lstsq_coeffs
list[f64]
[-0.506473, -0.335296]
" ], "text/plain": [ "shape: (1, 1)\n", @@ -276,7 +276,7 @@ "│ --- │\n", "│ list[f64] │\n", "╞════════════════════════╡\n", - "│ [-0.484661, -0.352412] │\n", + "│ [-0.506473, -0.335296] │\n", "└────────────────────────┘" ] }, @@ -312,7 +312,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (4, 7)
featuresbetastd_errtp>|t|0.0250.975
strf64f64f64f64f64f64
"ln(x1+1)"0.2189580.001674130.7627530.00.2156750.22224
"exp(x2)"0.1745190.000674258.9592520.00.1731980.17584
"sin(x3)"-1.7427670.001337-1303.0281690.0-1.745389-1.740145
"__bias__"-0.1078590.001491-72.3221360.0-0.110782-0.104935
" + "shape: (4, 7)
featuresbetastd_errtp>|t|0.0250.975
strf64f64f64f64f64f64
"ln(x1+1)"0.2178160.001697128.343610.00.214490.221143
"exp(x2)"0.1752030.00068257.7435390.00.173870.176535
"sin(x3)"-1.7454580.00135-1292.9109810.0-1.748104-1.742812
"__bias__"-0.1078410.001514-71.2099470.0-0.110809-0.104872
" ], "text/plain": [ "shape: (4, 7)\n", @@ -321,10 +321,10 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪═══════════╪══════════╪══════════════╪═══════╪═══════════╪═══════════╡\n", - "│ ln(x1+1) ┆ 0.218958 ┆ 0.001674 ┆ 130.762753 ┆ 0.0 ┆ 0.215675 ┆ 0.22224 │\n", - "│ exp(x2) ┆ 0.174519 ┆ 0.000674 ┆ 258.959252 ┆ 0.0 ┆ 0.173198 ┆ 0.17584 │\n", - "│ sin(x3) ┆ -1.742767 ┆ 0.001337 ┆ -1303.028169 ┆ 0.0 ┆ -1.745389 ┆ -1.740145 │\n", - "│ __bias__ ┆ -0.107859 ┆ 0.001491 ┆ -72.322136 ┆ 0.0 ┆ -0.110782 ┆ -0.104935 │\n", + "│ ln(x1+1) ┆ 0.217816 ┆ 0.001697 ┆ 128.34361 ┆ 0.0 ┆ 0.21449 ┆ 0.221143 │\n", + "│ exp(x2) ┆ 0.175203 ┆ 0.00068 ┆ 257.743539 ┆ 0.0 ┆ 0.17387 ┆ 0.176535 │\n", + "│ sin(x3) ┆ -1.745458 ┆ 0.00135 ┆ -1292.910981 ┆ 0.0 ┆ -1.748104 ┆ -1.742812 │\n", + "│ __bias__ ┆ -0.107841 ┆ 0.001514 ┆ -71.209947 ┆ 0.0 ┆ -0.110809 ┆ -0.104872 │\n", "└──────────┴───────────┴──────────┴──────────────┴───────┴───────────┴───────────┘" ] }, @@ -361,7 +361,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 1)
lstsq_coeffs
list[f64]
[-0.484661, -0.352412]
" + "shape: (1, 1)
lstsq_coeffs
list[f64]
[-0.506473, -0.335296]
" ], "text/plain": [ "shape: (1, 1)\n", @@ -370,7 +370,7 @@ "│ --- │\n", "│ list[f64] │\n", "╞════════════════════════╡\n", - "│ [-0.484661, -0.352412] │\n", + "│ [-0.506473, -0.335296] │\n", "└────────────────────────┘" ] }, @@ -405,27 +405,27 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (10_000, 2)
dummylstsq_coeffs
strlist[f64]
"a"[-0.46771, -0.37172]
"a"[-0.46771, -0.37172]
"a"[-0.46771, -0.37172]
"a"[-0.46771, -0.37172]
"a"[-0.46771, -0.37172]
"b"[-0.501249, -0.33307]
"b"[-0.501249, -0.33307]
"b"[-0.501249, -0.33307]
"b"[-0.501249, -0.33307]
"b"[-0.501249, -0.33307]
" + "shape: (10_000, 2)
dummylstsq_coeffs
strlist[f64]
"a"[-0.49053, -0.350722]
"a"[-0.49053, -0.350722]
"a"[-0.49053, -0.350722]
"a"[-0.49053, -0.350722]
"a"[-0.49053, -0.350722]
"b"[-0.522734, -0.319316]
"b"[-0.522734, -0.319316]
"b"[-0.522734, -0.319316]
"b"[-0.522734, -0.319316]
"b"[-0.522734, -0.319316]
" ], "text/plain": [ "shape: (10_000, 2)\n", - "┌───────┬───────────────────────┐\n", - "│ dummy ┆ lstsq_coeffs │\n", - "│ --- ┆ --- │\n", - "│ str ┆ list[f64] │\n", - "╞═══════╪═══════════════════════╡\n", - "│ a ┆ [-0.46771, -0.37172] │\n", - "│ a ┆ [-0.46771, -0.37172] │\n", - "│ a ┆ [-0.46771, -0.37172] │\n", - "│ a ┆ [-0.46771, -0.37172] │\n", - "│ a ┆ [-0.46771, -0.37172] │\n", - "│ … ┆ … │\n", - "│ b ┆ [-0.501249, -0.33307] │\n", - "│ b ┆ [-0.501249, -0.33307] │\n", - "│ b ┆ [-0.501249, -0.33307] │\n", - "│ b ┆ [-0.501249, -0.33307] │\n", - "│ b ┆ [-0.501249, -0.33307] │\n", - "└───────┴───────────────────────┘" + "┌───────┬────────────────────────┐\n", + "│ dummy ┆ lstsq_coeffs │\n", + "│ --- ┆ --- │\n", + "│ str ┆ list[f64] │\n", + "╞═══════╪════════════════════════╡\n", + "│ a ┆ [-0.49053, -0.350722] │\n", + "│ a ┆ [-0.49053, -0.350722] │\n", + "│ a ┆ [-0.49053, -0.350722] │\n", + "│ a ┆ [-0.49053, -0.350722] │\n", + "│ a ┆ [-0.49053, -0.350722] │\n", + "│ … ┆ … │\n", + "│ b ┆ [-0.522734, -0.319316] │\n", + "│ b ┆ [-0.522734, -0.319316] │\n", + "│ b ┆ [-0.522734, -0.319316] │\n", + "│ b ┆ [-0.522734, -0.319316] │\n", + "│ b ┆ [-0.522734, -0.319316] │\n", + "└───────┴────────────────────────┘" ] }, "execution_count": 10, @@ -460,7 +460,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 5)
x1x2ypredresid
f64f64f64f64f64
0.5501060.663055-0.69266-0.500284-0.192377
0.0046390.8882840.025568-0.3152910.340859
0.2751450.2929-0.997863-0.236574-0.761289
0.5565640.873322-0.133722-0.5775150.443793
0.2909040.655452-0.01572-0.3719790.356259
" + "shape: (5, 5)
x1x2ypredresid
f64f64f64f64f64
0.9725640.299432-0.665053-0.592975-0.072078
0.0559680.547583-0.500674-0.211949-0.288725
0.8667580.7382-0.017706-0.6865040.668798
0.1541840.730104-0.367598-0.322891-0.044707
0.2610950.963928-0.260554-0.4554380.194884
" ], "text/plain": [ "shape: (5, 5)\n", @@ -469,11 +469,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪══════════╪═══════════╪═══════════╪═══════════╡\n", - "│ 0.550106 ┆ 0.663055 ┆ -0.69266 ┆ -0.500284 ┆ -0.192377 │\n", - "│ 0.004639 ┆ 0.888284 ┆ 0.025568 ┆ -0.315291 ┆ 0.340859 │\n", - "│ 0.275145 ┆ 0.2929 ┆ -0.997863 ┆ -0.236574 ┆ -0.761289 │\n", - "│ 0.556564 ┆ 0.873322 ┆ -0.133722 ┆ -0.577515 ┆ 0.443793 │\n", - "│ 0.290904 ┆ 0.655452 ┆ -0.01572 ┆ -0.371979 ┆ 0.356259 │\n", + "│ 0.972564 ┆ 0.299432 ┆ -0.665053 ┆ -0.592975 ┆ -0.072078 │\n", + "│ 0.055968 ┆ 0.547583 ┆ -0.500674 ┆ -0.211949 ┆ -0.288725 │\n", + "│ 0.866758 ┆ 0.7382 ┆ -0.017706 ┆ -0.686504 ┆ 0.668798 │\n", + "│ 0.154184 ┆ 0.730104 ┆ -0.367598 ┆ -0.322891 ┆ -0.044707 │\n", + "│ 0.261095 ┆ 0.963928 ┆ -0.260554 ┆ -0.455438 ┆ 0.194884 │\n", "└──────────┴──────────┴───────────┴───────────┴───────────┘" ] }, @@ -513,18 +513,18 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
dummylstsq_coeffs
strlist[f64]
"a"[-0.46771, -0.37172]
"b"[-0.501249, -0.33307]
" + "shape: (2, 2)
dummylstsq_coeffs
strlist[f64]
"a"[-0.49053, -0.350722]
"b"[-0.522734, -0.319316]
" ], "text/plain": [ "shape: (2, 2)\n", - "┌───────┬───────────────────────┐\n", - "│ dummy ┆ lstsq_coeffs │\n", - "│ --- ┆ --- │\n", - "│ str ┆ list[f64] │\n", - "╞═══════╪═══════════════════════╡\n", - "│ a ┆ [-0.46771, -0.37172] │\n", - "│ b ┆ [-0.501249, -0.33307] │\n", - "└───────┴───────────────────────┘" + "┌───────┬────────────────────────┐\n", + "│ dummy ┆ lstsq_coeffs │\n", + "│ --- ┆ --- │\n", + "│ str ┆ list[f64] │\n", + "╞═══════╪════════════════════════╡\n", + "│ a ┆ [-0.49053, -0.350722] │\n", + "│ b ┆ [-0.522734, -0.319316] │\n", + "└───────┴────────────────────────┘" ] }, "execution_count": 12, @@ -558,7 +558,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
dummylstsq_coeffs
strlist[f64]
"a"[-0.298076, -0.20613]
"b"[-0.334865, -0.150524]
" + "shape: (2, 2)
dummylstsq_coeffs
strlist[f64]
"b"[-0.352896, -0.148746]
"a"[-0.310077, -0.187359]
" ], "text/plain": [ "shape: (2, 2)\n", @@ -567,8 +567,8 @@ "│ --- ┆ --- │\n", "│ str ┆ list[f64] │\n", "╞═══════╪════════════════════════╡\n", - "│ a ┆ [-0.298076, -0.20613] │\n", - "│ b ┆ [-0.334865, -0.150524] │\n", + "│ b ┆ [-0.352896, -0.148746] │\n", + "│ a ┆ [-0.310077, -0.187359] │\n", "└───────┴────────────────────────┘" ] }, @@ -605,7 +605,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
dummylasso_r2
strf64
"b"-0.533325
"a"-0.537092
" + "shape: (2, 2)
dummylasso_r2
strf64
"b"-0.530366
"a"-0.547831
" ], "text/plain": [ "shape: (2, 2)\n", @@ -614,8 +614,8 @@ "│ --- ┆ --- │\n", "│ str ┆ f64 │\n", "╞═══════╪═══════════╡\n", - "│ b ┆ -0.533325 │\n", - "│ a ┆ -0.537092 │\n", + "│ b ┆ -0.530366 │\n", + "│ a ┆ -0.547831 │\n", "└───────┴───────────┘" ] }, @@ -656,27 +656,27 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (10_000, 5)
yx1x2coeffspred
f64f64f64list[f64]f64
-0.692660.5501060.663055nullnull
0.0255680.0046390.888284nullnull
-0.9978630.2751450.2929nullnull
-0.1337220.5565640.873322nullnull
-0.015720.2909040.655452[-1.311733, 0.247865]-0.219125
-1.0808430.1095520.899619[-0.532829, -0.86932]-0.840429
-0.8449090.2312570.687855[0.438977, -1.353505]-0.829498
0.121670.6089520.322586[0.606546, -1.421551]-0.089216
-1.1009910.1944160.953846[0.75856, -1.392892]-1.181129
-0.1073730.3494280.481579[1.019164, -1.360391]-0.299012
" + "shape: (10_000, 5)
yx1x2coeffspred
f64f64f64list[f64]f64
-0.6650530.9725640.299432nullnull
-0.5006740.0559680.547583nullnull
-0.0177060.8667580.7382nullnull
-0.3675980.1541840.730104nullnull
-0.2605540.2610950.963928[-0.244051, -0.284693]-0.338143
-1.0924410.3268690.779862[-1.209222, -0.168168]-0.526405
-0.533140.7224380.79217[-0.959132, -0.162519]-0.821656
-1.341070.135050.312516[0.412189, -0.97953]-0.250452
-0.1835820.245260.816613[-0.189443, -0.693278]-0.612602
-0.5753890.695830.454125[-0.146486, -0.86184]-0.493313
" ], "text/plain": [ "shape: (10_000, 5)\n", - "┌───────────┬──────────┬──────────┬───────────────────────┬───────────┐\n", - "│ y ┆ x1 ┆ x2 ┆ coeffs ┆ pred │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ f64 ┆ f64 ┆ f64 ┆ list[f64] ┆ f64 │\n", - "╞═══════════╪══════════╪══════════╪═══════════════════════╪═══════════╡\n", - "│ -0.69266 ┆ 0.550106 ┆ 0.663055 ┆ null ┆ null │\n", - "│ 0.025568 ┆ 0.004639 ┆ 0.888284 ┆ null ┆ null │\n", - "│ -0.997863 ┆ 0.275145 ┆ 0.2929 ┆ null ┆ null │\n", - "│ -0.133722 ┆ 0.556564 ┆ 0.873322 ┆ null ┆ null │\n", - "│ -0.01572 ┆ 0.290904 ┆ 0.655452 ┆ [-1.311733, 0.247865] ┆ -0.219125 │\n", - "│ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ -1.080843 ┆ 0.109552 ┆ 0.899619 ┆ [-0.532829, -0.86932] ┆ -0.840429 │\n", - "│ -0.844909 ┆ 0.231257 ┆ 0.687855 ┆ [0.438977, -1.353505] ┆ -0.829498 │\n", - "│ 0.12167 ┆ 0.608952 ┆ 0.322586 ┆ [0.606546, -1.421551] ┆ -0.089216 │\n", - "│ -1.100991 ┆ 0.194416 ┆ 0.953846 ┆ [0.75856, -1.392892] ┆ -1.181129 │\n", - "│ -0.107373 ┆ 0.349428 ┆ 0.481579 ┆ [1.019164, -1.360391] ┆ -0.299012 │\n", - "└───────────┴──────────┴──────────┴───────────────────────┴───────────┘" + "┌───────────┬──────────┬──────────┬────────────────────────┬───────────┐\n", + "│ y ┆ x1 ┆ x2 ┆ coeffs ┆ pred │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ list[f64] ┆ f64 │\n", + "╞═══════════╪══════════╪══════════╪════════════════════════╪═══════════╡\n", + "│ -0.665053 ┆ 0.972564 ┆ 0.299432 ┆ null ┆ null │\n", + "│ -0.500674 ┆ 0.055968 ┆ 0.547583 ┆ null ┆ null │\n", + "│ -0.017706 ┆ 0.866758 ┆ 0.7382 ┆ null ┆ null │\n", + "│ -0.367598 ┆ 0.154184 ┆ 0.730104 ┆ null ┆ null │\n", + "│ -0.260554 ┆ 0.261095 ┆ 0.963928 ┆ [-0.244051, -0.284693] ┆ -0.338143 │\n", + "│ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ -1.092441 ┆ 0.326869 ┆ 0.779862 ┆ [-1.209222, -0.168168] ┆ -0.526405 │\n", + "│ -0.53314 ┆ 0.722438 ┆ 0.79217 ┆ [-0.959132, -0.162519] ┆ -0.821656 │\n", + "│ -1.34107 ┆ 0.13505 ┆ 0.312516 ┆ [0.412189, -0.97953] ┆ -0.250452 │\n", + "│ -0.183582 ┆ 0.24526 ┆ 0.816613 ┆ [-0.189443, -0.693278] ┆ -0.612602 │\n", + "│ -0.575389 ┆ 0.69583 ┆ 0.454125 ┆ [-0.146486, -0.86184] ┆ -0.493313 │\n", + "└───────────┴──────────┴──────────┴────────────────────────┴───────────┘" ] }, "execution_count": 15, @@ -756,7 +756,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 1)
a
list[f64]
[29.067812, 28.980247, 28.665133]
" + "shape: (1, 1)
a
list[f64]
[29.176906, 28.875477, 28.619583]
" ], "text/plain": [ "shape: (1, 1)\n", @@ -765,7 +765,7 @@ "│ --- │\n", "│ list[f64] │\n", "╞═════════════════════════════════╡\n", - "│ [29.067812, 28.980247, 28.6651… │\n", + "│ [29.176906, 28.875477, 28.6195… │\n", "└─────────────────────────────────┘" ] }, @@ -797,7 +797,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
singular_valueweight_vector
f64list[f64]
29.024322[0.709965, 0.704237]
28.897161[-0.704237, 0.709965]
" + "shape: (2, 2)
singular_valueweight_vector
f64list[f64]
29.139846[0.981344, -0.192261]
28.771057[0.192261, 0.981344]
" ], "text/plain": [ "shape: (2, 2)\n", @@ -806,8 +806,8 @@ "│ --- ┆ --- │\n", "│ f64 ┆ list[f64] │\n", "╞════════════════╪═══════════════════════╡\n", - "│ 29.024322 ┆ [0.709965, 0.704237] │\n", - "│ 28.897161 ┆ [-0.704237, 0.709965] │\n", + "│ 29.139846 ┆ [0.981344, -0.192261] │\n", + "│ 28.771057 ┆ [0.192261, 0.981344] │\n", "└────────────────┴───────────────────────┘" ] }, @@ -839,7 +839,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
pc1
f64
-0.439477
-0.352367
0.306843
0.368646
0.411539
" + "shape: (5, 1)
pc1
f64
0.39868
0.367297
0.33858
-0.522379
0.401359
" ], "text/plain": [ "shape: (5, 1)\n", @@ -848,11 +848,11 @@ "│ --- │\n", "│ f64 │\n", "╞═══════════╡\n", - "│ -0.439477 │\n", - "│ -0.352367 │\n", - "│ 0.306843 │\n", - "│ 0.368646 │\n", - "│ 0.411539 │\n", + "│ 0.39868 │\n", + "│ 0.367297 │\n", + "│ 0.33858 │\n", + "│ -0.522379 │\n", + "│ 0.401359 │\n", "└───────────┘" ] }, @@ -892,7 +892,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 8)
dummy_groupsl2log lossprecisionrecallfaverage_precisionroc_auc
strf64f64f64f64f64f64f64
"b"0.3338851.0011430.5043930.5047960.5045940.5041590.501021
"a"0.3332070.9898030.4928480.4868790.4898460.5026030.500806
" + "shape: (2, 8)
dummy_groupsl2log lossprecisionrecallfaverage_precisionroc_auc
strf64f64f64f64f64f64f64
"b"0.3353091.0004730.5135570.4937740.5034720.5164590.499138
"a"0.3354211.0015620.4960630.4770830.4863880.4991390.495529
" ], "text/plain": [ "shape: (2, 8)\n", @@ -902,8 +902,8 @@ "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ --- ┆ f64 │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ │\n", "╞══════════════╪══════════╪══════════╪═══════════╪══════════╪══════════╪════════════════╪══════════╡\n", - "│ b ┆ 0.333885 ┆ 1.001143 ┆ 0.504393 ┆ 0.504796 ┆ 0.504594 ┆ 0.504159 ┆ 0.501021 │\n", - "│ a ┆ 0.333207 ┆ 0.989803 ┆ 0.492848 ┆ 0.486879 ┆ 0.489846 ┆ 0.502603 ┆ 0.500806 │\n", + "│ b ┆ 0.335309 ┆ 1.000473 ┆ 0.513557 ┆ 0.493774 ┆ 0.503472 ┆ 0.516459 ┆ 0.499138 │\n", + "│ a ┆ 0.335421 ┆ 1.001562 ┆ 0.496063 ┆ 0.477083 ┆ 0.486388 ┆ 0.499139 ┆ 0.495529 │\n", "└──────────────┴──────────┴──────────┴───────────┴──────────┴──────────┴────────────────┴──────────┘" ] }, @@ -991,7 +991,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
sen
str
"church"
"going"
"hello"
"world"
"to"
" + "shape: (5, 1)
sen
str
"hello"
"church"
"going"
"world"
"to"
" ], "text/plain": [ "shape: (5, 1)\n", @@ -1000,9 +1000,9 @@ "│ --- │\n", "│ str │\n", "╞════════╡\n", + "│ hello │\n", "│ church │\n", "│ going │\n", - "│ hello │\n", "│ world │\n", "│ to │\n", "└────────┘" @@ -1036,7 +1036,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
sen
str
"hello"
"world"
"go"
""
"church"
" + "shape: (5, 1)
sen
str
"go"
"hello"
""
"church"
"world"
" ], "text/plain": [ "shape: (5, 1)\n", @@ -1045,11 +1045,11 @@ "│ --- │\n", "│ str │\n", "╞════════╡\n", - "│ hello │\n", - "│ world │\n", "│ go │\n", + "│ hello │\n", "│ │\n", "│ church │\n", + "│ world │\n", "└────────┘" ] }, @@ -1419,7 +1419,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
a
f64
null
null
2.764739
-1.3346
1.103795
" + "shape: (5, 1)
a
f64
null
null
1.637908
0.39391
1.135519
" ], "text/plain": [ "shape: (5, 1)\n", @@ -1430,9 +1430,9 @@ "╞══════════╡\n", "│ null │\n", "│ null │\n", - "│ 2.764739 │\n", - "│ -1.3346 │\n", - "│ 1.103795 │\n", + "│ 1.637908 │\n", + "│ 0.39391 │\n", + "│ 1.135519 │\n", "└──────────┘" ] }, @@ -1466,7 +1466,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
arandom_normalrandom_normal_that_respects_null_of_a
f64f64f64
null-0.287753null
null1.137891null
2.7647392.052795-1.09632
-1.3346-1.83142-0.259511
1.1037951.3496151.692965
" + "shape: (5, 3)
arandom_normalrandom_normal_that_respects_null_of_a
f64f64f64
null0.984349null
null1.268442null
1.6379081.8294651.125404
0.393910.2585541.03578
1.135519-0.2666871.180521
" ], "text/plain": [ "shape: (5, 3)\n", @@ -1475,11 +1475,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪═══════════════╪═════════════════════════════════╡\n", - "│ null ┆ -0.287753 ┆ null │\n", - "│ null ┆ 1.137891 ┆ null │\n", - "│ 2.764739 ┆ 2.052795 ┆ -1.09632 │\n", - "│ -1.3346 ┆ -1.83142 ┆ -0.259511 │\n", - "│ 1.103795 ┆ 1.349615 ┆ 1.692965 │\n", + "│ null ┆ 0.984349 ┆ null │\n", + "│ null ┆ 1.268442 ┆ null │\n", + "│ 1.637908 ┆ 1.829465 ┆ 1.125404 │\n", + "│ 0.39391 ┆ 0.258554 ┆ 1.03578 │\n", + "│ 1.135519 ┆ -0.266687 ┆ 1.180521 │\n", "└──────────┴───────────────┴─────────────────────────────────┘" ] }, @@ -1514,7 +1514,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
arandom_strrandom_str_that_respects_null_of_a
f64strstr
null"NMw"null
null"3vbZ"null
2.764739"h8So9""TvB9a"
-1.3346"hw""TD4"
1.103795"2Znps""V"
" + "shape: (5, 3)
arandom_strrandom_str_that_respects_null_of_a
f64strstr
null"78"null
null"TJsSs"null
1.637908"ZCXwr""R"
0.39391"iXS""rMIb"
1.135519"1g""Z"
" ], "text/plain": [ "shape: (5, 3)\n", @@ -1523,11 +1523,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ f64 ┆ str ┆ str │\n", "╞══════════╪════════════╪═════════════════════════════════╡\n", - "│ null ┆ NMw ┆ null │\n", - "│ null ┆ 3vbZ ┆ null │\n", - "│ 2.764739 ┆ h8So9 ┆ TvB9a │\n", - "│ -1.3346 ┆ hw ┆ TD4 │\n", - "│ 1.103795 ┆ 2Znps ┆ V │\n", + "│ null ┆ 78 ┆ null │\n", + "│ null ┆ TJsSs ┆ null │\n", + "│ 1.637908 ┆ ZCXwr ┆ R │\n", + "│ 0.39391 ┆ iXS ┆ rMIb │\n", + "│ 1.135519 ┆ 1g ┆ Z │\n", "└──────────┴────────────┴─────────────────────────────────┘" ] }, @@ -1562,7 +1562,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 2)
arandom_str
f64str
nullnull
nullnull
2.764739"BvTXp"
-1.3346"x0cZn"
1.103795"xih81"
" + "shape: (5, 2)
arandom_str
f64str
nullnull
nullnull
1.637908"HwXg9"
0.39391"bwxYr"
1.135519"KBLag"
" ], "text/plain": [ "shape: (5, 2)\n", @@ -1573,9 +1573,9 @@ "╞══════════╪════════════╡\n", "│ null ┆ null │\n", "│ null ┆ null │\n", - "│ 2.764739 ┆ BvTXp │\n", - "│ -1.3346 ┆ x0cZn │\n", - "│ 1.103795 ┆ xih81 │\n", + "│ 1.637908 ┆ HwXg9 │\n", + "│ 0.39391 ┆ bwxYr │\n", + "│ 1.135519 ┆ KBLag │\n", "└──────────┴────────────┘" ] }, @@ -1609,7 +1609,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 4)
atest1literaltest1_perturbed
f64f64f64f64
null1.343124null1.342837
null0.207816null0.208056
2.7647390.3661282.3819080.36599
-1.33461.7660081.6578591.766105
1.103795-0.4240221.071486-0.424504
" + "shape: (5, 4)
atest1literaltest1_perturbed
f64f64f64f64
null-0.999841null-1.00034
null-0.014981null-0.014588
1.637908-0.0453080.906578-0.044914
0.39391-0.4984351.22988-0.498401
1.135519-1.3512061.429987-1.351569
" ], "text/plain": [ "shape: (5, 4)\n", @@ -1618,11 +1618,11 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪═══════════╪══════════╪═════════════════╡\n", - "│ null ┆ 1.343124 ┆ null ┆ 1.342837 │\n", - "│ null ┆ 0.207816 ┆ null ┆ 0.208056 │\n", - "│ 2.764739 ┆ 0.366128 ┆ 2.381908 ┆ 0.36599 │\n", - "│ -1.3346 ┆ 1.766008 ┆ 1.657859 ┆ 1.766105 │\n", - "│ 1.103795 ┆ -0.424022 ┆ 1.071486 ┆ -0.424504 │\n", + "│ null ┆ -0.999841 ┆ null ┆ -1.00034 │\n", + "│ null ┆ -0.014981 ┆ null ┆ -0.014588 │\n", + "│ 1.637908 ┆ -0.045308 ┆ 0.906578 ┆ -0.044914 │\n", + "│ 0.39391 ┆ -0.498435 ┆ 1.22988 ┆ -0.498401 │\n", + "│ 1.135519 ┆ -1.351206 ┆ 1.429987 ┆ -1.351569 │\n", "└──────────┴───────────┴──────────┴─────────────────┘" ] }, @@ -1661,7 +1661,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 4)
a[0, 1)NormalInt from [0, 10)
f64f64f64i32
null0.7583160.202447
null0.491458-0.7256068
2.7647390.1917771.0353725
-1.33460.08989-1.2256233
1.1037950.8917581.5283855
" + "shape: (5, 4)
a[0, 1)NormalInt from [0, 10)
f64f64f64i32
null0.355994-0.038333
null0.8022050.6755713
1.6379080.937455-0.4232577
0.393910.9090490.4197916
1.1355190.7600012.5511087
" ], "text/plain": [ "shape: (5, 4)\n", @@ -1670,11 +1670,11 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ i32 │\n", "╞══════════╪══════════╪═══════════╪══════════════════╡\n", - "│ null ┆ 0.758316 ┆ 0.20244 ┆ 7 │\n", - "│ null ┆ 0.491458 ┆ -0.725606 ┆ 8 │\n", - "│ 2.764739 ┆ 0.191777 ┆ 1.035372 ┆ 5 │\n", - "│ -1.3346 ┆ 0.08989 ┆ -1.225623 ┆ 3 │\n", - "│ 1.103795 ┆ 0.891758 ┆ 1.528385 ┆ 5 │\n", + "│ null ┆ 0.355994 ┆ -0.03833 ┆ 3 │\n", + "│ null ┆ 0.802205 ┆ 0.675571 ┆ 3 │\n", + "│ 1.637908 ┆ 0.937455 ┆ -0.423257 ┆ 7 │\n", + "│ 0.39391 ┆ 0.909049 ┆ 0.419791 ┆ 6 │\n", + "│ 1.135519 ┆ 0.760001 ┆ 2.551108 ┆ 7 │\n", "└──────────┴──────────┴───────────┴──────────────────┘" ] }, @@ -1709,7 +1709,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 4)
t-tests: statisticst-tests: pvaluenormality_test: statisticsnormality_test: pvalue
f64f64f64f64
0.6573040.5110870.3602160.83518
" + "shape: (1, 4)
t-tests: statisticst-tests: pvaluenormality_test: statisticsnormality_test: pvalue
f64f64f64f64
0.3239340.74603411.0044120.004078
" ], "text/plain": [ "shape: (1, 4)\n", @@ -1718,7 +1718,7 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════╪═════════════════╪════════════════════════════╪════════════════════════╡\n", - "│ 0.657304 ┆ 0.511087 ┆ 0.360216 ┆ 0.83518 │\n", + "│ 0.323934 ┆ 0.746034 ┆ 11.004412 ┆ 0.004078 │\n", "└─────────────────────┴─────────────────┴────────────────────────────┴────────────────────────┘" ] }, @@ -1736,7 +1736,7 @@ " pds.random_normal(0.5, 1.0).alias(\"test1\"),\n", " pds.random_normal(0.5, 2.0).alias(\"test2\"),\n", ").select(\n", - " pds.query_ttest_ind(\"test1\", \"test2\", equal_var=False).alias(\"t-test\"),\n", + " pds.ttest_ind(\"test1\", \"test2\", equal_var=False).alias(\"t-test\"),\n", " pds.normal_test(\"test1\").alias(\"normality_test\")\n", ").select(\n", " pl.col(\"t-test\").struct.field(\"statistic\").alias(\"t-tests: statistics\")\n", @@ -1762,7 +1762,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 5)
market_idvar1var2category_1category_2
i64f64f64i32i32
00.0955260.21979448
10.5029570.12798615
20.0295820.80817710
00.4240240.3901416
10.0616540.392548
" + "shape: (5, 5)
market_idvar1var2category_1category_2
i64f64f64i32i32
00.5532790.87215201
10.8999390.27161405
20.875960.82702241
00.0697820.64796217
10.57160.35766524
" ], "text/plain": [ "shape: (5, 5)\n", @@ -1771,11 +1771,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ f64 ┆ i32 ┆ i32 │\n", "╞═══════════╪══════════╪══════════╪════════════╪════════════╡\n", - "│ 0 ┆ 0.095526 ┆ 0.219794 ┆ 4 ┆ 8 │\n", - "│ 1 ┆ 0.502957 ┆ 0.127986 ┆ 1 ┆ 5 │\n", - "│ 2 ┆ 0.029582 ┆ 0.808177 ┆ 1 ┆ 0 │\n", - "│ 0 ┆ 0.424024 ┆ 0.39014 ┆ 1 ┆ 6 │\n", - "│ 1 ┆ 0.061654 ┆ 0.3925 ┆ 4 ┆ 8 │\n", + "│ 0 ┆ 0.553279 ┆ 0.872152 ┆ 0 ┆ 1 │\n", + "│ 1 ┆ 0.899939 ┆ 0.271614 ┆ 0 ┆ 5 │\n", + "│ 2 ┆ 0.87596 ┆ 0.827022 ┆ 4 ┆ 1 │\n", + "│ 0 ┆ 0.069782 ┆ 0.647962 ┆ 1 ┆ 7 │\n", + "│ 1 ┆ 0.5716 ┆ 0.357665 ┆ 2 ┆ 4 │\n", "└───────────┴──────────┴──────────┴────────────┴────────────┘" ] }, @@ -1815,17 +1815,17 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 3)
t-testchi2-testf-test
struct[2]struct[2]struct[2]
{-0.584836,0.558671}{41.626592,0.239132}{0.368606,0.831148}
" + "shape: (1, 3)
t-testchi2-testf-test
struct[2]struct[2]struct[2]
{0.331295,0.740429}{41.700303,0.236697}{0.667292,0.614652}
" ], "text/plain": [ "shape: (1, 3)\n", - "┌──────────────────────┬──────────────────────┬─────────────────────┐\n", - "│ t-test ┆ chi2-test ┆ f-test │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ struct[2] ┆ struct[2] ┆ struct[2] │\n", - "╞══════════════════════╪══════════════════════╪═════════════════════╡\n", - "│ {-0.584836,0.558671} ┆ {41.626592,0.239132} ┆ {0.368606,0.831148} │\n", - "└──────────────────────┴──────────────────────┴─────────────────────┘" + "┌─────────────────────┬──────────────────────┬─────────────────────┐\n", + "│ t-test ┆ chi2-test ┆ f-test │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ struct[2] ┆ struct[2] ┆ struct[2] │\n", + "╞═════════════════════╪══════════════════════╪═════════════════════╡\n", + "│ {0.331295,0.740429} ┆ {41.700303,0.236697} ┆ {0.667292,0.614652} │\n", + "└─────────────────────┴──────────────────────┴─────────────────────┘" ] }, "execution_count": 40, @@ -1836,9 +1836,9 @@ "source": [ "# In dataframe statistical tests!\n", "df.select(\n", - " pds.query_ttest_ind(\"var1\", \"var2\", equal_var=True).alias(\"t-test\"),\n", - " pds.query_chi2(\"category_1\", \"category_2\").alias(\"chi2-test\"),\n", - " pds.query_f_test(\"var1\", group = \"category_1\").alias(\"f-test\")\n", + " pds.ttest_ind(\"var1\", \"var2\", equal_var=True).alias(\"t-test\"),\n", + " pds.chi2(\"category_1\", \"category_2\").alias(\"chi2-test\"),\n", + " pds.f_test(\"var1\", group = \"category_1\").alias(\"f-test\")\n", ")" ] }, @@ -1858,9 +1858,9 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │\n", "╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡\n", - "│ 0 ┆ {-0.096041,0.923494} ┆ {44.646803,0.152806} ┆ {0.806849,0.520709} │\n", - "│ 1 ┆ {0.030178,0.975927} ┆ {42.866921,0.200349} ┆ {0.747775,0.559465} │\n", - "│ 2 ┆ {-0.952252,0.341039} ┆ {32.888416,0.617373} ┆ {0.888775,0.4698} │\n", + "│ 0 ┆ {-0.039752,0.968293} ┆ {31.876725,0.665088} ┆ {0.93687,0.441541} │\n", + "│ 1 ┆ {0.465675,0.641479} ┆ {35.629246,0.486083} ┆ {0.071499,0.990691} │\n", + "│ 2 ┆ {0.14725,0.882944} ┆ {44.042169,0.167903} ┆ {0.955711,0.430808} │\n", "└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘\n" ] } @@ -1869,9 +1869,9 @@ "# Can also be done in group by context\n", "print(\n", " df.group_by(\"market_id\").agg(\n", - " pds.query_ttest_ind(\"var1\", \"var2\", equal_var=False).alias(\"t-test\"),\n", - " pds.query_chi2(\"category_1\", \"category_2\").alias(\"chi2-test\"),\n", - " pds.query_f_test(\"var1\", group = \"category_1\").alias(\"f-test\")\n", + " pds.ttest_ind(\"var1\", \"var2\", equal_var=False).alias(\"t-test\"),\n", + " pds.chi2(\"category_1\", \"category_2\").alias(\"chi2-test\"),\n", + " pds.f_test(\"var1\", group = \"category_1\").alias(\"f-test\")\n", " )\n", ")" ] @@ -1892,7 +1892,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (9, 2)
first_digit_cntfirst_digit_distribution
u32f64
5530.1106
5880.1176
5490.1098
5930.1186
5630.1126
5910.1182
5170.1034
5350.107
5110.1022
" + "shape: (9, 2)
first_digit_cntfirst_digit_distribution
u32f64
5440.1088
5380.1076
5460.1092
6030.1206
5370.1074
5440.1088
5550.111
5770.1154
5560.1112
" ], "text/plain": [ "shape: (9, 2)\n", @@ -1901,15 +1901,15 @@ "│ --- ┆ --- │\n", "│ u32 ┆ f64 │\n", "╞═════════════════╪══════════════════════════╡\n", - "│ 553 ┆ 0.1106 │\n", - "│ 588 ┆ 0.1176 │\n", - "│ 549 ┆ 0.1098 │\n", - "│ 593 ┆ 0.1186 │\n", - "│ 563 ┆ 0.1126 │\n", - "│ 591 ┆ 0.1182 │\n", - "│ 517 ┆ 0.1034 │\n", - "│ 535 ┆ 0.107 │\n", - "│ 511 ┆ 0.1022 │\n", + "│ 544 ┆ 0.1088 │\n", + "│ 538 ┆ 0.1076 │\n", + "│ 546 ┆ 0.1092 │\n", + "│ 603 ┆ 0.1206 │\n", + "│ 537 ┆ 0.1074 │\n", + "│ 544 ┆ 0.1088 │\n", + "│ 555 ┆ 0.111 │\n", + "│ 577 ┆ 0.1154 │\n", + "│ 556 ┆ 0.1112 │\n", "└─────────────────┴──────────────────────────┘" ] }, @@ -1975,7 +1975,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 7)
idvar1var2var3rrhnb_l_inf_cnt
u32f64f64f64f64f64u32
00.9844930.5741520.0331310.8286065.06147
10.1874970.8232860.6871180.3739367.49639516
20.3845690.8021130.3697320.3869915.97964723
30.2436920.1798020.4386910.413512.61465121
40.7624250.9300850.3751690.2489613.3330712
" + "shape: (5, 7)
idvar1var2var3rrhnb_l_inf_cnt
u32f64f64f64f64f64u32
00.6043730.1676660.1631730.3646411.93324218
10.0398750.7072290.9401020.7222328.107538
20.7081460.6652070.412720.1969221.80077115
30.0733320.3564130.6569070.7914033.91356918
40.4523090.7888150.7836610.1343798.15585216
" ], "text/plain": [ "shape: (5, 7)\n", @@ -1984,11 +1984,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ u32 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════════╡\n", - "│ 0 ┆ 0.984493 ┆ 0.574152 ┆ 0.033131 ┆ 0.828606 ┆ 5.0614 ┆ 7 │\n", - "│ 1 ┆ 0.187497 ┆ 0.823286 ┆ 0.687118 ┆ 0.373936 ┆ 7.496395 ┆ 16 │\n", - "│ 2 ┆ 0.384569 ┆ 0.802113 ┆ 0.369732 ┆ 0.386991 ┆ 5.979647 ┆ 23 │\n", - "│ 3 ┆ 0.243692 ┆ 0.179802 ┆ 0.438691 ┆ 0.41351 ┆ 2.614651 ┆ 21 │\n", - "│ 4 ┆ 0.762425 ┆ 0.930085 ┆ 0.375169 ┆ 0.248961 ┆ 3.33307 ┆ 12 │\n", + "│ 0 ┆ 0.604373 ┆ 0.167666 ┆ 0.163173 ┆ 0.364641 ┆ 1.933242 ┆ 18 │\n", + "│ 1 ┆ 0.039875 ┆ 0.707229 ┆ 0.940102 ┆ 0.722232 ┆ 8.10753 ┆ 8 │\n", + "│ 2 ┆ 0.708146 ┆ 0.665207 ┆ 0.41272 ┆ 0.196922 ┆ 1.800771 ┆ 15 │\n", + "│ 3 ┆ 0.073332 ┆ 0.356413 ┆ 0.656907 ┆ 0.791403 ┆ 3.913569 ┆ 18 │\n", + "│ 4 ┆ 0.452309 ┆ 0.788815 ┆ 0.783661 ┆ 0.134379 ┆ 8.155852 ┆ 16 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────────┘" ] }, @@ -2025,7 +2025,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 7)
idvar1var2var3rrhnb_l1_r_cnt
u32f64f64f64f64f64u32
00.9844930.5741520.0331310.8286065.0614398
10.1874970.8232860.6871180.3739367.496395138
20.3845690.8021130.3697320.3869915.979647152
30.2436920.1798020.4386910.413512.614651146
40.7624250.9300850.3751690.2489613.3330733
" + "shape: (5, 7)
idvar1var2var3rrhnb_l1_r_cnt
u32f64f64f64f64f64u32
00.6043730.1676660.1631730.3646411.933242104
10.0398750.7072290.9401020.7222328.10753285
20.7081460.6652070.412720.1969221.80077119
30.0733320.3564130.6569070.7914033.913569627
40.4523090.7888150.7836610.1343798.1558524
" ], "text/plain": [ "shape: (5, 7)\n", @@ -2034,11 +2034,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ u32 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪═════════════╡\n", - "│ 0 ┆ 0.984493 ┆ 0.574152 ┆ 0.033131 ┆ 0.828606 ┆ 5.0614 ┆ 398 │\n", - "│ 1 ┆ 0.187497 ┆ 0.823286 ┆ 0.687118 ┆ 0.373936 ┆ 7.496395 ┆ 138 │\n", - "│ 2 ┆ 0.384569 ┆ 0.802113 ┆ 0.369732 ┆ 0.386991 ┆ 5.979647 ┆ 152 │\n", - "│ 3 ┆ 0.243692 ┆ 0.179802 ┆ 0.438691 ┆ 0.41351 ┆ 2.614651 ┆ 146 │\n", - "│ 4 ┆ 0.762425 ┆ 0.930085 ┆ 0.375169 ┆ 0.248961 ┆ 3.33307 ┆ 33 │\n", + "│ 0 ┆ 0.604373 ┆ 0.167666 ┆ 0.163173 ┆ 0.364641 ┆ 1.933242 ┆ 104 │\n", + "│ 1 ┆ 0.039875 ┆ 0.707229 ┆ 0.940102 ┆ 0.722232 ┆ 8.10753 ┆ 285 │\n", + "│ 2 ┆ 0.708146 ┆ 0.665207 ┆ 0.41272 ┆ 0.196922 ┆ 1.800771 ┆ 19 │\n", + "│ 3 ┆ 0.073332 ┆ 0.356413 ┆ 0.656907 ┆ 0.791403 ┆ 3.913569 ┆ 627 │\n", + "│ 4 ┆ 0.452309 ┆ 0.788815 ┆ 0.783661 ┆ 0.134379 ┆ 8.155852 ┆ 4 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴─────────────┘" ] }, @@ -2074,7 +2074,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 7)
[old/new HTML table render omitted — shape (5, 7): id, var1, var2, var3, r, rh, best friends (list[u32]); the same values appear in the text/plain diff below]
" ], "text/plain": [ "shape: (5, 7)\n", @@ -2083,11 +2083,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ list[u32] │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪═══════════════════╡\n", - "│ 0 ┆ 0.984493 ┆ 0.574152 ┆ 0.033131 ┆ 0.828606 ┆ 5.0614 ┆ [0, 359, … 1993] │\n", - "│ 1 ┆ 0.187497 ┆ 0.823286 ┆ 0.687118 ┆ 0.373936 ┆ 7.496395 ┆ [1, 285, … 583] │\n", - "│ 2 ┆ 0.384569 ┆ 0.802113 ┆ 0.369732 ┆ 0.386991 ┆ 5.979647 ┆ [2, 1907, … 15] │\n", - "│ 3 ┆ 0.243692 ┆ 0.179802 ┆ 0.438691 ┆ 0.41351 ┆ 2.614651 ┆ [3, 1247, … 1616] │\n", - "│ 4 ┆ 0.762425 ┆ 0.930085 ┆ 0.375169 ┆ 0.248961 ┆ 3.33307 ┆ [4, 840, … 207] │\n", + "│ 0 ┆ 0.604373 ┆ 0.167666 ┆ 0.163173 ┆ 0.364641 ┆ 1.933242 ┆ [0, 802, … 1942] │\n", + "│ 1 ┆ 0.039875 ┆ 0.707229 ┆ 0.940102 ┆ 0.722232 ┆ 8.10753 ┆ [1, 998, … 1858] │\n", + "│ 2 ┆ 0.708146 ┆ 0.665207 ┆ 0.41272 ┆ 0.196922 ┆ 1.800771 ┆ [2, 1049, … 1348] │\n", + "│ 3 ┆ 0.073332 ┆ 0.356413 ┆ 0.656907 ┆ 0.791403 ┆ 3.913569 ┆ [3, 1028, … 676] │\n", + "│ 4 ┆ 0.452309 ┆ 0.788815 ┆ 0.783661 ┆ 0.134379 ┆ 8.155852 ┆ [4, 1518, … 1587] │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴───────────────────┘" ] }, @@ -2121,17 +2121,17 @@ "output_type": "stream", "text": [ "shape: (5, 3)\n", - "┌─────┬──────────────────┬────────────────────┐\n", - "│ id ┆ best friends ┆ best friends count │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ u32 ┆ list[u32] ┆ u32 │\n", - "╞═════╪══════════════════╪════════════════════╡\n", - "│ 0 ┆ [0, 359, … 431] ┆ 4 │\n", - "│ 1 ┆ [1, 285, … 46] ┆ 8 │\n", - "│ 2 ┆ [2, 1907, … 853] ┆ 10 │\n", - "│ 3 ┆ [3, 1247, … 276] ┆ 13 │\n", - "│ 4 ┆ [4, 840, … 794] ┆ 4 │\n", - "└─────┴──────────────────┴────────────────────┘\n" + "┌─────┬───────────────────┬────────────────────┐\n", + "│ id ┆ best friends ┆ best friends count │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ list[u32] ┆ u32 │\n", + "╞═════╪═══════════════════╪════════════════════╡\n", + "│ 0 ┆ [0, 802, … 1614] ┆ 11 │\n", + "│ 1 ┆ [1, 998, … 914] ┆ 5 │\n", + "│ 2 ┆ [2, 1049, … 1726] ┆ 7 │\n", + "│ 3 ┆ [3, 1028, … 912] ┆ 12 │\n", + "│ 4 ┆ [4, 1518, … 1950] ┆ 5 │\n", + "└─────┴───────────────────┴────────────────────┘\n" ] } ], @@ -2171,7 +2171,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 8)
[old/new HTML table render omitted — shape (5, 8): id, var1, var2, var3, r, rh, idx (list[u32]), dist (list[f64]); the same values appear in the text/plain diff below]
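The `idx` / `dist` columns above come from a point-wise KNN query. `query_knn_ptwise` is listed in `expr_knn.py`'s `__all__` later in this diff, but its signature is not shown, so every keyword in the sketch below (`index=`, `k=`, `dist=`, `return_dist=`) is an assumption rather than the confirmed API:

```python
import polars as pl
import polars_ds as pds

# Hypothetical sketch only -- keyword names below are assumptions, not confirmed by this PR.
df = pl.DataFrame({"id": range(2_000)}).with_columns(
    pl.col("id").cast(pl.UInt32),
    pds.random(0.0, 1.0, seed=1).alias("var1"),
    pds.random(0.0, 1.0, seed=2).alias("var2"),
    pds.random(0.0, 1.0, seed=3).alias("var3"),
)

df.select(
    "id",
    pds.query_knn_ptwise(
        "var1", "var2", "var3",  # feature columns used for the distance
        index="id",              # assumed: column holding the row identifier
        k=3,                     # assumed: number of neighbours to return
        dist="l2",               # assumed: distance metric name
        return_dist=True,        # assumed: also return the distances, as in the cell above
    ),
)
```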
" ], "text/plain": [ "shape: (5, 8)\n", @@ -2180,16 +2180,16 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ list[u32] ┆ list[f64] │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════════════╪══════════════════╡\n", - "│ 0 ┆ 0.984493 ┆ 0.574152 ┆ 0.033131 ┆ 0.828606 ┆ 5.0614 ┆ [0, 359, … 1993] ┆ [0.0, 0.075875, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.084589] │\n", - "│ 1 ┆ 0.187497 ┆ 0.823286 ┆ 0.687118 ┆ 0.373936 ┆ 7.496395 ┆ [1, 285, … 583] ┆ [0.0, 0.066303, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.074323] │\n", - "│ 2 ┆ 0.384569 ┆ 0.802113 ┆ 0.369732 ┆ 0.386991 ┆ 5.979647 ┆ [2, 1907, … 15] ┆ [0.0, 0.034667, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.052971] │\n", - "│ 3 ┆ 0.243692 ┆ 0.179802 ┆ 0.438691 ┆ 0.41351 ┆ 2.614651 ┆ [3, 1247, … ┆ [0.0, 0.067122, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ 1616] ┆ … 0.069608] │\n", - "│ 4 ┆ 0.762425 ┆ 0.930085 ┆ 0.375169 ┆ 0.248961 ┆ 3.33307 ┆ [4, 840, … 207] ┆ [0.0, 0.082641, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.098739] │\n", + "│ 0 ┆ 0.604373 ┆ 0.167666 ┆ 0.163173 ┆ 0.364641 ┆ 1.933242 ┆ [0, 802, … 1942] ┆ [0.0, 0.029826, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.043563] │\n", + "│ 1 ┆ 0.039875 ┆ 0.707229 ┆ 0.940102 ┆ 0.722232 ┆ 8.10753 ┆ [1, 998, … 1858] ┆ [0.0, 0.044368, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.074784] │\n", + "│ 2 ┆ 0.708146 ┆ 0.665207 ┆ 0.41272 ┆ 0.196922 ┆ 1.800771 ┆ [2, 1049, … ┆ [0.0, 0.040579, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ 1348] ┆ … 0.047743] │\n", + "│ 3 ┆ 0.073332 ┆ 0.356413 ┆ 0.656907 ┆ 0.791403 ┆ 3.913569 ┆ [3, 1028, … 676] ┆ [0.0, 0.071795, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.077792] │\n", + "│ 4 ┆ 0.452309 ┆ 0.788815 ┆ 0.783661 ┆ 0.134379 ┆ 8.155852 ┆ [4, 1518, … ┆ [0.0, 0.035496, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ 1587] ┆ … 0.081266] │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────────────┴──────────────────┘" ] }, @@ -2229,7 +2229,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
[old/new HTML table render omitted — shape (5, 6): id, var1, var2, var3, r, rh; the same values appear in the text/plain diff below]
" ], "text/plain": [ "shape: (5, 6)\n", @@ -2238,11 +2238,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n", - "│ 2 ┆ 0.384569 ┆ 0.802113 ┆ 0.369732 ┆ 0.386991 ┆ 5.979647 │\n", - "│ 3 ┆ 0.243692 ┆ 0.179802 ┆ 0.438691 ┆ 0.41351 ┆ 2.614651 │\n", - "│ 9 ┆ 0.155314 ┆ 0.743213 ┆ 0.572871 ┆ 0.493723 ┆ 5.70132 │\n", - "│ 14 ┆ 0.438281 ┆ 0.466449 ┆ 0.432785 ┆ 0.841271 ┆ 7.552722 │\n", - "│ 15 ┆ 0.419596 ┆ 0.821244 ┆ 0.334902 ┆ 0.694541 ┆ 9.895788 │\n", + "│ 2 ┆ 0.708146 ┆ 0.665207 ┆ 0.41272 ┆ 0.196922 ┆ 1.800771 │\n", + "│ 4 ┆ 0.452309 ┆ 0.788815 ┆ 0.783661 ┆ 0.134379 ┆ 8.155852 │\n", + "│ 8 ┆ 0.531335 ┆ 0.710672 ┆ 0.146274 ┆ 0.031667 ┆ 2.591195 │\n", + "│ 11 ┆ 0.309524 ┆ 0.617698 ┆ 0.838848 ┆ 0.926261 ┆ 3.831364 │\n", + "│ 13 ┆ 0.135002 ┆ 0.570181 ┆ 0.528709 ┆ 0.843758 ┆ 1.314187 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘" ] }, @@ -2279,7 +2279,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
[old/new HTML table render omitted — shape (5, 6): id, var1, var2, var3, r, rh; the same values appear in the text/plain diff below]
" ], "text/plain": [ "shape: (5, 6)\n", @@ -2288,11 +2288,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n", - "│ 14 ┆ 0.438281 ┆ 0.466449 ┆ 0.432785 ┆ 0.841271 ┆ 7.552722 │\n", - "│ 35 ┆ 0.560663 ┆ 0.443125 ┆ 0.712812 ┆ 0.157677 ┆ 8.754831 │\n", - "│ 160 ┆ 0.514703 ┆ 0.556517 ┆ 0.227685 ┆ 0.56752 ┆ 2.089827 │\n", - "│ 225 ┆ 0.508374 ┆ 0.477648 ┆ 0.024097 ┆ 0.650202 ┆ 2.372914 │\n", - "│ 228 ┆ 0.449517 ┆ 0.47311 ┆ 0.917671 ┆ 0.140131 ┆ 8.827047 │\n", + "│ 58 ┆ 0.419884 ┆ 0.535903 ┆ 0.039764 ┆ 0.051981 ┆ 1.401684 │\n", + "│ 77 ┆ 0.549855 ┆ 0.534748 ┆ 0.185199 ┆ 0.274521 ┆ 5.945685 │\n", + "│ 141 ┆ 0.451263 ┆ 0.463935 ┆ 0.892009 ┆ 0.416547 ┆ 5.000283 │\n", + "│ 144 ┆ 0.584173 ┆ 0.514794 ┆ 0.289369 ┆ 0.589062 ┆ 9.860071 │\n", + "│ 198 ┆ 0.465909 ┆ 0.532728 ┆ 0.233542 ┆ 0.555194 ┆ 4.687478 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘" ] }, @@ -2329,21 +2329,21 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
[old/new HTML table render omitted — shape (5, 6): id, var1, var2, var3, r, rh; the same values appear in the text/plain diff below]
" ], "text/plain": [ "shape: (5, 6)\n", - "┌─────┬──────────┬──────────┬──────────┬──────────┬──────────┐\n", - "│ id ┆ var1 ┆ var2 ┆ var3 ┆ r ┆ rh │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n", - "│ 228 ┆ 0.449517 ┆ 0.47311 ┆ 0.917671 ┆ 0.140131 ┆ 8.827047 │\n", - "│ 433 ┆ 0.499018 ┆ 0.536497 ┆ 0.916673 ┆ 0.780058 ┆ 9.844932 │\n", - "│ 756 ┆ 0.538259 ┆ 0.503251 ┆ 0.575517 ┆ 0.612985 ┆ 9.982001 │\n", - "│ 789 ┆ 0.515183 ┆ 0.456997 ┆ 0.009057 ┆ 0.922829 ┆ 9.109619 │\n", - "│ 829 ┆ 0.530049 ┆ 0.505223 ┆ 0.655772 ┆ 0.800784 ┆ 7.142362 │\n", - "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘" + "┌──────┬──────────┬──────────┬──────────┬──────────┬──────────┐\n", + "│ id ┆ var1 ┆ var2 ┆ var3 ┆ r ┆ rh │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n", + "│ 144 ┆ 0.584173 ┆ 0.514794 ┆ 0.289369 ┆ 0.589062 ┆ 9.860071 │\n", + "│ 337 ┆ 0.443469 ┆ 0.440043 ┆ 0.024672 ┆ 0.550996 ┆ 9.993739 │\n", + "│ 574 ┆ 0.555338 ┆ 0.527856 ┆ 0.00325 ┆ 0.836992 ┆ 7.741302 │\n", + "│ 681 ┆ 0.572232 ┆ 0.510764 ┆ 0.569358 ┆ 0.219204 ┆ 9.860488 │\n", + "│ 1054 ┆ 0.483668 ┆ 0.550031 ┆ 0.07829 ┆ 0.337293 ┆ 6.788518 │\n", + "└──────┴──────────┴──────────┴──────────┴──────────┴──────────┘" ] }, "execution_count": 51, @@ -2379,21 +2379,21 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
[old/new HTML table render omitted — shape (5, 3): id (u64), friends (list[u32]), count (u32); the same values appear in the text/plain diff below]
" ], "text/plain": [ "shape: (5, 3)\n", - "┌─────┬──────────────────┬───────┐\n", - "│ id ┆ friends ┆ count │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ u64 ┆ list[u32] ┆ u32 │\n", - "╞═════╪══════════════════╪═══════╡\n", - "│ 0 ┆ [0, 1908] ┆ 2 │\n", - "│ 1 ┆ [1, 13, … 256] ┆ 4 │\n", - "│ 2 ┆ [2, 616, … 1266] ┆ 4 │\n", - "│ 3 ┆ [3, 1247] ┆ 2 │\n", - "│ 4 ┆ [4] ┆ 1 │\n", - "└─────┴──────────────────┴───────┘" + "┌─────┬───────────────────┬───────┐\n", + "│ id ┆ friends ┆ count │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ u64 ┆ list[u32] ┆ u32 │\n", + "╞═════╪═══════════════════╪═══════╡\n", + "│ 0 ┆ [0, 1942, … 1869] ┆ 6 │\n", + "│ 1 ┆ [1, 1576, 154] ┆ 3 │\n", + "│ 2 ┆ [2, 1227] ┆ 2 │\n", + "│ 3 ┆ [3, 1044] ┆ 2 │\n", + "│ 4 ┆ [4, 663, 1248] ┆ 3 │\n", + "└─────┴───────────────────┴───────┘" ] }, "execution_count": 52, @@ -2441,7 +2441,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 4)
[old/new HTML table render omitted — shape (5, 4): actual, predicted, 0-2, 0-9; the same values appear in the text/plain diff below]
" ], "text/plain": [ "shape: (5, 4)\n", @@ -2450,11 +2450,11 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ i32 ┆ i32 │\n", "╞════════╪═══════════╪═════╪═════╡\n", - "│ 1.0 ┆ 0.811084 ┆ 2 ┆ 3 │\n", - "│ 0.0 ┆ 0.803395 ┆ 0 ┆ 1 │\n", - "│ 0.0 ┆ 0.652138 ┆ 2 ┆ 4 │\n", - "│ 1.0 ┆ 0.881974 ┆ 0 ┆ 4 │\n", - "│ 0.0 ┆ 0.717603 ┆ 0 ┆ 2 │\n", + "│ 0.0 ┆ 0.395984 ┆ 1 ┆ 3 │\n", + "│ 1.0 ┆ 0.568538 ┆ 0 ┆ 8 │\n", + "│ 0.0 ┆ 0.47802 ┆ 1 ┆ 6 │\n", + "│ 1.0 ┆ 0.505546 ┆ 0 ┆ 7 │\n", + "│ 0.0 ┆ 0.64596 ┆ 2 ┆ 4 │\n", "└────────┴───────────┴─────┴─────┘" ] }, @@ -2530,17 +2530,17 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 5)
[old/new HTML table render omitted — shape (1, 5): precision, recall, f, average_precision, roc_auc; the same values appear in the text/plain diff below]
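The summary above comes from scoring random labels against random scores, which is why every metric sits near 0.5. The exact notebook cell is not part of this hunk; the sketch below reproduces the flavour of it using only calls confirmed elsewhere in this PR (`pds.random` from `stats.py`, `query_roc_auc` / `query_log_loss` from the README diff). The combined precision/recall/f/average-precision helper that produces the full table is not visible here, so it is not guessed at.

```python
import polars as pl
import polars_ds as pds

# Random labels vs. random scores -> metrics should hover around 0.5, as in the table above.
size = 5_000
df = pl.DataFrame({"row": range(size)}).with_columns(
    # pds.random is assumed to broadcast to the frame height in this context.
    (pds.random(0.0, 1.0, seed=42) > 0.5).cast(pl.Float64).alias("actual"),
    pds.random(0.0, 1.0, seed=7).alias("predicted"),
)

df.select(
    pds.query_roc_auc("actual", "predicted").alias("roc_auc"),
    pds.query_log_loss("actual", "predicted").alias("log_loss"),
)
```

The same expressions also work per segment inside a `group_by(...).agg(...)`.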
" ], "text/plain": [ "shape: (1, 5)\n", - "┌───────────┬─────────┬──────────┬───────────────────┬─────────┐\n", - "│ precision ┆ recall ┆ f ┆ average_precision ┆ roc_auc │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞═══════════╪═════════╪══════════╪═══════════════════╪═════════╡\n", - "│ 0.501208 ┆ 0.50215 ┆ 0.501679 ┆ 0.499845 ┆ 0.5011 │\n", - "└───────────┴─────────┴──────────┴───────────────────┴─────────┘" + "┌───────────┬──────────┬─────────┬───────────────────┬──────────┐\n", + "│ precision ┆ recall ┆ f ┆ average_precision ┆ roc_auc │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════╪══════════╪═════════╪═══════════════════╪══════════╡\n", + "│ 0.497784 ┆ 0.499179 ┆ 0.49848 ┆ 0.498427 ┆ 0.498643 │\n", + "└───────────┴──────────┴─────────┴───────────────────┴──────────┘" ] }, "execution_count": 55, @@ -2571,7 +2571,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 4)
[old/new HTML table render omitted — shape (5, 4): cnt<=, baseline_pct, actual_pct, psi_bin; the same values appear in the text/plain diff below]
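The PSI report above bins a baseline score column and compares the bin frequencies of a new sample against it; the helper that produces it is not visible in this hunk, so its name is not guessed at here. A related two-sample comparison can be written with functions whose new, de-prefixed names do appear in this PR's `stats.py` (column names below are made up for illustration):

```python
import polars as pl
import polars_ds as pds

# Two made-up score columns standing in for "baseline" and "actual".
df = pl.DataFrame({"row": range(2_000)}).with_columns(
    pds.random(0.0, 1.0, seed=1).alias("baseline_score"),
    pds.random(0.0, 1.0, seed=2).alias("new_score"),
)

df.select(
    pds.ks_2samp("baseline_score", "new_score").alias("ks"),                      # two-sample KS test
    pds.ttest_ind("baseline_score", "new_score", equal_var=False).alias("welch"), # Welch's t-test
)
```

`ttest_ind` returns a struct with `statistic` and `pvalue` fields; `ks_2samp` exposes at least a `statistic` field, as the updated tests further down in this diff show.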
" ], "text/plain": [ "shape: (5, 4)\n", @@ -2580,11 +2580,11 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪══════════════╪════════════╪══════════╡\n", - "│ 0.213856 ┆ 0.2 ┆ 0.22 ┆ 0.001906 │\n", - "│ 0.400451 ┆ 0.2 ┆ 0.207 ┆ 0.000241 │\n", - "│ 0.588933 ┆ 0.2 ┆ 0.204 ┆ 0.000079 │\n", - "│ 0.794855 ┆ 0.2 ┆ 0.194 ┆ 0.000183 │\n", - "│ inf ┆ 0.2 ┆ 0.175 ┆ 0.003338 │\n", + "│ 0.204298 ┆ 0.2 ┆ 0.214 ┆ 0.000947 │\n", + "│ 0.388013 ┆ 0.2 ┆ 0.172 ┆ 0.004223 │\n", + "│ 0.579994 ┆ 0.2 ┆ 0.198 ┆ 0.00002 │\n", + "│ 0.765523 ┆ 0.2 ┆ 0.185 ┆ 0.001169 │\n", + "│ inf ┆ 0.2 ┆ 0.231 ┆ 0.004467 │\n", "└──────────┴──────────────┴────────────┴──────────┘" ] }, @@ -2620,7 +2620,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 1)
[old/new HTML table render omitted — shape (1, 1): cid_ce; the same value appears in the text/plain diff below]
" ], "text/plain": [ "shape: (1, 1)\n", @@ -2629,7 +2629,7 @@ "│ --- │\n", "│ f64 │\n", "╞═══════════╡\n", - "│ 12.762909 │\n", + "│ 13.129367 │\n", "└───────────┘" ] }, @@ -2661,7 +2661,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 1)
[old/new HTML table render omitted — shape (1, 1): c3_stats; the same value appears in the text/plain diff below]
" ], "text/plain": [ "shape: (1, 1)\n", @@ -2670,7 +2670,7 @@ "│ --- │\n", "│ f64 │\n", "╞══════════╡\n", - "│ 0.126182 │\n", + "│ 0.123437 │\n", "└──────────┘" ] }, diff --git a/python/polars_ds/__init__.py b/python/polars_ds/__init__.py index 23d32ec3..f9f8bfbb 100644 --- a/python/polars_ds/__init__.py +++ b/python/polars_ds/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations import polars as pl -from .type_alias import str_to_expr +from ._utils import str_to_expr from polars_ds.num import * # noqa: F403 from polars_ds.metrics import * # noqa: F403 diff --git a/python/polars_ds/_utils.py b/python/polars_ds/_utils.py index b49b9699..97460ebc 100644 --- a/python/polars_ds/_utils.py +++ b/python/polars_ds/_utils.py @@ -1,5 +1,7 @@ """Not meant for outside use.""" +from __future__ import annotations + import polars as pl from typing import Any, Optional, List, Dict, Union from pathlib import Path @@ -33,3 +35,21 @@ def pl_plugin( cast_to_supertype=cast_to_supertype, pass_name_to_apply=pass_name_to_apply, ) + + +# Auxiliary functions for type conversions +def str_to_expr(e: str | pl.Expr) -> pl.Expr: + """ + Turns a string into an expression + + Parameters + ---------- + e + Either a str represeting a column name or an expression + """ + if isinstance(e, str): + return pl.col(e) + elif isinstance(e, pl.Expr): + return e + else: + raise ValueError("Input must either be a string or a Polars expression.") diff --git a/python/polars_ds/expr_knn.py b/python/polars_ds/expr_knn.py index b44678cc..7835a2ab 100644 --- a/python/polars_ds/expr_knn.py +++ b/python/polars_ds/expr_knn.py @@ -5,8 +5,8 @@ from __future__ import annotations import polars as pl from typing import Iterable, List -from .type_alias import str_to_expr, Distance -from ._utils import pl_plugin +from .type_alias import Distance +from ._utils import pl_plugin, str_to_expr __all__ = [ "query_knn_ptwise", diff --git a/python/polars_ds/metrics.py b/python/polars_ds/metrics.py index 81783b89..2fc913e7 100644 --- a/python/polars_ds/metrics.py +++ b/python/polars_ds/metrics.py @@ -4,8 +4,8 @@ import polars as pl -from ._utils import pl_plugin -from .type_alias import MultiAUCStrategy, str_to_expr +from ._utils import pl_plugin, str_to_expr +from .type_alias import MultiAUCStrategy __all__ = [ "query_r2", diff --git a/python/polars_ds/num.py b/python/polars_ds/num.py index 7fe6ffff..1ed210af 100644 --- a/python/polars_ds/num.py +++ b/python/polars_ds/num.py @@ -8,9 +8,8 @@ DetrendMethod, ConvMode, ConvMethod, - str_to_expr, ) -from ._utils import pl_plugin +from ._utils import pl_plugin, str_to_expr __all__ = [ "singular_values", @@ -44,9 +43,46 @@ "center", "z_normalize", "isotonic_regression", + "is_increasing", + "is_decreasing", ] +def is_increasing(x: str | pl.Expr, strict: bool = False) -> pl.Expr: + """ + Checks whether the column is monotonically increasing. + + Parameters + ---------- + x + A numerical column + strict + Whether the check should be strict + """ + if strict: + return (str_to_expr(x).diff() > 0.0).all() + else: + return (str_to_expr(x).diff() >= 0.0).all() + + +def is_decreasing(x: str | pl.Expr, strict: bool = False) -> pl.Expr: + """ + Checks whether the column is monotonically decreasing. + + Parameters + ---------- + x + A numerical column + strict + Whether the check should be strict + """ + xx = str_to_expr(x) + if strict: + return (xx.diff() < 0.0).all() + else: + return (xx.diff() <= 0.0).all() + + def center(x: str | pl.Expr) -> pl.Expr: """ Centers the column. 
diff --git a/python/polars_ds/pipeline.py b/python/polars_ds/pipeline.py index ae0472fe..43f8c484 100644 --- a/python/polars_ds/pipeline.py +++ b/python/polars_ds/pipeline.py @@ -18,7 +18,6 @@ FitTransformFunc, SimpleImputeMethod, SimpleScaleMethod, - StrOrExpr, QuantileMethod, EncoderDefaultStrategy, ) @@ -405,7 +404,7 @@ def __str__(self) -> str: out += f"Features Expected: {self.feature_names_in_}\n" return out - def _get_target(self, target: str | pl.Expr | None = None) -> StrOrExpr: + def _get_target(self, target: str | pl.Expr | None = None) -> str | pl.Expr: if target is None: if self.target is None: raise ValueError( diff --git a/python/polars_ds/stats.py b/python/polars_ds/stats.py index 3ad640bd..a3459515 100644 --- a/python/polars_ds/stats.py +++ b/python/polars_ds/stats.py @@ -4,18 +4,17 @@ import polars as pl import math -from .type_alias import Alternative, str_to_expr, CorrMethod, Noise, QuantileMethod -from typing import Union -from ._utils import pl_plugin +from .type_alias import Alternative, CorrMethod, Noise, QuantileMethod +from ._utils import pl_plugin, str_to_expr __all__ = [ - "query_ttest_ind", - "query_ttest_1samp", - "query_ttest_ind_from_stats", - "query_ks_2samp", - "query_f_test", - "query_mann_whitney_u", - "query_chi2", + "ttest_ind", + "ttest_1samp", + "ttest_ind_from_stats", + "ks_2samp", + "f_test", + "mann_whitney_u", + "chi2", "perturb", "jitter", "add_noise", @@ -43,7 +42,7 @@ ] -def query_ttest_ind( +def ttest_ind( var1: str | pl.Expr, var2: str | pl.Expr, alternative: Alternative = "two-sided", @@ -101,7 +100,7 @@ def query_ttest_ind( ) -def query_ttest_1samp( +def ttest_1samp( var1: str | pl.Expr, pop_mean: float, alternative: Alternative = "two-sided" ) -> pl.Expr: """ @@ -131,7 +130,7 @@ def query_ttest_1samp( ) -def query_ttest_ind_from_stats( +def ttest_ind_from_stats( var1: str | pl.Expr, mean: float, var: float, @@ -187,7 +186,7 @@ def query_ttest_ind_from_stats( ) -def query_ks_2samp( +def ks_2samp( var1: str | pl.Expr, var2: str | pl.Expr, alpha: float = 0.05, @@ -237,7 +236,7 @@ def query_ks_2samp( ) -def query_f_test(*variables: str | pl.Expr, group: str | pl.Expr) -> pl.Expr: +def f_test(*variables: str | pl.Expr, group: str | pl.Expr) -> pl.Expr: """ Performs the ANOVA F-test. @@ -258,7 +257,7 @@ def query_f_test(*variables: str | pl.Expr, group: str | pl.Expr) -> pl.Expr: return pl_plugin(symbol="pl_f_test", args=vars_, changes_length=True) -def query_chi2(var1: str | pl.Expr, var2: str | pl.Expr) -> pl.Expr: +def chi2(var1: str | pl.Expr, var2: str | pl.Expr) -> pl.Expr: """ Computes the Chi Squared statistic and p value between two categorical values. 
@@ -280,7 +279,7 @@ def query_chi2(var1: str | pl.Expr, var2: str | pl.Expr) -> pl.Expr: ) -def query_mann_whitney_u( +def mann_whitney_u( var1: str | pl.Expr, var2: str | pl.Expr, alternative: Alternative = "two-sided", @@ -475,8 +474,8 @@ def normal_test(var: str | pl.Expr) -> pl.Expr: def random( - lower: Union[pl.Expr, float] = 0.0, - upper: Union[pl.Expr, float] = 1.0, + lower: pl.Expr | float = 0.0, + upper: pl.Expr | float = 1.0, seed: int | None = None, ) -> pl.Expr: """ @@ -523,9 +522,7 @@ def random_null(var: str | pl.Expr, pct: float, seed: int | None = None) -> pl.E return pl.when(to_null).then(None).otherwise(str_to_expr(var)) -def random_int( - lower: Union[int, pl.Expr], upper: Union[int, pl.Expr], seed: int | None = None -) -> pl.Expr: +def random_int(lower: int | pl.Expr, upper: int | pl.Expr, seed: int | None = None) -> pl.Expr: """ Generates random integer between lower and upper. @@ -631,9 +628,7 @@ def random_exp(lambda_: float, seed: int | None = None) -> pl.Expr: ) -def random_normal( - mean: Union[pl.Expr, float], std: Union[pl.Expr, float], seed: int | None = None -) -> pl.Expr: +def random_normal(mean: pl.Expr | float, std: pl.Expr | float, seed: int | None = None) -> pl.Expr: """ Generates random number following a normal distribution. @@ -756,7 +751,7 @@ def weighted_var(var: str | pl.Expr, weights: str | pl.Expr, freq_weights: bool return summand / w.sum() -def weighted_cov(x: str | pl.Expr, y: str | pl.Expr, weights: Union[pl.Expr, float]) -> pl.Expr: +def weighted_cov(x: str | pl.Expr, y: str | pl.Expr, weights: pl.Expr | float) -> pl.Expr: """ Computes the weighted covariance between x and y. The weights column must have the same length as both x an y. diff --git a/python/polars_ds/string.py b/python/polars_ds/string.py index fba209fb..d43e1451 100644 --- a/python/polars_ds/string.py +++ b/python/polars_ds/string.py @@ -4,8 +4,7 @@ from typing import List, Literal, Dict import polars as pl -from ._utils import pl_plugin -from .type_alias import str_to_expr +from ._utils import pl_plugin, str_to_expr __all__ = [ diff --git a/python/polars_ds/ts_features.py b/python/polars_ds/ts_features.py index 4288ef9f..4f9307dc 100644 --- a/python/polars_ds/ts_features.py +++ b/python/polars_ds/ts_features.py @@ -4,8 +4,8 @@ import math import polars as pl -from .type_alias import str_to_expr, Distance, NullPolicy -from ._utils import pl_plugin +from .type_alias import Distance, NullPolicy +from ._utils import pl_plugin, str_to_expr from typing import Iterable, Literal __all__ = [ diff --git a/python/polars_ds/type_alias.py b/python/polars_ds/type_alias.py index fe4a11ad..a3cd4539 100644 --- a/python/polars_ds/type_alias.py +++ b/python/polars_ds/type_alias.py @@ -29,25 +29,6 @@ # Other Custom Types PolarsFrame: TypeAlias = Union[pl.DataFrame, pl.LazyFrame] -StrOrExpr: TypeAlias = Union[str, pl.Expr] ExprTransform: TypeAlias = Union[pl.Expr, List[pl.Expr]] # Need ... 
FitTransformFunc: TypeAlias = Callable[[PolarsFrame, List[str]], ExprTransform] - - -# Auxiliary functions for type conversions -def str_to_expr(e: StrOrExpr) -> pl.Expr: - """ - Turns a string into an expression - - Parameters - ---------- - e - Either a str represeting a column name or an expression - """ - if isinstance(e, str): - return pl.col(e) - elif isinstance(e, pl.Expr): - return e - else: - raise ValueError("Input must either be a string or a Polars expression.") diff --git a/tests/test_many.py b/tests/test_many.py index 71fe5163..e37446af 100644 --- a/tests/test_many.py +++ b/tests/test_many.py @@ -279,7 +279,7 @@ def test_fft(arr, n): def test_f_test(df): from sklearn.feature_selection import f_classif - res = df.select(pds.query_f_test(pl.col("a"), group=pl.col("target"))) + res = df.select(pds.f_test(pl.col("a"), group=pl.col("target"))) res = res.item(0, 0) # A dictionary statistic = res["statistic"] pvalue = res["pvalue"] @@ -308,7 +308,7 @@ def test_f_test(df): def test_mann_whitney_u(df): from scipy.stats import mannwhitneyu - res = df.select(pds.query_mann_whitney_u("x1", "x2")) + res = df.select(pds.mann_whitney_u("x1", "x2")) res = res.item(0, 0) # A dictionary res_statistic = res["statistic"] res_pvalue = res["pvalue"] @@ -1145,7 +1145,7 @@ def test_ks_stats(): stats = ks_2samp(a, b).statistic # Only statistic for now - res = df.select(pds.query_ks_2samp("a", "b").struct.field("statistic")).item(0, 0) + res = df.select(pds.ks_2samp("a", "b").struct.field("statistic")).item(0, 0) assert np.isclose(stats, res) @@ -1187,7 +1187,7 @@ def test_knn_entropy(df, k, dist, res): def test_ttest_ind(df, eq_var): from scipy.stats import ttest_ind - res = df.select(pds.query_ttest_ind("a", "b", equal_var=eq_var)) + res = df.select(pds.ttest_ind("a", "b", equal_var=eq_var)) res = res.item(0, 0) # A dictionary statistic = res["statistic"] pvalue = res["pvalue"] @@ -1214,7 +1214,7 @@ def test_ttest_ind(df, eq_var): def test_welch_t(df): from scipy.stats import ttest_ind - res = df.select(pds.query_ttest_ind("a", "b", equal_var=False)) + res = df.select(pds.ttest_ind("a", "b", equal_var=False)) res = res.item(0, 0) # A dictionary statistic = res["statistic"] pvalue = res["pvalue"] @@ -1244,7 +1244,7 @@ def test_chi2(df): import pandas as pd from scipy.stats import chi2_contingency - res = df.select(pds.query_chi2("x", "y")).item(0, 0) + res = df.select(pds.chi2("x", "y")).item(0, 0) stats, p = res["statistic"], res["pvalue"] df2 = df.to_pandas()