diff --git a/README.md b/README.md index 889fcf22..ebda8ebe 100644 --- a/README.md +++ b/README.md @@ -15,16 +15,22 @@ pip install polars-ds
-# The Project +# PDS (polars_ds) -PDS is a modern take on data science and traditional tabular machine learning. It is dataframe-centric in design, and provides parallelism for free via **Polars**. It offers Polars syntax that works both in normal and aggregation contexts, and provides these conveniences to the end user without any additional dependency. It includes the most common functions from NumPy, SciPy, edit distances, KNN-related queries, EDA tools, feature engineering queries, etc. Yes, it only depends on Polars (unless you want to use the plotting functionalities and want to interop with NumPy). Most of the code is rewritten in **Rust** and is on par or even faster than existing functions in SciPy and Scikit-learn. The following are some examples: +PDS is a modern data science package that -Parallel evaluations of classification metrics on segments +1. is fast and furious +2. is small and lean, with minimal dependencies +3. has an intuitive and concise API (if you know Polars already) +4. has a dataframe-friendly design +5. and covers a wide variety of data science topics, such as simple statistics, linear regression, string edit distances, tabular data transforms, feature extraction, traditional modelling pipelines, model evaluation metrics, etc. + +It stands on the shoulders of the great **Polars** dataframe. You can see [examples](./examples/basics.ipynb). Here are some highlights! 
```python import polars as pl import polars_ds as pds - +# Parallel evaluation of multiple ML metrics on different segments of data df.lazy().group_by("segments").agg( pds.query_roc_auc("actual", "predicted").alias("roc_auc"), pds.query_log_loss("actual", "predicted").alias("log_loss"), @@ -41,6 +47,43 @@ shape: (2, 3) └──────────┴──────────┴──────────┘ ``` +Tabular Machine Learning Data Transformation Pipeline + +```Python +import polars as pl +import polars.selectors as cs +from polars_ds.pipeline import Pipeline, Blueprint + +bp = ( + # If we specify a target, then target will be excluded from any transformations. + Blueprint(df, name = "example", target = "approved") + .lowercase() # lowercase all columns + .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"])) + .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") + .impute(["existing_emi"], method = "median") + .append_expr( # generate some features + pl.col("existing_emi").log1p().alias("existing_emi_log1p"), + pl.col("loan_amount").log1p().alias("loan_amount_log1p"), + pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"), + pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1") # any kind of lag transform + ) + .scale( + cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard" + ) # Scale the columns up to this point. The columns below won't be scaled + .append_expr( # Add missing flags + pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing") + ) + .one_hot_encode("gender", drop_first=True) + .woe_encode("city_category") + .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # same as above +) + +pipe:Pipeline = bp.materialize() +# Check out the result in our example notebooks! 
(examples/pipeline.ipynb) +df_transformed = pipe.transform(df) +df_transformed.head() +``` + Get all neighbors within radius r, call them best friends, and count the number ```python @@ -71,7 +114,7 @@ shape: (5, 3) └─────┴───────────────────┴────────────────────┘ ``` -Ridge Regression on Categories +Run a linear regression on each category: ```Python @@ -120,9 +163,9 @@ In-dataframe statistical tests ```Python df.group_by("market_id").agg( - pds.query_ttest_ind("var1", "var2", equal_var=False).alias("t-test"), - pds.query_chi2("category_1", "category_2").alias("chi2-test"), - pds.query_f_test("var1", group = "category_1").alias("f-test") + pds.ttest_ind("var1", "var2", equal_var=False).alias("t-test"), + pds.chi2("category_1", "category_2").alias("chi2-test"), + pds.f_test("var1", group = "category_1").alias("f-test") ) shape: (3, 4) @@ -151,46 +194,6 @@ df.select( ).head() ``` -Tabular Machine Learning Data Transformation Pipeline - -```Python -import polars as pl -import polars.selectors as cs -from polars_ds.pipeline import Pipeline, Blueprint - -bp = ( - # If we specify a target, then target will be excluded from any transformations. - Blueprint(df, name = "example", target = "approved") - .lowercase() # lowercase all columns - .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"])) - # Impute loan_period by running a simple linear regression. - # Explicitly put target, since this is not the target for prediction. 
- .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") - .impute(["existing_emi"], method = "median") - .append_expr( # generate some features - pl.col("existing_emi").log1p().alias("existing_emi_log1p"), - pl.col("loan_amount").log1p().alias("loan_amount_log1p"), - pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"), - pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1") # any kind of lag transform - ) - .scale( # target is numerical, but will be excluded automatically because bp is initialzied with a target - cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard" - ) # Scale the columns up to this point. The columns below won't be scaled - .append_expr( - # Add missing flags - pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing") - ) - .one_hot_encode("gender", drop_first=True) - .woe_encode("city_category") # No need to specify target because we initialized bp with a target - .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # same as above -) - -pipe:Pipeline = bp.materialize() -# Check out the result in our example notebooks! -df_transformed = pipe.transform(df) -df_transformed.head() -``` - And more! ## Getting Started @@ -205,11 +208,15 @@ To make full use of the Diagnosis module, do pip install "polars_ds[plot]" ``` -## More Examples +## How Fast is it? + +Feel free to take a look at our [benchmark notebook](./benchmarks/benchmarks.ipynb)! + +Generally speaking, the more expressions you want to evaluate simultaneously, the faster Polars + PDS will be than Pandas + (SciPy / Sklearn / NumPy). The more CPU cores you have on your machine, the bigger the time difference will be in favor of Polars + PDS. -See this for Polars Extensions: [notebook](./examples/basics.ipynb) +Why does speed matter? 
-See this for Native Polars DataFrame Explorative tools: [notebook](./examples/diagnosis.ipynb) +If your code already executes under 1s, then maybe it doesn't. But as your data grow, having a 5s run vs. a 1s run will make a lot of difference in your iterations for your project. Speed of execution becomes a bigger issue if you are building reports on demand, or if you need to pay extra for additional compute. ## HELP WANTED! @@ -217,9 +224,8 @@ See this for Native Polars DataFrame Explorative tools: [notebook](./examples/di ## Road Map -1. Standalone KNN and linear regression module. -2. K-means, K-medoids clustering as expressions and also standalone modules. -3. Other. +1. K-means, K-medoids clustering as expressions and also standalone modules. +2. Other improvement items. See issues. # Disclaimer @@ -232,8 +238,8 @@ This package is not tested with Polars streaming mode and is not designed to wor 1. Rust Snowball Stemmer is taken from Tsoding's Seroost project (MIT). See [here](https://github.com/tsoding/seroost) 2. Some statistics functions are taken from Statrs (MIT) and internalized. See [here](https://github.com/statrs-dev/statrs/tree/master) 3. Linear algebra routines are powered partly by [faer](https://crates.io/crates/faer) +4. String similarity metrics are soooo fast because of [RapidFuzz](https://github.com/maxbachmann/rapidfuzz-rs) # Other related Projects -1. Take a look at our friendly neighbor [functime](https://github.com/TracecatHQ/functime) -2. String similarity metrics is soooo fast and easy to use because of [RapidFuzz](https://github.com/maxbachmann/rapidfuzz-rs) \ No newline at end of file +1. 
Take a look at our friendly neighbor [functime](https://github.com/TracecatHQ/functime) \ No newline at end of file diff --git a/benchmarks/benchmarks.ipynb b/benchmarks/benchmarks.ipynb new file mode 100644 index 00000000..383f4891 --- /dev/null +++ b/benchmarks/benchmarks.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook Used to Generate Benchmark Results " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "import polars_ds as pds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parallel ML metrics evaluations on segments \n", + "\n", + "Use cases:\n", + "\n", + "1. Evaluate ML model performance in market A, B, C.\n", + "2. The Dataframe contains a column that defines the \"split\" of the dataframe. Then this can simultaneously evaluate ML model's performances on each of the train, test, recent, or any other split you have.\n", + "3. Evaluate ML model performance over time, e.g. weekly / monthly \n", + "\n", + "Comparison: \n", + "\n", + "Polars + PDS vs Pandas + Sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate a \n", + "from datetime import date\n", + "\n", + "dates = pl.date_range(date(2020, 1, 1), date(2024, 10, 1), \"1d\", eager=True)\n", + "df = pds.frame(size=len(dates)).select(\n", + " pds.random().alias(\"predicted\"),\n", + " (pds.random() > 0.25).cast(pl.UInt8).alias(\"actual_target\"),\n", + " dates = dates,\n", + ")\n", + "df_pd = df.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " | predicted | \n", + "actual_target | \n", + "dates | \n", + "
---|---|---|---|
0 | \n", + "0.413767 | \n", + "1 | \n", + "2020-01-01 | \n", + "
1 | \n", + "0.125783 | \n", + "1 | \n", + "2020-01-02 | \n", + "
2 | \n", + "0.382943 | \n", + "1 | \n", + "2020-01-03 | \n", + "
3 | \n", + "0.690455 | \n", + "0 | \n", + "2020-01-04 | \n", + "
4 | \n", + "0.492488 | \n", + "0 | \n", + "2020-01-05 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "
1731 | \n", + "0.365318 | \n", + "1 | \n", + "2024-09-27 | \n", + "
1732 | \n", + "0.635105 | \n", + "1 | \n", + "2024-09-28 | \n", + "
1733 | \n", + "0.156054 | \n", + "1 | \n", + "2024-09-29 | \n", + "
1734 | \n", + "0.736704 | \n", + "1 | \n", + "2024-09-30 | \n", + "
1735 | \n", + "0.660525 | \n", + "1 | \n", + "2024-10-01 | \n", + "
1736 rows × 3 columns
\n", + "f | time_idx | dummy | actual | predicted | dummy_groups | x1 | x2 | x3 | a | b | y |
---|---|---|---|---|---|---|---|---|---|---|---|
f64 | i64 | str | i32 | f64 | str | f64 | f64 | f64 | f64 | f64 | f64 |
0.0 | 0 | "a" | 1 | 0.373234 | "a" | 0.550106 | 0.663055 | 0.649428 | 0.052451 | 0.331711 | -0.69266 |
0.841471 | 1 | "a" | 1 | 0.807054 | "a" | 0.004639 | 0.888284 | 0.161127 | 0.007514 | 0.500707 | 0.025568 |
0.909297 | 2 | "a" | 0 | 0.672976 | "a" | 0.275145 | 0.2929 | 0.751339 | 0.791996 | 0.645909 | -0.997863 |
0.14112 | 3 | "a" | 0 | 0.805052 | "a" | 0.556564 | 0.873322 | 0.319529 | 0.566644 | 0.960852 | -0.133722 |
-0.756802 | 4 | "a" | 0 | 0.153986 | "a" | 0.290904 | 0.655452 | 0.170709 | 0.9449 | 0.640427 | -0.01572 |
f | time_idx | dummy | actual | predicted | dummy_groups | x1 | x2 | x3 | a | b | y |
---|---|---|---|---|---|---|---|---|---|---|---|
f64 | i64 | str | i32 | f64 | str | f64 | f64 | f64 | f64 | f64 | f64 |
0.0 | 0 | "a" | 1 | 0.651933 | "a" | 0.972564 | 0.299432 | 0.600571 | 0.901676 | 0.481886 | -0.665053 |
0.841471 | 1 | "a" | 1 | 0.599058 | "a" | 0.055968 | 0.547583 | 0.448908 | 0.795043 | 0.100833 | -0.500674 |
0.909297 | 2 | "a" | 1 | 0.264728 | "a" | 0.866758 | 0.7382 | 0.246148 | 0.916253 | 0.868888 | -0.017706 |
0.14112 | 3 | "a" | 1 | 0.599733 | "a" | 0.154184 | 0.730104 | 0.406564 | 0.012962 | 0.736355 | -0.367598 |
-0.756802 | 4 | "a" | 1 | 0.292312 | "a" | 0.261095 | 0.963928 | 0.392611 | 0.982307 | 0.879509 | -0.260554 |
f | a | b |
---|---|---|
f64 | f64 | f64 |
1.3944e-15 | -0.052451 | -0.331711 |
-0.841471 | -0.007514 | -0.500707 |
-0.909297 | -0.791996 | -0.645909 |
-0.14112 | -0.566644 | -0.960852 |
0.756802 | -0.892449 | -0.308716 |
f | a | b |
---|---|---|
f64 | f64 | f64 |
1.3944e-15 | -0.901676 | -0.481886 |
-0.841471 | -0.795043 | -0.100833 |
-0.909297 | -0.916253 | -0.868888 |
-0.14112 | -0.012962 | -0.736355 |
0.756802 | -0.08063 | -0.397623 |
lstsq_coeffs |
---|
list[f64] |
[-0.484661, -0.352412] |
lstsq_coeffs |
---|
list[f64] |
[-0.506473, -0.335296] |
features | beta | std_err | t | p>|t| | 0.025 | 0.975 |
---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | f64 |
"ln(x1+1)" | 0.218958 | 0.001674 | 130.762753 | 0.0 | 0.215675 | 0.22224 |
"exp(x2)" | 0.174519 | 0.000674 | 258.959252 | 0.0 | 0.173198 | 0.17584 |
"sin(x3)" | -1.742767 | 0.001337 | -1303.028169 | 0.0 | -1.745389 | -1.740145 |
"__bias__" | -0.107859 | 0.001491 | -72.322136 | 0.0 | -0.110782 | -0.104935 |
features | beta | std_err | t | p>|t| | 0.025 | 0.975 |
---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | f64 |
"ln(x1+1)" | 0.217816 | 0.001697 | 128.34361 | 0.0 | 0.21449 | 0.221143 |
"exp(x2)" | 0.175203 | 0.00068 | 257.743539 | 0.0 | 0.17387 | 0.176535 |
"sin(x3)" | -1.745458 | 0.00135 | -1292.910981 | 0.0 | -1.748104 | -1.742812 |
"__bias__" | -0.107841 | 0.001514 | -71.209947 | 0.0 | -0.110809 | -0.104872 |
lstsq_coeffs |
---|
list[f64] |
[-0.484661, -0.352412] |
lstsq_coeffs |
---|
list[f64] |
[-0.506473, -0.335296] |
dummy | lstsq_coeffs |
---|---|
str | list[f64] |
"a" | [-0.46771, -0.37172] |
"a" | [-0.46771, -0.37172] |
"a" | [-0.46771, -0.37172] |
"a" | [-0.46771, -0.37172] |
"a" | [-0.46771, -0.37172] |
… | … |
"b" | [-0.501249, -0.33307] |
"b" | [-0.501249, -0.33307] |
"b" | [-0.501249, -0.33307] |
"b" | [-0.501249, -0.33307] |
"b" | [-0.501249, -0.33307] |
dummy | lstsq_coeffs |
---|---|
str | list[f64] |
"a" | [-0.49053, -0.350722] |
"a" | [-0.49053, -0.350722] |
"a" | [-0.49053, -0.350722] |
"a" | [-0.49053, -0.350722] |
"a" | [-0.49053, -0.350722] |
… | … |
"b" | [-0.522734, -0.319316] |
"b" | [-0.522734, -0.319316] |
"b" | [-0.522734, -0.319316] |
"b" | [-0.522734, -0.319316] |
"b" | [-0.522734, -0.319316] |
x1 | x2 | y | pred | resid |
---|---|---|---|---|
f64 | f64 | f64 | f64 | f64 |
0.550106 | 0.663055 | -0.69266 | -0.500284 | -0.192377 |
0.004639 | 0.888284 | 0.025568 | -0.315291 | 0.340859 |
0.275145 | 0.2929 | -0.997863 | -0.236574 | -0.761289 |
0.556564 | 0.873322 | -0.133722 | -0.577515 | 0.443793 |
0.290904 | 0.655452 | -0.01572 | -0.371979 | 0.356259 |
x1 | x2 | y | pred | resid |
---|---|---|---|---|
f64 | f64 | f64 | f64 | f64 |
0.972564 | 0.299432 | -0.665053 | -0.592975 | -0.072078 |
0.055968 | 0.547583 | -0.500674 | -0.211949 | -0.288725 |
0.866758 | 0.7382 | -0.017706 | -0.686504 | 0.668798 |
0.154184 | 0.730104 | -0.367598 | -0.322891 | -0.044707 |
0.261095 | 0.963928 | -0.260554 | -0.455438 | 0.194884 |
dummy | lstsq_coeffs |
---|---|
str | list[f64] |
"a" | [-0.46771, -0.37172] |
"b" | [-0.501249, -0.33307] |
dummy | lstsq_coeffs |
---|---|
str | list[f64] |
"a" | [-0.49053, -0.350722] |
"b" | [-0.522734, -0.319316] |
dummy | lstsq_coeffs |
---|---|
str | list[f64] |
"a" | [-0.298076, -0.20613] |
"b" | [-0.334865, -0.150524] |
dummy | lstsq_coeffs |
---|---|
str | list[f64] |
"b" | [-0.352896, -0.148746] |
"a" | [-0.310077, -0.187359] |
dummy | lasso_r2 |
---|---|
str | f64 |
"b" | -0.533325 |
"a" | -0.537092 |
dummy | lasso_r2 |
---|---|
str | f64 |
"b" | -0.530366 |
"a" | -0.547831 |
y | x1 | x2 | coeffs | pred |
---|---|---|---|---|
f64 | f64 | f64 | list[f64] | f64 |
-0.69266 | 0.550106 | 0.663055 | null | null |
0.025568 | 0.004639 | 0.888284 | null | null |
-0.997863 | 0.275145 | 0.2929 | null | null |
-0.133722 | 0.556564 | 0.873322 | null | null |
-0.01572 | 0.290904 | 0.655452 | [-1.311733, 0.247865] | -0.219125 |
… | … | … | … | … |
-1.080843 | 0.109552 | 0.899619 | [-0.532829, -0.86932] | -0.840429 |
-0.844909 | 0.231257 | 0.687855 | [0.438977, -1.353505] | -0.829498 |
0.12167 | 0.608952 | 0.322586 | [0.606546, -1.421551] | -0.089216 |
-1.100991 | 0.194416 | 0.953846 | [0.75856, -1.392892] | -1.181129 |
-0.107373 | 0.349428 | 0.481579 | [1.019164, -1.360391] | -0.299012 |
y | x1 | x2 | coeffs | pred |
---|---|---|---|---|
f64 | f64 | f64 | list[f64] | f64 |
-0.665053 | 0.972564 | 0.299432 | null | null |
-0.500674 | 0.055968 | 0.547583 | null | null |
-0.017706 | 0.866758 | 0.7382 | null | null |
-0.367598 | 0.154184 | 0.730104 | null | null |
-0.260554 | 0.261095 | 0.963928 | [-0.244051, -0.284693] | -0.338143 |
… | … | … | … | … |
-1.092441 | 0.326869 | 0.779862 | [-1.209222, -0.168168] | -0.526405 |
-0.53314 | 0.722438 | 0.79217 | [-0.959132, -0.162519] | -0.821656 |
-1.34107 | 0.13505 | 0.312516 | [0.412189, -0.97953] | -0.250452 |
-0.183582 | 0.24526 | 0.816613 | [-0.189443, -0.693278] | -0.612602 |
-0.575389 | 0.69583 | 0.454125 | [-0.146486, -0.86184] | -0.493313 |
a |
---|
list[f64] |
[29.067812, 28.980247, 28.665133] |
a |
---|
list[f64] |
[29.176906, 28.875477, 28.619583] |
singular_value | weight_vector |
---|---|
f64 | list[f64] |
29.024322 | [0.709965, 0.704237] |
28.897161 | [-0.704237, 0.709965] |
singular_value | weight_vector |
---|---|
f64 | list[f64] |
29.139846 | [0.981344, -0.192261] |
28.771057 | [0.192261, 0.981344] |
pc1 |
---|
f64 |
-0.439477 |
-0.352367 |
0.306843 |
0.368646 |
0.411539 |
pc1 |
---|
f64 |
0.39868 |
0.367297 |
0.33858 |
-0.522379 |
0.401359 |
dummy_groups | l2 | log loss | precision | recall | f | average_precision | roc_auc |
---|---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
"b" | 0.333885 | 1.001143 | 0.504393 | 0.504796 | 0.504594 | 0.504159 | 0.501021 |
"a" | 0.333207 | 0.989803 | 0.492848 | 0.486879 | 0.489846 | 0.502603 | 0.500806 |
dummy_groups | l2 | log loss | precision | recall | f | average_precision | roc_auc |
---|---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
"b" | 0.335309 | 1.000473 | 0.513557 | 0.493774 | 0.503472 | 0.516459 | 0.499138 |
"a" | 0.335421 | 1.001562 | 0.496063 | 0.477083 | 0.486388 | 0.499139 | 0.495529 |
sen |
---|
str |
"church" |
"going" |
"hello" |
"world" |
"to" |
sen |
---|
str |
"hello" |
"church" |
"going" |
"world" |
"to" |
sen |
---|
str |
"hello" |
"world" |
"go" |
"" |
"church" |
sen |
---|
str |
"go" |
"hello" |
"" |
"church" |
"world" |
a |
---|
f64 |
null |
null |
2.764739 |
-1.3346 |
1.103795 |
a |
---|
f64 |
null |
null |
1.637908 |
0.39391 |
1.135519 |
a | random_normal | random_normal_that_respects_null_of_a |
---|---|---|
f64 | f64 | f64 |
null | -0.287753 | null |
null | 1.137891 | null |
2.764739 | 2.052795 | -1.09632 |
-1.3346 | -1.83142 | -0.259511 |
1.103795 | 1.349615 | 1.692965 |
a | random_normal | random_normal_that_respects_null_of_a |
---|---|---|
f64 | f64 | f64 |
null | 0.984349 | null |
null | 1.268442 | null |
1.637908 | 1.829465 | 1.125404 |
0.39391 | 0.258554 | 1.03578 |
1.135519 | -0.266687 | 1.180521 |
a | random_str | random_str_that_respects_null_of_a |
---|---|---|
f64 | str | str |
null | "NMw" | null |
null | "3vbZ" | null |
2.764739 | "h8So9" | "TvB9a" |
-1.3346 | "hw" | "TD4" |
1.103795 | "2Znps" | "V" |
a | random_str | random_str_that_respects_null_of_a |
---|---|---|
f64 | str | str |
null | "78" | null |
null | "TJsSs" | null |
1.637908 | "ZCXwr" | "R" |
0.39391 | "iXS" | "rMIb" |
1.135519 | "1g" | "Z" |
a | random_str |
---|---|
f64 | str |
null | null |
null | null |
2.764739 | "BvTXp" |
-1.3346 | "x0cZn" |
1.103795 | "xih81" |
a | random_str |
---|---|
f64 | str |
null | null |
null | null |
1.637908 | "HwXg9" |
0.39391 | "bwxYr" |
1.135519 | "KBLag" |
a | test1 | literal | test1_perturbed |
---|---|---|---|
f64 | f64 | f64 | f64 |
null | 1.343124 | null | 1.342837 |
null | 0.207816 | null | 0.208056 |
2.764739 | 0.366128 | 2.381908 | 0.36599 |
-1.3346 | 1.766008 | 1.657859 | 1.766105 |
1.103795 | -0.424022 | 1.071486 | -0.424504 |
a | test1 | literal | test1_perturbed |
---|---|---|---|
f64 | f64 | f64 | f64 |
null | -0.999841 | null | -1.00034 |
null | -0.014981 | null | -0.014588 |
1.637908 | -0.045308 | 0.906578 | -0.044914 |
0.39391 | -0.498435 | 1.22988 | -0.498401 |
1.135519 | -1.351206 | 1.429987 | -1.351569 |
a | [0, 1) | Normal | Int from [0, 10) |
---|---|---|---|
f64 | f64 | f64 | i32 |
null | 0.758316 | 0.20244 | 7 |
null | 0.491458 | -0.725606 | 8 |
2.764739 | 0.191777 | 1.035372 | 5 |
-1.3346 | 0.08989 | -1.225623 | 3 |
1.103795 | 0.891758 | 1.528385 | 5 |
a | [0, 1) | Normal | Int from [0, 10) |
---|---|---|---|
f64 | f64 | f64 | i32 |
null | 0.355994 | -0.03833 | 3 |
null | 0.802205 | 0.675571 | 3 |
1.637908 | 0.937455 | -0.423257 | 7 |
0.39391 | 0.909049 | 0.419791 | 6 |
1.135519 | 0.760001 | 2.551108 | 7 |
t-tests: statistics | t-tests: pvalue | normality_test: statistics | normality_test: pvalue |
---|---|---|---|
f64 | f64 | f64 | f64 |
0.657304 | 0.511087 | 0.360216 | 0.83518 |
t-tests: statistics | t-tests: pvalue | normality_test: statistics | normality_test: pvalue |
---|---|---|---|
f64 | f64 | f64 | f64 |
0.323934 | 0.746034 | 11.004412 | 0.004078 |
market_id | var1 | var2 | category_1 | category_2 |
---|---|---|---|---|
i64 | f64 | f64 | i32 | i32 |
0 | 0.095526 | 0.219794 | 4 | 8 |
1 | 0.502957 | 0.127986 | 1 | 5 |
2 | 0.029582 | 0.808177 | 1 | 0 |
0 | 0.424024 | 0.39014 | 1 | 6 |
1 | 0.061654 | 0.3925 | 4 | 8 |
market_id | var1 | var2 | category_1 | category_2 |
---|---|---|---|---|
i64 | f64 | f64 | i32 | i32 |
0 | 0.553279 | 0.872152 | 0 | 1 |
1 | 0.899939 | 0.271614 | 0 | 5 |
2 | 0.87596 | 0.827022 | 4 | 1 |
0 | 0.069782 | 0.647962 | 1 | 7 |
1 | 0.5716 | 0.357665 | 2 | 4 |
t-test | chi2-test | f-test |
---|---|---|
struct[2] | struct[2] | struct[2] |
{-0.584836,0.558671} | {41.626592,0.239132} | {0.368606,0.831148} |
t-test | chi2-test | f-test |
---|---|---|
struct[2] | struct[2] | struct[2] |
{0.331295,0.740429} | {41.700303,0.236697} | {0.667292,0.614652} |
first_digit_cnt | first_digit_distribution |
---|---|
u32 | f64 |
553 | 0.1106 |
588 | 0.1176 |
549 | 0.1098 |
593 | 0.1186 |
563 | 0.1126 |
591 | 0.1182 |
517 | 0.1034 |
535 | 0.107 |
511 | 0.1022 |
first_digit_cnt | first_digit_distribution |
---|---|
u32 | f64 |
544 | 0.1088 |
538 | 0.1076 |
546 | 0.1092 |
603 | 0.1206 |
537 | 0.1074 |
544 | 0.1088 |
555 | 0.111 |
577 | 0.1154 |
556 | 0.1112 |
id | var1 | var2 | var3 | r | rh | nb_l_inf_cnt |
---|---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 | u32 |
0 | 0.984493 | 0.574152 | 0.033131 | 0.828606 | 5.0614 | 7 |
1 | 0.187497 | 0.823286 | 0.687118 | 0.373936 | 7.496395 | 16 |
2 | 0.384569 | 0.802113 | 0.369732 | 0.386991 | 5.979647 | 23 |
3 | 0.243692 | 0.179802 | 0.438691 | 0.41351 | 2.614651 | 21 |
4 | 0.762425 | 0.930085 | 0.375169 | 0.248961 | 3.33307 | 12 |
id | var1 | var2 | var3 | r | rh | nb_l_inf_cnt |
---|---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 | u32 |
0 | 0.604373 | 0.167666 | 0.163173 | 0.364641 | 1.933242 | 18 |
1 | 0.039875 | 0.707229 | 0.940102 | 0.722232 | 8.10753 | 8 |
2 | 0.708146 | 0.665207 | 0.41272 | 0.196922 | 1.800771 | 15 |
3 | 0.073332 | 0.356413 | 0.656907 | 0.791403 | 3.913569 | 18 |
4 | 0.452309 | 0.788815 | 0.783661 | 0.134379 | 8.155852 | 16 |
id | var1 | var2 | var3 | r | rh | nb_l1_r_cnt |
---|---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 | u32 |
0 | 0.984493 | 0.574152 | 0.033131 | 0.828606 | 5.0614 | 398 |
1 | 0.187497 | 0.823286 | 0.687118 | 0.373936 | 7.496395 | 138 |
2 | 0.384569 | 0.802113 | 0.369732 | 0.386991 | 5.979647 | 152 |
3 | 0.243692 | 0.179802 | 0.438691 | 0.41351 | 2.614651 | 146 |
4 | 0.762425 | 0.930085 | 0.375169 | 0.248961 | 3.33307 | 33 |
id | var1 | var2 | var3 | r | rh | nb_l1_r_cnt |
---|---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 | u32 |
0 | 0.604373 | 0.167666 | 0.163173 | 0.364641 | 1.933242 | 104 |
1 | 0.039875 | 0.707229 | 0.940102 | 0.722232 | 8.10753 | 285 |
2 | 0.708146 | 0.665207 | 0.41272 | 0.196922 | 1.800771 | 19 |
3 | 0.073332 | 0.356413 | 0.656907 | 0.791403 | 3.913569 | 627 |
4 | 0.452309 | 0.788815 | 0.783661 | 0.134379 | 8.155852 | 4 |
id | var1 | var2 | var3 | r | rh | best friends |
---|---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 | list[u32] |
0 | 0.984493 | 0.574152 | 0.033131 | 0.828606 | 5.0614 | [0, 359, … 1993] |
1 | 0.187497 | 0.823286 | 0.687118 | 0.373936 | 7.496395 | [1, 285, … 583] |
2 | 0.384569 | 0.802113 | 0.369732 | 0.386991 | 5.979647 | [2, 1907, … 15] |
3 | 0.243692 | 0.179802 | 0.438691 | 0.41351 | 2.614651 | [3, 1247, … 1616] |
4 | 0.762425 | 0.930085 | 0.375169 | 0.248961 | 3.33307 | [4, 840, … 207] |
id | var1 | var2 | var3 | r | rh | best friends |
---|---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 | list[u32] |
0 | 0.604373 | 0.167666 | 0.163173 | 0.364641 | 1.933242 | [0, 802, … 1942] |
1 | 0.039875 | 0.707229 | 0.940102 | 0.722232 | 8.10753 | [1, 998, … 1858] |
2 | 0.708146 | 0.665207 | 0.41272 | 0.196922 | 1.800771 | [2, 1049, … 1348] |
3 | 0.073332 | 0.356413 | 0.656907 | 0.791403 | 3.913569 | [3, 1028, … 676] |
4 | 0.452309 | 0.788815 | 0.783661 | 0.134379 | 8.155852 | [4, 1518, … 1587] |
id | var1 | var2 | var3 | r | rh | idx | dist |
---|---|---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 | list[u32] | list[f64] |
0 | 0.984493 | 0.574152 | 0.033131 | 0.828606 | 5.0614 | [0, 359, … 1993] | [0.0, 0.075875, … 0.084589] |
1 | 0.187497 | 0.823286 | 0.687118 | 0.373936 | 7.496395 | [1, 285, … 583] | [0.0, 0.066303, … 0.074323] |
2 | 0.384569 | 0.802113 | 0.369732 | 0.386991 | 5.979647 | [2, 1907, … 15] | [0.0, 0.034667, … 0.052971] |
3 | 0.243692 | 0.179802 | 0.438691 | 0.41351 | 2.614651 | [3, 1247, … 1616] | [0.0, 0.067122, … 0.069608] |
4 | 0.762425 | 0.930085 | 0.375169 | 0.248961 | 3.33307 | [4, 840, … 207] | [0.0, 0.082641, … 0.098739] |
id | var1 | var2 | var3 | r | rh | idx | dist |
---|---|---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 | list[u32] | list[f64] |
0 | 0.604373 | 0.167666 | 0.163173 | 0.364641 | 1.933242 | [0, 802, … 1942] | [0.0, 0.029826, … 0.043563] |
1 | 0.039875 | 0.707229 | 0.940102 | 0.722232 | 8.10753 | [1, 998, … 1858] | [0.0, 0.044368, … 0.074784] |
2 | 0.708146 | 0.665207 | 0.41272 | 0.196922 | 1.800771 | [2, 1049, … 1348] | [0.0, 0.040579, … 0.047743] |
3 | 0.073332 | 0.356413 | 0.656907 | 0.791403 | 3.913569 | [3, 1028, … 676] | [0.0, 0.071795, … 0.077792] |
4 | 0.452309 | 0.788815 | 0.783661 | 0.134379 | 8.155852 | [4, 1518, … 1587] | [0.0, 0.035496, … 0.081266] |
id | var1 | var2 | var3 | r | rh |
---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 |
2 | 0.384569 | 0.802113 | 0.369732 | 0.386991 | 5.979647 |
3 | 0.243692 | 0.179802 | 0.438691 | 0.41351 | 2.614651 |
9 | 0.155314 | 0.743213 | 0.572871 | 0.493723 | 5.70132 |
14 | 0.438281 | 0.466449 | 0.432785 | 0.841271 | 7.552722 |
15 | 0.419596 | 0.821244 | 0.334902 | 0.694541 | 9.895788 |
id | var1 | var2 | var3 | r | rh |
---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 |
2 | 0.708146 | 0.665207 | 0.41272 | 0.196922 | 1.800771 |
4 | 0.452309 | 0.788815 | 0.783661 | 0.134379 | 8.155852 |
8 | 0.531335 | 0.710672 | 0.146274 | 0.031667 | 2.591195 |
11 | 0.309524 | 0.617698 | 0.838848 | 0.926261 | 3.831364 |
13 | 0.135002 | 0.570181 | 0.528709 | 0.843758 | 1.314187 |
id | var1 | var2 | var3 | r | rh |
---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 |
14 | 0.438281 | 0.466449 | 0.432785 | 0.841271 | 7.552722 |
35 | 0.560663 | 0.443125 | 0.712812 | 0.157677 | 8.754831 |
160 | 0.514703 | 0.556517 | 0.227685 | 0.56752 | 2.089827 |
225 | 0.508374 | 0.477648 | 0.024097 | 0.650202 | 2.372914 |
228 | 0.449517 | 0.47311 | 0.917671 | 0.140131 | 8.827047 |
id | var1 | var2 | var3 | r | rh |
---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 |
58 | 0.419884 | 0.535903 | 0.039764 | 0.051981 | 1.401684 |
77 | 0.549855 | 0.534748 | 0.185199 | 0.274521 | 5.945685 |
141 | 0.451263 | 0.463935 | 0.892009 | 0.416547 | 5.000283 |
144 | 0.584173 | 0.514794 | 0.289369 | 0.589062 | 9.860071 |
198 | 0.465909 | 0.532728 | 0.233542 | 0.555194 | 4.687478 |
id | var1 | var2 | var3 | r | rh |
---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 |
228 | 0.449517 | 0.47311 | 0.917671 | 0.140131 | 8.827047 |
433 | 0.499018 | 0.536497 | 0.916673 | 0.780058 | 9.844932 |
756 | 0.538259 | 0.503251 | 0.575517 | 0.612985 | 9.982001 |
789 | 0.515183 | 0.456997 | 0.009057 | 0.922829 | 9.109619 |
829 | 0.530049 | 0.505223 | 0.655772 | 0.800784 | 7.142362 |
id | var1 | var2 | var3 | r | rh |
---|---|---|---|---|---|
u32 | f64 | f64 | f64 | f64 | f64 |
144 | 0.584173 | 0.514794 | 0.289369 | 0.589062 | 9.860071 |
337 | 0.443469 | 0.440043 | 0.024672 | 0.550996 | 9.993739 |
574 | 0.555338 | 0.527856 | 0.00325 | 0.836992 | 7.741302 |
681 | 0.572232 | 0.510764 | 0.569358 | 0.219204 | 9.860488 |
1054 | 0.483668 | 0.550031 | 0.07829 | 0.337293 | 6.788518 |
id | friends | count |
---|---|---|
u64 | list[u32] | u32 |
0 | [0, 1908] | 2 |
1 | [1, 13, … 256] | 4 |
2 | [2, 616, … 1266] | 4 |
3 | [3, 1247] | 2 |
4 | [4] | 1 |
id | friends | count |
---|---|---|
u64 | list[u32] | u32 |
0 | [0, 1942, … 1869] | 6 |
1 | [1, 1576, 154] | 3 |
2 | [2, 1227] | 2 |
3 | [3, 1044] | 2 |
4 | [4, 663, 1248] | 3 |
actual | predicted | 0-2 | 0-9 |
---|---|---|---|
f64 | f64 | i32 | i32 |
1.0 | 0.811084 | 2 | 3 |
0.0 | 0.803395 | 0 | 1 |
0.0 | 0.652138 | 2 | 4 |
1.0 | 0.881974 | 0 | 4 |
0.0 | 0.717603 | 0 | 2 |
actual | predicted | 0-2 | 0-9 |
---|---|---|---|
f64 | f64 | i32 | i32 |
0.0 | 0.395984 | 1 | 3 |
1.0 | 0.568538 | 0 | 8 |
0.0 | 0.47802 | 1 | 6 |
1.0 | 0.505546 | 0 | 7 |
0.0 | 0.64596 | 2 | 4 |
precision | recall | f | average_precision | roc_auc |
---|---|---|---|---|
f64 | f64 | f64 | f64 | f64 |
0.501208 | 0.50215 | 0.501679 | 0.499845 | 0.5011 |
precision | recall | f | average_precision | roc_auc |
---|---|---|---|---|
f64 | f64 | f64 | f64 | f64 |
0.497784 | 0.499179 | 0.49848 | 0.498427 | 0.498643 |
cnt<= | baseline_pct | actual_pct | psi_bin |
---|---|---|---|
f64 | f64 | f64 | f64 |
0.213856 | 0.2 | 0.22 | 0.001906 |
0.400451 | 0.2 | 0.207 | 0.000241 |
0.588933 | 0.2 | 0.204 | 0.000079 |
0.794855 | 0.2 | 0.194 | 0.000183 |
inf | 0.2 | 0.175 | 0.003338 |
cnt<= | baseline_pct | actual_pct | psi_bin |
---|---|---|---|
f64 | f64 | f64 | f64 |
0.204298 | 0.2 | 0.214 | 0.000947 |
0.388013 | 0.2 | 0.172 | 0.004223 |
0.579994 | 0.2 | 0.198 | 0.00002 |
0.765523 | 0.2 | 0.185 | 0.001169 |
inf | 0.2 | 0.231 | 0.004467 |
cid_ce |
---|
f64 |
12.762909 |
cid_ce |
---|
f64 |
13.129367 |
c3_stats |
---|
f64 |
0.126182 |
c3_stats |
---|
f64 |
0.123437 |