diff --git a/diamonds-regression.ipynb b/diamonds-regression.ipynb new file mode 100644 index 0000000..a7a34b0 --- /dev/null +++ b/diamonds-regression.ipynb @@ -0,0 +1,884 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
caratcutcolorclaritydepthtablepricexyz
10.23IdealESI261.555.03263.953.982.43
20.21PremiumESI159.861.03263.893.842.31
30.23GoodEVS156.965.03274.054.072.31
40.29PremiumIVS262.458.03344.204.232.63
50.31GoodJSI263.358.03354.344.352.75
\n", + "
" + ], + "text/plain": [ + " carat cut color clarity depth table price x y z\n", + "1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n", + "2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n", + "3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n", + "4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n", + "5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"diamonds.csv\", index_col=0)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['cut'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "cut_class_dict = {\"Fair\": 1, \"Good\": 2, \"Very Good\": 3, \"Premium\": 4, \"Ideal\": 5}" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],\n", + " dtype=object)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['clarity'].unique()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Clarity grades, ordered best to worst (taken from the dataset page): FL, IF, VVS1, VVS2, VS1, VS2, SI1, SI2, I1, I2, I3. Like cut, clarity is ordinal, so we encode it with a dict as well.\n", + "\n", + "Color is ordinal too: D is the best grade and J is the worst."
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Ordinal encodings: each scale is listed from worst to best and its grades are numbered from 1.\n", + "clarity_dict = {grade: rank for rank, grade in enumerate([\"I3\", \"I2\", \"I1\", \"SI2\", \"SI1\", \"VS2\", \"VS1\", \"VVS2\", \"VVS1\", \"IF\", \"FL\"], start=1)}\n", + "color_dict = {grade: rank for rank, grade in enumerate(\"JIHGFED\", start=1)}" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
caratcutcolorclaritydepthtablepricexyz
10.2356461.555.03263.953.982.43
20.2146559.861.03263.893.842.31
30.2326756.965.03274.054.072.31
40.2942662.458.03344.204.232.63
50.3121463.358.03354.344.352.75
\n", + "
" + ], + "text/plain": [ + " carat cut color clarity depth table price x y z\n", + "1 0.23 5 6 4 61.5 55.0 326 3.95 3.98 2.43\n", + "2 0.21 4 6 5 59.8 61.0 326 3.89 3.84 2.31\n", + "3 0.23 2 6 7 56.9 65.0 327 4.05 4.07 2.31\n", + "4 0.29 4 2 6 62.4 58.0 334 4.20 4.23 2.63\n", + "5 0.31 2 1 4 63.3 58.0 335 4.34 4.35 2.75" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NOTE: not idempotent - re-running this cell maps the already-encoded integers to NaN; reload df first.\n", + "df['cut'] = df['cut'].map(cut_class_dict)\n", + "df['clarity'] = df['clarity'].map(clarity_dict)\n", + "df['color'] = df['color'].map(color_dict)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "from sklearn.linear_model import SGDRegressor\n", + "\n", + "SEED = 42 # fixed seed so the shuffle, and therefore the train/test split, is reproducible\n", + "\n", + "df = sklearn.utils.shuffle(df, random_state=SEED) # always shuffle your data to avoid any biases that may emerge b/c of some order.\n", + "\n", + "X = df.drop(\"price\", axis=1).values\n", + "y = df[\"price\"].values" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "test_size = 200\n", + "\n", + "X_train = X[:-test_size]\n", + "y_train = y[:-test_size]\n", + "\n", + "X_test = X[-test_size:]\n", + "y_test = y[-test_size:]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-169716966.18255037\n" + ] + } + ], + "source": [ + "clf = SGDRegressor(max_iter=1000, random_state=SEED)\n", + "clf.fit(X_train, y_train)\n", + "\n", + "print(clf.score(X_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "22444137.136467457 3365\n", + "-43443171.014743805 842\n", + "27170870.383862972 5600\n", + "30423105.71565962 6468\n", + "-45202250.546970844 3360\n", + "-57564682.934447765 876\n", + "-78040747.2394104 4633\n", + "-45878871.67086029 3873\n", +
"1726116.0675020218 1755\n", + "-62319117.010454655 456\n" + ] + } + ], + "source": [ + "# Loop names differ from X/y on purpose: reusing them would overwrite the feature matrix and target.\n", + "for row, actual in list(zip(X_test, y_test))[:10]:\n", + "    print(clf.predict([row])[0], actual)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-0.09673774039928018\n" + ] + } + ], + "source": [ + "from sklearn import svm\n", + "\n", + "clf = svm.SVR()\n", + "\n", + "clf.fit(X_train, y_train)\n", + "print(clf.score(X_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2391.8871714277357 3365\n", + "2287.721993172211 842\n", + "2465.0369637934473 5600\n", + "2455.5107598729282 6468\n", + "2482.5199710167467 3360\n", + "2364.1066446235336 876\n", + "2449.7215837677586 4633\n", + "2440.2195393614793 3873\n", + "2403.696173119835 1755\n", + "2334.406910211517 456\n" + ] + } + ], + "source": [ + "for row, actual in list(zip(X_test, y_test))[:10]:\n", + "    print(clf.predict([row])[0], actual)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-11266095.012137182\n", + "-28100216.207829475 3365\n", + "-6752871.735469699 842\n", + "-29022496.428314924 5600\n", + "-30273679.579220533 6468\n", + "-8136124.630202532 3360\n", + "-2888225.139208436 876\n", + "1858105.8247030973 4633\n", + "-6508417.491261959 3873\n", + "-20259675.351282597 1755\n", + "-753576.3754941225 456\n" + ] + } + ], + "source": [ + "clf = SGDRegressor(max_iter=10000, random_state=SEED)\n", + "\n", + "clf.fit(X_train, y_train)\n", + "print(clf.score(X_test, y_test))\n", + "\n", + "for row, actual in list(zip(X_test, y_test))[:10]:\n", + "    print(clf.predict([row])[0], actual)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Improving the models: scale the data first" + ] + }, + { + "cell_type": "code", +
"execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5689918450445994\n", + "model predicts 1092.5823243952013, real value: 612\n", + "model predicts 6704.744710626697, real value: 18656\n", + "model predicts 2940.98116841229, real value: 2398\n", + "model predicts 5271.756220864841, real value: 7339\n", + "model predicts 6356.498143421156, real value: 15600\n", + "model predicts 715.4851809094562, real value: 872\n", + "model predicts 708.4884214473241, real value: 945\n", + "model predicts 6432.3572834037395, real value: 13622\n", + "model predicts 4453.600641313859, real value: 4315\n", + "model predicts 4154.437850063783, real value: 14208\n" + ] + } + ], + "source": [ + "import sklearn\n", + "from sklearn import svm, preprocessing\n", + "\n", + "df = sklearn.utils.shuffle(df, random_state=SEED) \n", + "\n", + "X = df.drop(\"price\", axis=1).values\n", + "y = df[\"price\"].values\n", + "\n", + "test_size = 200\n", + "\n", + "X_train = X[:-test_size]\n", + "y_train = y[:-test_size]\n", + "\n", + "X_test = X[-test_size:]\n", + "y_test = y[-test_size:]\n", + "\n", + "# Fit the scaler on the training rows only (no test-set leakage) and keep it around:\n", + "# prediction_function_wrapper has to apply the same transform to raw inputs.\n", + "scaler = preprocessing.StandardScaler().fit(X_train)\n", + "X_train = scaler.transform(X_train)\n", + "X_test = scaler.transform(X_test)\n", + "\n", + "clf = svm.SVR()\n", + "\n", + "clf.fit(X_train, y_train)\n", + "print(clf.score(X_test, y_test))\n", + "\n", + "for row, actual in list(zip(X_test, y_test))[:10]:\n", + "    print(f\"model predicts {clf.predict([row])[0]}, real value: {actual}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def prediction_function_wrapper(X_test):\n", + "    \"\"\"Predict prices for raw (unscaled) feature rows.\n", + "\n", + "    X_test may be a DataFrame or an array; its columns are assumed to be in the\n", + "    training order (df without 'price'). The rows are scaled with the fitted\n", + "    scaler before clf.predict, because clf was trained on scaled features.\n", + "    \"\"\"\n", + "    raw = getattr(X_test, \"values\", X_test) # plain array: scaler and clf were fitted without feature names\n", + "    return clf.predict(scaler.transform(raw))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from giskard import GiskardClient\n", + "\n", + "# Giskard server URL; set GISKARD_URL to point elsewhere, e.g. a local install (see: https://docs.giskard.ai/start/guides/installation)\n", + "url = os.environ.get(\"GISKARD_URL\", \"http://34.83.122.220:19000\")\n", +
"# Never commit the API token: generate one in the Admin tab of the Giskard application and export it as GISKARD_API_TOKEN\n", + "token = os.environ[\"GISKARD_API_TOKEN\"]\n", + "client = GiskardClient(url, token)\n", + "\n", + "project = client.create_project(\"diamonds\", \"REGRESSION\", \"DESCRIPTION\") #Choose the arguments you want. But \"project_key\" should be unique and in lower case\n", + "#If your project is already created use project = client.get_project(\"existing_project_key\")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
caratcutcolorclaritydepthtablepricexyz
10.2356461.555.03263.953.982.43
20.2146559.861.03263.893.842.31
30.2326756.965.03274.054.072.31
40.2942662.458.03344.204.232.63
50.3121463.358.03354.344.352.75
\n", + "
" + ], + "text/plain": [ + " carat cut color clarity depth table price x y z\n", + "1 0.23 5 6 4 61.5 55.0 326 3.95 3.98 2.43\n", + "2 0.21 4 6 5 59.8 61.0 326 3.89 3.84 2.31\n", + "3 0.23 2 6 7 56.9 65.0 327 4.05 4.07 2.31\n", + "4 0.29 4 2 6 62.4 58.0 334 4.20 4.23 2.63\n", + "5 0.31 2 1 4 63.3 58.0 335 4.34 4.35 2.75" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "# Every column of df, the target included, is declared with the 'numeric' type.\n", + "column_types = dict.fromkeys(\n", + "    ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'price', 'y', 'z'],\n", + "    'numeric',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "# feature_types declares the features the model is trained on: every column except the target\n", + "feature_types = {name: kind for name, kind in column_types.items() if name != 'price'}" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + } + ], + "source": [ + "print(isinstance(df, pd.DataFrame))" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
caratcutcolorclaritydepthtablepricexyz
10.2356461.555.03263.953.982.43
\n", + "
" + ], + "text/plain": [ + " carat cut color clarity depth table price x y z\n", + "1 0.23 5 6 4 61.5 55.0 326 3.95 3.98 2.43" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/cloudsuperadmin/.local/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SVR was fitted without feature names\n", + " warnings.warn(\n", + "/home/cloudsuperadmin/.local/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SVR was fitted without feature names\n", + " warnings.warn(\n", + "/home/cloudsuperadmin/.local/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SVR was fitted without feature names\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset successfully uploaded to project key 'diamonds' with ID = 33. It is available at http://34.83.122.220:19000 \n", + "Model successfully uploaded to project key 'diamonds' with ID = 34.
It is available at http://34.83.122.220:19000 \n" + ] + }, + { + "data": { + "text/plain": [ + "(34, 33)" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Upload the model's prediction function together with the dataset used to inspect it.\n", + "project.upload_model_and_df(\n", + "    # Python function which takes pandas dataframe as input and returns probabilities for classification model OR returns predictions for regression model\n", + "    prediction_function=prediction_function_wrapper,\n", + "    # \"classification\" for classification model OR \"regression\" for regression model\n", + "    model_type='regression',\n", + "    # The dataset you want to use to inspect your model\n", + "    df=df,\n", + "    # A dictionary with columns names of df as key and types(category, numeric, text) of columns as values\n", + "    column_types=column_types,\n", + "    # The column name in df corresponding to the actual target variable (ground truth).\n", + "    target='price',\n", + "    # List of the feature names of prediction_function\n", + "    feature_names=list(feature_types.keys()),\n", + "    # Name of the model\n", + "    model_name='clf',\n", + "    # Name of the dataset\n", + "    dataset_name='diamonds.csv',\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}