From 2e9a08c18b6ddf006110e914a2db7378f9d2a7bb Mon Sep 17 00:00:00 2001 From: jzsmoreno <42299052+jzsmoreno@users.noreply.github.com> Date: Sun, 8 Dec 2024 12:35:45 -0600 Subject: [PATCH] Bug fixes and final tests --- example.ipynb | 295 ++++++++++++++++++++++++++++++++---- pydbsmgr/utils/azure_sdk.py | 2 +- 2 files changed, 269 insertions(+), 28 deletions(-) diff --git a/example.ipynb b/example.ipynb index 1863ab5..72978ad 100644 --- a/example.ipynb +++ b/example.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -35,23 +35,23 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-01-19 00:51:57.724\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m48\u001b[0m - \u001b[1m1) Empty columns have been removed.\u001b[0m\n", - "\u001b[32m2024-01-19 00:51:57.744\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m50\u001b[0m - \u001b[1m1) Columns have been cleaned and transformed.\u001b[0m\n", - "\u001b[32m2024-01-19 00:51:57.763\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m161\u001b[0m - \u001b[1m1) The data type has been verified.\u001b[0m\n", - "\u001b[32m2024-01-19 00:51:57.765\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m163\u001b[0m - \u001b[1m1) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n", - "\u001b[32m2024-01-19 00:51:57.769\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1m1) Only the named columns have been retained.\u001b[0m\n", - "\u001b[32m2024-01-19 00:51:57.773\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m48\u001b[0m - \u001b[1m2) Empty columns have been removed.\u001b[0m\n", - "\u001b[32m2024-01-19 00:51:57.777\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m50\u001b[0m - \u001b[1m2) Columns have been cleaned and transformed.\u001b[0m\n", - "\u001b[32m2024-01-19 00:51:57.933\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m161\u001b[0m - \u001b[1m2) The data type has been verified.\u001b[0m\n", - "\u001b[32m2024-01-19 00:51:57.933\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m163\u001b[0m - \u001b[1m2) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n", - "\u001b[32m2024-01-19 00:51:57.944\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1m2) Only the named columns have been retained.\u001b[0m\n" + "\u001b[32m2024-12-08 12:34:36.016\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1m1) Empty columns have been removed.\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:36.017\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m53\u001b[0m - \u001b[1m1) Columns have been cleaned and transformed.\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1m1) The data type has been verified.\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1m1) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1m1) Only the named columns have been retained.\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1m2) Empty columns have been removed.\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m53\u001b[0m - \u001b[1m2) Columns have been cleaned and transformed.\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:36.034\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1m2) The data type has been verified.\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:36.036\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1m2) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:36.037\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1m2) Only the named columns have been retained.\u001b[0m\n" ] }, { @@ -204,7 +204,7 @@ "4 159.801 0.585 " ] }, - "execution_count": 3, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -218,16 +218,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-01-19 00:52:01.293\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m142\u001b[0m - \u001b[1mDataFrame 'Features' has been processed\u001b[0m\n", - "\u001b[32m2024-01-19 00:52:03.553\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m142\u001b[0m - \u001b[1mDataFrame 'Streams' has been processed\u001b[0m\n", - "\u001b[32m2024-01-19 00:52:03.563\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m146\u001b[0m - \u001b[1mA report has been created under the name './report.html'\u001b[0m\n" + "\u001b[32m2024-12-08 12:34:38.186\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mDataFrame 'Features' has been processed\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:39.468\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mDataFrame 'Streams' has been processed\u001b[0m\n", + "\u001b[32m2024-12-08 12:34:39.489\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m133\u001b[0m - \u001b[1mA report has been created under the name './report.html'\u001b[0m\n" ] } ], @@ -245,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -257,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -270,9 +270,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UserWarning: The logbook file test_logsbook.csv already exists, the changes will be added.\n" + ] + } + ], "source": [ "# For this example, assume you’re using a dictionary to pass the data\n", "data = {\n", @@ -299,18 +307,251 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ignoring data/test_database.csv, does not match \\w+.parquet\n" + ] + } + ], "source": [ - "controller.upload_parquet(\"/\", [df], [database_name])\n", - "controller.upload_parquet(\"/\", [df], [database_name], compression=False)\n", + "controller.upload_parquet(\"/data/\", [df], [database_name])\n", + "controller.upload_parquet(\"/data/\", [df], [database_name], compression=False)\n", "blob_list = controller.get_blob_list(\"/\")\n", "\n", - "dfs, names = controller.get_parquet(\"/\", \"\\w+.parquet\")\n", + "dfs, names = controller.get_parquet(\"/data/\", \"\\w+.parquet\")\n", "\n", "logbook.create(logbook_data)" ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "controller.upload_excel_csv(\"/data/\", [df], [database_name])" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ignoring data/test_database.csv, does not match \\w+.parquet.gz\n", + "Ignoring data/test_database.parquet, does not match \\w+.parquet.gz\n", + "number of files read: 1\n" + ] + } + ], + "source": [ + "dfs, names = controller.get_parquet(\"/data/\", \"\\w+.parquet.gz\")\n", + "print(f\"number of files read: {len(dfs)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namecityagepy-score
0XavierMexico City4188.0
1AnnToronto2879.0
2JanaPrague3381.0
3YiShanghai3480.0
4RobinManchester3868.0
\n", + "
" + ], + "text/plain": [ + " name city age py-score\n", + "0 Xavier Mexico City 41 88.0\n", + "1 Ann Toronto 28 79.0\n", + "2 Jana Prague 33 81.0\n", + "3 Yi Shanghai 34 80.0\n", + "4 Robin Manchester 38 68.0" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs[0].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "dfs, names = controller.get_excel_csv(\"/data/\", \"\\w\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namecityagepy-score
0XavierMexico City4188.0
1AnnToronto2879.0
2JanaPrague3381.0
3YiShanghai3480.0
4RobinManchester3868.0
\n", + "
" + ], + "text/plain": [ + " name city age py-score\n", + "0 Xavier Mexico City 41 88.0\n", + "1 Ann Toronto 28 79.0\n", + "2 Jana Prague 33 81.0\n", + "3 Yi Shanghai 34 80.0\n", + "4 Robin Manchester 38 68.0" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs[0].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/pydbsmgr/utils/azure_sdk.py b/pydbsmgr/utils/azure_sdk.py index fd0e279..f94e023 100644 --- a/pydbsmgr/utils/azure_sdk.py +++ b/pydbsmgr/utils/azure_sdk.py @@ -141,7 +141,7 @@ def _read_files(self, file_list, regex, file_type): dataframes.append(df) elif file_type == "excel_csv": - filename, extension = os.path.splitext(file.name.rsplit("/", 1)[1]) + filename, extension = os.path.splitext(file.name.split("/")[-1]) if extension == ".csv": try: blob_str = blob_data.decode("utf-8")