From 2e9a08c18b6ddf006110e914a2db7378f9d2a7bb Mon Sep 17 00:00:00 2001
From: jzsmoreno <42299052+jzsmoreno@users.noreply.github.com>
Date: Sun, 8 Dec 2024 12:35:45 -0600
Subject: [PATCH] Bug fixes and final tests
---
example.ipynb | 295 ++++++++++++++++++++++++++++++++----
pydbsmgr/utils/azure_sdk.py | 2 +-
2 files changed, 269 insertions(+), 28 deletions(-)
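Note: the notebook changes below re-run the example against the "/data/" prefix and add cells for the compressed-parquet and Excel/CSV round trips. As a minimal sketch of the flow those cells exercise — assuming the StorageController class and constructor arguments from pydbsmgr.utils.azure_sdk, since the setup cell itself is not part of this diff:

    from pydbsmgr.utils.azure_sdk import StorageController

    # Assumed setup: an Azure Blob Storage connection string and a
    # container name, as configured earlier in the example notebook.
    controller = StorageController(connection_string, container_name)

    # Upload the frame twice: gzip-compressed parquet (the default)
    # and uncompressed parquet.
    controller.upload_parquet("/data/", [df], [database_name])
    controller.upload_parquet("/data/", [df], [database_name], compression=False)

    # Read back only the compressed copies. The pattern is matched
    # against blob names; the unescaped dots match any character.
    dfs, names = controller.get_parquet("/data/", r"\w+.parquet.gz")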
diff --git a/example.ipynb b/example.ipynb
index 1863ab5..72978ad 100644
--- a/example.ipynb
+++ b/example.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -21,7 +21,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -35,23 +35,23 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[32m2024-01-19 00:51:57.724\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m48\u001b[0m - \u001b[1m1) Empty columns have been removed.\u001b[0m\n",
- "\u001b[32m2024-01-19 00:51:57.744\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m50\u001b[0m - \u001b[1m1) Columns have been cleaned and transformed.\u001b[0m\n",
- "\u001b[32m2024-01-19 00:51:57.763\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m161\u001b[0m - \u001b[1m1) The data type has been verified.\u001b[0m\n",
- "\u001b[32m2024-01-19 00:51:57.765\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m163\u001b[0m - \u001b[1m1) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
- "\u001b[32m2024-01-19 00:51:57.769\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1m1) Only the named columns have been retained.\u001b[0m\n",
- "\u001b[32m2024-01-19 00:51:57.773\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m48\u001b[0m - \u001b[1m2) Empty columns have been removed.\u001b[0m\n",
- "\u001b[32m2024-01-19 00:51:57.777\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m50\u001b[0m - \u001b[1m2) Columns have been cleaned and transformed.\u001b[0m\n",
- "\u001b[32m2024-01-19 00:51:57.933\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m161\u001b[0m - \u001b[1m2) The data type has been verified.\u001b[0m\n",
- "\u001b[32m2024-01-19 00:51:57.933\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m163\u001b[0m - \u001b[1m2) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
- "\u001b[32m2024-01-19 00:51:57.944\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1m2) Only the named columns have been retained.\u001b[0m\n"
+ "\u001b[32m2024-12-08 12:34:36.016\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1m1) Empty columns have been removed.\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:36.017\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m53\u001b[0m - \u001b[1m1) Columns have been cleaned and transformed.\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1m1) The data type has been verified.\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1m1) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1m1) Only the named columns have been retained.\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1m2) Empty columns have been removed.\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m53\u001b[0m - \u001b[1m2) Columns have been cleaned and transformed.\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:36.034\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1m2) The data type has been verified.\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:36.036\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1m2) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:36.037\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1m2) Only the named columns have been retained.\u001b[0m\n"
]
},
{
@@ -204,7 +204,7 @@
"4 159.801 0.585 "
]
},
- "execution_count": 3,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -218,16 +218,16 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[32m2024-01-19 00:52:01.293\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m142\u001b[0m - \u001b[1mDataFrame 'Features' has been processed\u001b[0m\n",
- "\u001b[32m2024-01-19 00:52:03.553\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m142\u001b[0m - \u001b[1mDataFrame 'Streams' has been processed\u001b[0m\n",
- "\u001b[32m2024-01-19 00:52:03.563\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m146\u001b[0m - \u001b[1mA report has been created under the name './report.html'\u001b[0m\n"
+ "\u001b[32m2024-12-08 12:34:38.186\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mDataFrame 'Features' has been processed\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:39.468\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mDataFrame 'Streams' has been processed\u001b[0m\n",
+ "\u001b[32m2024-12-08 12:34:39.489\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m133\u001b[0m - \u001b[1mA report has been created under the name './report.html'\u001b[0m\n"
]
}
],
@@ -245,7 +245,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -257,7 +257,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -270,9 +270,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "UserWarning: The logbook file test_logsbook.csv already exists, the changes will be added.\n"
+ ]
+ }
+ ],
"source": [
"# For this example, assume you’re using a dictionary to pass the data\n",
"data = {\n",
@@ -299,18 +307,251 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Ignoring data/test_database.csv, does not match \\w+.parquet\n"
+ ]
+ }
+ ],
"source": [
- "controller.upload_parquet(\"/\", [df], [database_name])\n",
- "controller.upload_parquet(\"/\", [df], [database_name], compression=False)\n",
+ "controller.upload_parquet(\"/data/\", [df], [database_name])\n",
+ "controller.upload_parquet(\"/data/\", [df], [database_name], compression=False)\n",
"blob_list = controller.get_blob_list(\"/\")\n",
"\n",
- "dfs, names = controller.get_parquet(\"/\", \"\\w+.parquet\")\n",
+ "dfs, names = controller.get_parquet(\"/data/\", \"\\w+.parquet\")\n",
"\n",
"logbook.create(logbook_data)"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "controller.upload_excel_csv(\"/data/\", [df], [database_name])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Ignoring data/test_database.csv, does not match \\w+.parquet.gz\n",
+ "Ignoring data/test_database.parquet, does not match \\w+.parquet.gz\n",
+ "number of files read: 1\n"
+ ]
+ }
+ ],
+ "source": [
+ "dfs, names = controller.get_parquet(\"/data/\", \"\\w+.parquet.gz\")\n",
+ "print(f\"number of files read: {len(dfs)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " city | \n",
+ " age | \n",
+ " py-score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Xavier | \n",
+ " Mexico City | \n",
+ " 41 | \n",
+ " 88.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Ann | \n",
+ " Toronto | \n",
+ " 28 | \n",
+ " 79.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Jana | \n",
+ " Prague | \n",
+ " 33 | \n",
+ " 81.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Yi | \n",
+ " Shanghai | \n",
+ " 34 | \n",
+ " 80.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Robin | \n",
+ " Manchester | \n",
+ " 38 | \n",
+ " 68.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name city age py-score\n",
+ "0 Xavier Mexico City 41 88.0\n",
+ "1 Ann Toronto 28 79.0\n",
+ "2 Jana Prague 33 81.0\n",
+ "3 Yi Shanghai 34 80.0\n",
+ "4 Robin Manchester 38 68.0"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dfs[0].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs, names = controller.get_excel_csv(\"/data/\", \"\\w\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " city | \n",
+ " age | \n",
+ " py-score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Xavier | \n",
+ " Mexico City | \n",
+ " 41 | \n",
+ " 88.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Ann | \n",
+ " Toronto | \n",
+ " 28 | \n",
+ " 79.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Jana | \n",
+ " Prague | \n",
+ " 33 | \n",
+ " 81.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Yi | \n",
+ " Shanghai | \n",
+ " 34 | \n",
+ " 80.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Robin | \n",
+ " Manchester | \n",
+ " 38 | \n",
+ " 68.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name city age py-score\n",
+ "0 Xavier Mexico City 41 88.0\n",
+ "1 Ann Toronto 28 79.0\n",
+ "2 Jana Prague 33 81.0\n",
+ "3 Yi Shanghai 34 80.0\n",
+ "4 Robin Manchester 38 68.0"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dfs[0].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
diff --git a/pydbsmgr/utils/azure_sdk.py b/pydbsmgr/utils/azure_sdk.py
index fd0e279..f94e023 100644
--- a/pydbsmgr/utils/azure_sdk.py
+++ b/pydbsmgr/utils/azure_sdk.py
@@ -141,7 +141,7 @@ def _read_files(self, file_list, regex, file_type):
dataframes.append(df)
elif file_type == "excel_csv":
- filename, extension = os.path.splitext(file.name.rsplit("/", 1)[1])
+ filename, extension = os.path.splitext(file.name.split("/")[-1])
if extension == ".csv":
try:
blob_str = blob_data.decode("utf-8")
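
The one-line change in _read_files swaps name.rsplit("/", 1)[1] for name.split("/")[-1] when extracting the blob's base name. A minimal standalone sketch of the difference, using one blob name from the notebook output above and one hypothetical root-level name:

    import os

    # One name nested under a prefix, one sitting at the container root.
    names = ["data/test_database.csv", "test_database.csv"]

    for name in names:
        # split("/")[-1] returns the last path component and still
        # works when the name contains no "/" at all.
        filename, extension = os.path.splitext(name.split("/")[-1])
        print(filename, extension)  # -> test_database .csv (both inputs)

    # The old expression breaks on root-level names:
    # "test_database.csv".rsplit("/", 1) == ["test_database.csv"],
    # so indexing [1] raises an IndexError.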