Bug fixes and final tests

jzsmoreno · Dec 8, 2024 · 2e9a08c
1 parent 3ba6816
commit 2e9a08c
Show file tree

Hide file tree

Showing 2 changed files with 269 additions and 28 deletions.
diff --git a/example.ipynb b/example.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,23 +35,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-01-19 00:51:57.724\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m48\u001b[0m - \u001b[1m1) Empty columns have been removed.\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:51:57.744\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m50\u001b[0m - \u001b[1m1) Columns have been cleaned and transformed.\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:51:57.763\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m161\u001b[0m - \u001b[1m1) The data type has been verified.\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:51:57.765\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m163\u001b[0m - \u001b[1m1) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:51:57.769\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1m1) Only the named columns have been retained.\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:51:57.773\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m48\u001b[0m - \u001b[1m2) Empty columns have been removed.\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:51:57.777\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m50\u001b[0m - \u001b[1m2) Columns have been cleaned and transformed.\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:51:57.933\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m161\u001b[0m - \u001b[1m2) The data type has been verified.\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:51:57.933\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m163\u001b[0m - \u001b[1m2) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:51:57.944\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1m2) Only the named columns have been retained.\u001b[0m\n"
+      "\u001b[32m2024-12-08 12:34:36.016\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1m1) Empty columns have been removed.\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:36.017\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m53\u001b[0m - \u001b[1m1) Columns have been cleaned and transformed.\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1m1) The data type has been verified.\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1m1) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1m1) Only the named columns have been retained.\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1m2) Empty columns have been removed.\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m53\u001b[0m - \u001b[1m2) Columns have been cleaned and transformed.\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:36.034\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1m2) The data type has been verified.\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:36.036\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1m2) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:36.037\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1m2) Only the named columns have been retained.\u001b[0m\n"
      ]
     },
     {
@@ -204,7 +204,7 @@
        "4  159.801         0.585  "
       ]
      },
-     "execution_count": 3,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -218,16 +218,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-01-19 00:52:01.293\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m142\u001b[0m - \u001b[1mDataFrame 'Features' has been processed\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:52:03.553\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m142\u001b[0m - \u001b[1mDataFrame 'Streams' has been processed\u001b[0m\n",
-      "\u001b[32m2024-01-19 00:52:03.563\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m146\u001b[0m - \u001b[1mA report has been created under the name './report.html'\u001b[0m\n"
+      "\u001b[32m2024-12-08 12:34:38.186\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mDataFrame 'Features' has been processed\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:39.468\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mDataFrame 'Streams' has been processed\u001b[0m\n",
+      "\u001b[32m2024-12-08 12:34:39.489\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m133\u001b[0m - \u001b[1mA report has been created under the name './report.html'\u001b[0m\n"
      ]
     }
    ],
@@ -245,7 +245,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -257,7 +257,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -270,9 +270,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "UserWarning: The logbook file test_logsbook.csv already exists, the changes will be added.\n"
+     ]
+    }
+   ],
    "source": [
     "# For this example, assume you’re using a dictionary to pass the data\n",
     "data = {\n",
@@ -299,18 +307,251 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ignoring data/test_database.csv, does not match \\w+.parquet\n"
+     ]
+    }
+   ],
    "source": [
-    "controller.upload_parquet(\"/\", [df], [database_name])\n",
-    "controller.upload_parquet(\"/\", [df], [database_name], compression=False)\n",
+    "controller.upload_parquet(\"/data/\", [df], [database_name])\n",
+    "controller.upload_parquet(\"/data/\", [df], [database_name], compression=False)\n",
     "blob_list = controller.get_blob_list(\"/\")\n",
     "\n",
-    "dfs, names = controller.get_parquet(\"/\", \"\\w+.parquet\")\n",
+    "dfs, names = controller.get_parquet(\"/data/\", \"\\w+.parquet\")\n",
     "\n",
     "logbook.create(logbook_data)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "controller.upload_excel_csv(\"/data/\", [df], [database_name])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ignoring data/test_database.csv, does not match \\w+.parquet.gz\n",
+      "Ignoring data/test_database.parquet, does not match \\w+.parquet.gz\n",
+      "number of files read: 1\n"
+     ]
+    }
+   ],
+   "source": [
+    "dfs, names = controller.get_parquet(\"/data/\", \"\\w+.parquet.gz\")\n",
+    "print(f\"number of files read: {len(dfs)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>city</th>\n",
+       "      <th>age</th>\n",
+       "      <th>py-score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Xavier</td>\n",
+       "      <td>Mexico City</td>\n",
+       "      <td>41</td>\n",
+       "      <td>88.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Ann</td>\n",
+       "      <td>Toronto</td>\n",
+       "      <td>28</td>\n",
+       "      <td>79.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Jana</td>\n",
+       "      <td>Prague</td>\n",
+       "      <td>33</td>\n",
+       "      <td>81.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Yi</td>\n",
+       "      <td>Shanghai</td>\n",
+       "      <td>34</td>\n",
+       "      <td>80.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Robin</td>\n",
+       "      <td>Manchester</td>\n",
+       "      <td>38</td>\n",
+       "      <td>68.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     name         city  age  py-score\n",
+       "0  Xavier  Mexico City   41      88.0\n",
+       "1     Ann      Toronto   28      79.0\n",
+       "2    Jana       Prague   33      81.0\n",
+       "3      Yi     Shanghai   34      80.0\n",
+       "4   Robin   Manchester   38      68.0"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dfs[0].head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfs, names = controller.get_excel_csv(\"/data/\", \"\\w\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>city</th>\n",
+       "      <th>age</th>\n",
+       "      <th>py-score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Xavier</td>\n",
+       "      <td>Mexico City</td>\n",
+       "      <td>41</td>\n",
+       "      <td>88.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Ann</td>\n",
+       "      <td>Toronto</td>\n",
+       "      <td>28</td>\n",
+       "      <td>79.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Jana</td>\n",
+       "      <td>Prague</td>\n",
+       "      <td>33</td>\n",
+       "      <td>81.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Yi</td>\n",
+       "      <td>Shanghai</td>\n",
+       "      <td>34</td>\n",
+       "      <td>80.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Robin</td>\n",
+       "      <td>Manchester</td>\n",
+       "      <td>38</td>\n",
+       "      <td>68.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     name         city  age  py-score\n",
+       "0  Xavier  Mexico City   41      88.0\n",
+       "1     Ann      Toronto   28      79.0\n",
+       "2    Jana       Prague   33      81.0\n",
+       "3      Yi     Shanghai   34      80.0\n",
+       "4   Robin   Manchester   38      68.0"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dfs[0].head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/pydbsmgr/utils/azure_sdk.py b/pydbsmgr/utils/azure_sdk.py
@@ -141,7 +141,7 @@ def _read_files(self, file_list, regex, file_type):
                     dataframes.append(df)
 
             elif file_type == "excel_csv":
-                filename, extension = os.path.splitext(file.name.rsplit("/", 1)[1])
+                filename, extension = os.path.splitext(file.name.split("/")[-1])
                 if extension == ".csv":
                     try:
                         blob_str = blob_data.decode("utf-8")