Skip to content

Commit

Permalink
Bug fixes and final tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jzsmoreno committed Dec 8, 2024
1 parent 3ba6816 commit 2e9a08c
Show file tree
Hide file tree
Showing 2 changed files with 269 additions and 28 deletions.
295 changes: 268 additions & 27 deletions example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -21,7 +21,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -35,23 +35,23 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-01-19 00:51:57.724\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m48\u001b[0m - \u001b[1m1) Empty columns have been removed.\u001b[0m\n",
"\u001b[32m2024-01-19 00:51:57.744\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m50\u001b[0m - \u001b[1m1) Columns have been cleaned and transformed.\u001b[0m\n",
"\u001b[32m2024-01-19 00:51:57.763\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m161\u001b[0m - \u001b[1m1) The data type has been verified.\u001b[0m\n",
"\u001b[32m2024-01-19 00:51:57.765\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m163\u001b[0m - \u001b[1m1) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
"\u001b[32m2024-01-19 00:51:57.769\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1m1) Only the named columns have been retained.\u001b[0m\n",
"\u001b[32m2024-01-19 00:51:57.773\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m48\u001b[0m - \u001b[1m2) Empty columns have been removed.\u001b[0m\n",
"\u001b[32m2024-01-19 00:51:57.777\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m50\u001b[0m - \u001b[1m2) Columns have been cleaned and transformed.\u001b[0m\n",
"\u001b[32m2024-01-19 00:51:57.933\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m161\u001b[0m - \u001b[1m2) The data type has been verified.\u001b[0m\n",
"\u001b[32m2024-01-19 00:51:57.933\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m163\u001b[0m - \u001b[1m2) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
"\u001b[32m2024-01-19 00:51:57.944\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1m2) Only the named columns have been retained.\u001b[0m\n"
"\u001b[32m2024-12-08 12:34:36.016\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1m1) Empty columns have been removed.\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:36.017\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m53\u001b[0m - \u001b[1m1) Columns have been cleaned and transformed.\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1m1) The data type has been verified.\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1m1) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1m1) Only the named columns have been retained.\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1m2) Empty columns have been removed.\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:36.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mfix\u001b[0m:\u001b[36m53\u001b[0m - \u001b[1m2) Columns have been cleaned and transformed.\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:36.034\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m149\u001b[0m - \u001b[1m2) The data type has been verified.\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:36.036\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1m2) The `nan` strings have been replaced by `np.nan`.\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:36.037\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36m_ops_dtypes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1m2) Only the named columns have been retained.\u001b[0m\n"
]
},
{
Expand Down Expand Up @@ -204,7 +204,7 @@
"4 159.801 0.585 "
]
},
"execution_count": 3,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -218,16 +218,16 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-01-19 00:52:01.293\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m142\u001b[0m - \u001b[1mDataFrame 'Features' has been processed\u001b[0m\n",
"\u001b[32m2024-01-19 00:52:03.553\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m142\u001b[0m - \u001b[1mDataFrame 'Streams' has been processed\u001b[0m\n",
"\u001b[32m2024-01-19 00:52:03.563\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m146\u001b[0m - \u001b[1mA report has been created under the name './report.html'\u001b[0m\n"
"\u001b[32m2024-12-08 12:34:38.186\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mDataFrame 'Features' has been processed\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:39.468\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1mDataFrame 'Streams' has been processed\u001b[0m\n",
"\u001b[32m2024-12-08 12:34:39.489\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpydbsmgr.health\u001b[0m:\u001b[36mgenerate_report\u001b[0m:\u001b[36m133\u001b[0m - \u001b[1mA report has been created under the name './report.html'\u001b[0m\n"
]
}
],
Expand All @@ -245,7 +245,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -257,7 +257,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -270,9 +270,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"UserWarning: The logbook file test_logsbook.csv already exists, the changes will be added.\n"
]
}
],
"source": [
"# For this example, assume you’re using a dictionary to pass the data\n",
"data = {\n",
Expand All @@ -299,18 +307,251 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 22,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ignoring data/test_database.csv, does not match \\w+.parquet\n"
]
}
],
"source": [
"controller.upload_parquet(\"/\", [df], [database_name])\n",
"controller.upload_parquet(\"/\", [df], [database_name], compression=False)\n",
"controller.upload_parquet(\"/data/\", [df], [database_name])\n",
"controller.upload_parquet(\"/data/\", [df], [database_name], compression=False)\n",
"blob_list = controller.get_blob_list(\"/\")\n",
"\n",
"dfs, names = controller.get_parquet(\"/\", \"\\w+.parquet\")\n",
"dfs, names = controller.get_parquet(\"/data/\", \"\\w+.parquet\")\n",
"\n",
"logbook.create(logbook_data)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"controller.upload_excel_csv(\"/data/\", [df], [database_name])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ignoring data/test_database.csv, does not match \\w+.parquet.gz\n",
"Ignoring data/test_database.parquet, does not match \\w+.parquet.gz\n",
"number of files read: 1\n"
]
}
],
"source": [
"dfs, names = controller.get_parquet(\"/data/\", \"\\w+.parquet.gz\")\n",
"print(f\"number of files read: {len(dfs)}\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>city</th>\n",
" <th>age</th>\n",
" <th>py-score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Xavier</td>\n",
" <td>Mexico City</td>\n",
" <td>41</td>\n",
" <td>88.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Ann</td>\n",
" <td>Toronto</td>\n",
" <td>28</td>\n",
" <td>79.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Jana</td>\n",
" <td>Prague</td>\n",
" <td>33</td>\n",
" <td>81.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Yi</td>\n",
" <td>Shanghai</td>\n",
" <td>34</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Robin</td>\n",
" <td>Manchester</td>\n",
" <td>38</td>\n",
" <td>68.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name city age py-score\n",
"0 Xavier Mexico City 41 88.0\n",
"1 Ann Toronto 28 79.0\n",
"2 Jana Prague 33 81.0\n",
"3 Yi Shanghai 34 80.0\n",
"4 Robin Manchester 38 68.0"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfs[0].head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"dfs, names = controller.get_excel_csv(\"/data/\", \"\\w\")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>city</th>\n",
" <th>age</th>\n",
" <th>py-score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Xavier</td>\n",
" <td>Mexico City</td>\n",
" <td>41</td>\n",
" <td>88.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Ann</td>\n",
" <td>Toronto</td>\n",
" <td>28</td>\n",
" <td>79.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Jana</td>\n",
" <td>Prague</td>\n",
" <td>33</td>\n",
" <td>81.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Yi</td>\n",
" <td>Shanghai</td>\n",
" <td>34</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Robin</td>\n",
" <td>Manchester</td>\n",
" <td>38</td>\n",
" <td>68.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name city age py-score\n",
"0 Xavier Mexico City 41 88.0\n",
"1 Ann Toronto 28 79.0\n",
"2 Jana Prague 33 81.0\n",
"3 Yi Shanghai 34 80.0\n",
"4 Robin Manchester 38 68.0"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfs[0].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion pydbsmgr/utils/azure_sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def _read_files(self, file_list, regex, file_type):
dataframes.append(df)

elif file_type == "excel_csv":
filename, extension = os.path.splitext(file.name.rsplit("/", 1)[1])
filename, extension = os.path.splitext(file.name.split("/")[-1])
if extension == ".csv":
try:
blob_str = blob_data.decode("utf-8")
Expand Down

0 comments on commit 2e9a08c

Please sign in to comment.