Merge pull request #2 from txhno/feature/dynamic-pdf-conversion

txhno · web-flow · commit e3b6b1f08d43 · 2024-04-02T03:57:00.000+05:30
Structure the export directory: save page images and the combined JSON under exported-jsons/
diff --git a/dynamic_pdf_to_json.ipynb b/dynamic_pdf_to_json.ipynb
@@ -18,13 +18,14 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
+      "execution_count": 47,
       "metadata": {
         "id": "t1O3Je_ENtpf"
       },
       "outputs": [],
       "source": [
         "from pdf2image import convert_from_path\n",
+        "from pathlib import Path\n",
         "import google.generativeai as genai\n",
         "import json\n",
         "import os"
@@ -41,14 +42,14 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 2,
+      "execution_count": 48,
       "metadata": {
         "id": "rHXNl6qnN3qO"
       },
       "outputs": [],
       "source": [
         "# Used to securely store your API key\n",
-        "GOOGLE_API_KEY = \"AIzaSyDxxX0kIspXTg34tXfQfoTO0istS9RbrQg\"  # replace 'your-api-key-here' with your actual API key\n",
+        "GOOGLE_API_KEY = \"your-api-key-here\"  # replace 'your-api-key-here' with your actual API key\n",
         "\n",
         "genai.configure(api_key=GOOGLE_API_KEY)"
       ]
@@ -64,7 +65,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 3,
+      "execution_count": 49,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -102,7 +103,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 4,
+      "execution_count": 50,
       "metadata": {
         "id": "sErUXoOIOMKs"
       },
@@ -139,7 +140,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 5,
+      "execution_count": 51,
       "metadata": {
         "id": "kDhL4GY1OuW_"
       },
@@ -161,14 +162,12 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 6,
+      "execution_count": 52,
       "metadata": {
         "id": "Eh_FdZ_nO2Xx"
       },
       "outputs": [],
       "source": [
-        "from pathlib import Path\n",
-        "\n",
         "def image_format(image_path):\n",
         "    img = Path(image_path)\n",
         "\n",
@@ -195,7 +194,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 7,
+      "execution_count": 53,
       "metadata": {
         "id": "gkDwnC9NQKSd"
       },
@@ -218,13 +217,14 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 8,
+      "execution_count": 54,
       "metadata": {},
       "outputs": [],
       "source": [
-        "pdf_path = \"pdfs/boundaried/sku_list_2.pdf\" ## replace PDF to parse\n",
+        "pdf_path = \"pdfs/boundaried/sku_list_2.pdf\"  # replace PDF to parse\n",
         "pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]\n",
-        "os.makedirs(pdf_name, exist_ok=True)"
+        "export_path = os.path.join(\"exported-jsons\", pdf_name)\n",
+        "os.makedirs(export_path, exist_ok=True)"
       ]
     },
     {
@@ -236,11 +236,11 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 9,
+      "execution_count": 55,
       "metadata": {},
       "outputs": [],
       "source": [
-        "images = convert_from_path(pdf_path, first_page=2, last_page=10) ## set the exact pages to parse"
+        "images = convert_from_path(pdf_path, first_page=2, last_page=3)  # set the exact pages to parse (1-based, inclusive)"
       ]
     },
     {
@@ -252,7 +252,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 10,
+      "execution_count": 56,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -274,17 +274,19 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 11,
+      "execution_count": 57,
       "metadata": {},
       "outputs": [],
       "source": [
-        "all_json_outputs = [] \n",
+        "images_dir = Path(\"exported-jsons\") / pdf_name / \"images\"\n",
+        "images_dir.mkdir(parents=True, exist_ok=True)  # Create the images directory and any missing parent directories; no error if it already exists\n",
         "\n",
-        "for i, image in enumerate(images):\n",
-        "    image_path = os.path.join(pdf_name, f\"output_image_{i}.png\")\n",
-        "    image.save(image_path, \"PNG\")\n",
+        "all_json_outputs = []\n",
         "\n",
-        "    response_text = gemini_output(image_path, system_prompt, user_prompt)\n",
+        "for i, image in enumerate(images):\n",
+        "    image_path = images_dir / f\"output_image_{i}.png\"\n",
+        "    image.save(image_path.as_posix(), \"PNG\")  # Save the image to the specified path\n",
+        "    response_text = gemini_output(image_path.as_posix(), system_prompt, user_prompt)\n",
         "\n",
         "    try:\n",
         "        json_output = json.loads(response_text)\n",
@@ -303,23 +305,24 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 12,
+      "execution_count": 58,
       "metadata": {},
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "Final JSON saved to: sku_list_2/sku_list_2.json.\n"
+            "Final JSON saved to: exported-jsons/sku_list_2.json.\n"
           ]
         }
       ],
       "source": [
-        "final_json_path = os.path.join(pdf_name, f\"{pdf_name}.json\")\n",
-        "with open(final_json_path, 'w') as f:\n",
+        "export_dir = Path(\"exported-jsons\")\n",
+        "export_json_path = export_dir / f\"{pdf_name}.json\"\n",
+        "with open(export_json_path, 'w') as f:\n",
         "    json.dump(all_json_outputs, f)\n",
         "\n",
-        "print(f\"Final JSON saved to: {final_json_path}.\")"
+        "print(f\"Final JSON saved to: {export_json_path}.\")"
       ]
     }
   ],