Skip to content

Commit

Permalink
Merge pull request #2 from txhno/feature/dynamic-pdf-conversion
Browse files Browse the repository at this point in the history
structuring export directory
  • Loading branch information
txhno authored Apr 1, 2024
2 parents ce9090b + 51a0ca2 commit e3b6b1f
Showing 1 changed file with 30 additions and 27 deletions.
57 changes: 30 additions & 27 deletions dynamic_pdf_to_json.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 47,
"metadata": {
"id": "t1O3Je_ENtpf"
},
"outputs": [],
"source": [
"from pdf2image import convert_from_path\n",
"from pathlib import Path\n",
"import google.generativeai as genai\n",
"import json\n",
"import os"
Expand All @@ -41,14 +42,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 48,
"metadata": {
"id": "rHXNl6qnN3qO"
},
"outputs": [],
"source": [
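"# API key passed to genai.configure below; it is a plain string literal here, so never commit a real key (prefer an environment variable)\n",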
"GOOGLE_API_KEY = \"[REDACTED]\" # replace 'your-api-key-here' with your actual API key\n",
"GOOGLE_API_KEY = \"your-api-key-here\" # replace 'your-api-key-here' with your actual API key\n",
"\n",
"genai.configure(api_key=GOOGLE_API_KEY)"
]
Expand All @@ -64,7 +65,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 49,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
Expand Down Expand Up @@ -102,7 +103,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 50,
"metadata": {
"id": "sErUXoOIOMKs"
},
Expand Down Expand Up @@ -139,7 +140,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 51,
"metadata": {
"id": "kDhL4GY1OuW_"
},
Expand All @@ -161,14 +162,12 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 52,
"metadata": {
"id": "Eh_FdZ_nO2Xx"
},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"def image_format(image_path):\n",
" img = Path(image_path)\n",
"\n",
Expand All @@ -195,7 +194,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 53,
"metadata": {
"id": "gkDwnC9NQKSd"
},
Expand All @@ -218,13 +217,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"pdf_path = \"pdfs/boundaried/sku_list_2.pdf\" ## replace PDF to parse\n",
"pdf_path = \"pdfs/boundaried/sku_list_2.pdf\" # replace PDF to parse\n",
"pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]\n",
"os.makedirs(pdf_name, exist_ok=True)"
"export_path = os.path.join(\"exported-jsons\", pdf_name)\n",
"os.makedirs(export_path, exist_ok=True)"
]
},
{
Expand All @@ -236,11 +236,11 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"images = convert_from_path(pdf_path, first_page=2, last_page=10) ## set the exact pages to parse"
"images = convert_from_path(pdf_path, first_page=2, last_page=3) ## set the exact pages to parse"
]
},
{
Expand All @@ -252,7 +252,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -274,17 +274,19 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"all_json_outputs = [] \n",
"images_dir = Path(\"exported-jsons\") / pdf_name / \"images\"\n",
"images_dir.mkdir(parents=True, exist_ok=True) # This creates the directory if it doesn't exist\n",
"\n",
"for i, image in enumerate(images):\n",
" image_path = os.path.join(pdf_name, f\"output_image_{i}.png\")\n",
" image.save(image_path, \"PNG\")\n",
"all_json_outputs = []\n",
"\n",
" response_text = gemini_output(image_path, system_prompt, user_prompt)\n",
"for i, image in enumerate(images):\n",
" image_path = images_dir / f\"output_image_{i}.png\"\n",
" image.save(image_path.as_posix(), \"PNG\") # Save the image to the specified path\n",
" response_text = gemini_output(image_path.as_posix(), system_prompt, user_prompt)\n",
"\n",
" try:\n",
" json_output = json.loads(response_text)\n",
Expand All @@ -303,23 +305,24 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Final JSON saved to: sku_list_2/sku_list_2.json.\n"
"Final JSON saved to: exported-jsons/sku_list_2.json.\n"
]
}
],
"source": [
"final_json_path = os.path.join(pdf_name, f\"{pdf_name}.json\")\n",
"with open(final_json_path, 'w') as f:\n",
"export_dir = Path(\"exported-jsons\")\n",
"export_json_path = export_dir / f\"{pdf_name}.json\"\n",
"with open(export_json_path, 'w') as f:\n",
" json.dump(all_json_outputs, f)\n",
"\n",
"print(f\"Final JSON saved to: {final_json_path}.\")"
"print(f\"Final JSON saved to: {export_json_path}.\")"
]
}
],
Expand Down

0 comments on commit e3b6b1f

Please sign in to comment.