Skip to content

Commit e3b6b1f

Browse files
authored
Merge pull request #2 from txhno/feature/dynamic-pdf-conversion
structuring export directory
2 parents ce9090b + 51a0ca2 commit e3b6b1f

File tree

1 file changed

+30
-27
lines changed

1 file changed

+30
-27
lines changed

dynamic_pdf_to_json.ipynb

+30-27
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,14 @@
1818
},
1919
{
2020
"cell_type": "code",
21-
"execution_count": 1,
21+
"execution_count": 47,
2222
"metadata": {
2323
"id": "t1O3Je_ENtpf"
2424
},
2525
"outputs": [],
2626
"source": [
2727
"from pdf2image import convert_from_path\n",
28+
"from pathlib import Path\n",
2829
"import google.generativeai as genai\n",
2930
"import json\n",
3031
"import os"
@@ -41,14 +42,14 @@
4142
},
4243
{
4344
"cell_type": "code",
44-
"execution_count": 2,
45+
"execution_count": 48,
4546
"metadata": {
4647
"id": "rHXNl6qnN3qO"
4748
},
4849
"outputs": [],
4950
"source": [
5051
"# Used to securely store your API key\n",
51-
"GOOGLE_API_KEY = \"AIzaSyDxxX0kIspXTg34tXfQfoTO0istS9RbrQg\" # replace 'your-api-key-here' with your actual API key\n",
52+
"GOOGLE_API_KEY = \"your-api-key-here\" # replace 'your-api-key-here' with your actual API key\n",
5253
"\n",
5354
"genai.configure(api_key=GOOGLE_API_KEY)"
5455
]
@@ -64,7 +65,7 @@
6465
},
6566
{
6667
"cell_type": "code",
67-
"execution_count": 3,
68+
"execution_count": 49,
6869
"metadata": {
6970
"colab": {
7071
"base_uri": "https://localhost:8080/",
@@ -102,7 +103,7 @@
102103
},
103104
{
104105
"cell_type": "code",
105-
"execution_count": 4,
106+
"execution_count": 50,
106107
"metadata": {
107108
"id": "sErUXoOIOMKs"
108109
},
@@ -139,7 +140,7 @@
139140
},
140141
{
141142
"cell_type": "code",
142-
"execution_count": 5,
143+
"execution_count": 51,
143144
"metadata": {
144145
"id": "kDhL4GY1OuW_"
145146
},
@@ -161,14 +162,12 @@
161162
},
162163
{
163164
"cell_type": "code",
164-
"execution_count": 6,
165+
"execution_count": 52,
165166
"metadata": {
166167
"id": "Eh_FdZ_nO2Xx"
167168
},
168169
"outputs": [],
169170
"source": [
170-
"from pathlib import Path\n",
171-
"\n",
172171
"def image_format(image_path):\n",
173172
" img = Path(image_path)\n",
174173
"\n",
@@ -195,7 +194,7 @@
195194
},
196195
{
197196
"cell_type": "code",
198-
"execution_count": 7,
197+
"execution_count": 53,
199198
"metadata": {
200199
"id": "gkDwnC9NQKSd"
201200
},
@@ -218,13 +217,14 @@
218217
},
219218
{
220219
"cell_type": "code",
221-
"execution_count": 8,
220+
"execution_count": 54,
222221
"metadata": {},
223222
"outputs": [],
224223
"source": [
225-
"pdf_path = \"pdfs/boundaried/sku_list_2.pdf\" ## replace PDF to parse\n",
224+
"pdf_path = \"pdfs/boundaried/sku_list_2.pdf\" # replace PDF to parse\n",
226225
"pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]\n",
227-
"os.makedirs(pdf_name, exist_ok=True)"
226+
"export_path = os.path.join(\"exported-jsons\", pdf_name)\n",
227+
"os.makedirs(export_path, exist_ok=True)"
228228
]
229229
},
230230
{
@@ -236,11 +236,11 @@
236236
},
237237
{
238238
"cell_type": "code",
239-
"execution_count": 9,
239+
"execution_count": 55,
240240
"metadata": {},
241241
"outputs": [],
242242
"source": [
243-
"images = convert_from_path(pdf_path, first_page=2, last_page=10) ## set the exact pages to parse"
243+
"images = convert_from_path(pdf_path, first_page=2, last_page=3) ## set the exact pages to parse"
244244
]
245245
},
246246
{
@@ -252,7 +252,7 @@
252252
},
253253
{
254254
"cell_type": "code",
255-
"execution_count": 10,
255+
"execution_count": 56,
256256
"metadata": {},
257257
"outputs": [],
258258
"source": [
@@ -274,17 +274,19 @@
274274
},
275275
{
276276
"cell_type": "code",
277-
"execution_count": 11,
277+
"execution_count": 57,
278278
"metadata": {},
279279
"outputs": [],
280280
"source": [
281-
"all_json_outputs = [] \n",
281+
"images_dir = Path(\"exported-jsons\") / pdf_name / \"images\"\n",
282+
"images_dir.mkdir(parents=True, exist_ok=True) # This creates the directory if it doesn't exist\n",
282283
"\n",
283-
"for i, image in enumerate(images):\n",
284-
" image_path = os.path.join(pdf_name, f\"output_image_{i}.png\")\n",
285-
" image.save(image_path, \"PNG\")\n",
284+
"all_json_outputs = []\n",
286285
"\n",
287-
" response_text = gemini_output(image_path, system_prompt, user_prompt)\n",
286+
"for i, image in enumerate(images):\n",
287+
" image_path = images_dir / f\"output_image_{i}.png\"\n",
288+
" image.save(image_path.as_posix(), \"PNG\") # Save the image to the specified path\n",
289+
" response_text = gemini_output(image_path.as_posix(), system_prompt, user_prompt)\n",
288290
"\n",
289291
" try:\n",
290292
" json_output = json.loads(response_text)\n",
@@ -303,23 +305,24 @@
303305
},
304306
{
305307
"cell_type": "code",
306-
"execution_count": 12,
308+
"execution_count": 58,
307309
"metadata": {},
308310
"outputs": [
309311
{
310312
"name": "stdout",
311313
"output_type": "stream",
312314
"text": [
313-
"Final JSON saved to: sku_list_2/sku_list_2.json.\n"
315+
"Final JSON saved to: exported-jsons/sku_list_2.json.\n"
314316
]
315317
}
316318
],
317319
"source": [
318-
"final_json_path = os.path.join(pdf_name, f\"{pdf_name}.json\")\n",
319-
"with open(final_json_path, 'w') as f:\n",
320+
"export_dir = Path(\"exported-jsons\")\n",
321+
"export_json_path = export_dir / f\"{pdf_name}.json\"\n",
322+
"with open(export_json_path, 'w') as f:\n",
320323
" json.dump(all_json_outputs, f)\n",
321324
"\n",
322-
"print(f\"Final JSON saved to: {final_json_path}.\")"
325+
"print(f\"Final JSON saved to: {export_json_path}.\")"
323326
]
324327
}
325328
],

0 commit comments

Comments
 (0)