|
18 | 18 | },
|
19 | 19 | {
|
20 | 20 | "cell_type": "code",
|
21 |
| - "execution_count": 1, |
| 21 | + "execution_count": 47, |
22 | 22 | "metadata": {
|
23 | 23 | "id": "t1O3Je_ENtpf"
|
24 | 24 | },
|
25 | 25 | "outputs": [],
|
26 | 26 | "source": [
|
27 | 27 | "from pdf2image import convert_from_path\n",
|
| 28 | + "from pathlib import Path\n", |
28 | 29 | "import google.generativeai as genai\n",
|
29 | 30 | "import json\n",
|
30 | 31 | "import os"
|
|
41 | 42 | },
|
42 | 43 | {
|
43 | 44 | "cell_type": "code",
|
44 |
| - "execution_count": 2, |
| 45 | + "execution_count": 48, |
45 | 46 | "metadata": {
|
46 | 47 | "id": "rHXNl6qnN3qO"
|
47 | 48 | },
|
48 | 49 | "outputs": [],
|
49 | 50 | "source": [
|
50 | 51 | "# Used to securely store your API key\n",
|
51 |
| - "GOOGLE_API_KEY = \"AIzaSyDxxX0kIspXTg34tXfQfoTO0istS9RbrQg\" # replace 'your-api-key-here' with your actual API key\n", |
| 52 | + "GOOGLE_API_KEY = \"your-api-key-here\" # replace 'your-api-key-here' with your actual API key\n", |
52 | 53 | "\n",
|
53 | 54 | "genai.configure(api_key=GOOGLE_API_KEY)"
|
54 | 55 | ]
|
|
64 | 65 | },
|
65 | 66 | {
|
66 | 67 | "cell_type": "code",
|
67 |
| - "execution_count": 3, |
| 68 | + "execution_count": 49, |
68 | 69 | "metadata": {
|
69 | 70 | "colab": {
|
70 | 71 | "base_uri": "https://localhost:8080/",
|
|
102 | 103 | },
|
103 | 104 | {
|
104 | 105 | "cell_type": "code",
|
105 |
| - "execution_count": 4, |
| 106 | + "execution_count": 50, |
106 | 107 | "metadata": {
|
107 | 108 | "id": "sErUXoOIOMKs"
|
108 | 109 | },
|
|
139 | 140 | },
|
140 | 141 | {
|
141 | 142 | "cell_type": "code",
|
142 |
| - "execution_count": 5, |
| 143 | + "execution_count": 51, |
143 | 144 | "metadata": {
|
144 | 145 | "id": "kDhL4GY1OuW_"
|
145 | 146 | },
|
|
161 | 162 | },
|
162 | 163 | {
|
163 | 164 | "cell_type": "code",
|
164 |
| - "execution_count": 6, |
| 165 | + "execution_count": 52, |
165 | 166 | "metadata": {
|
166 | 167 | "id": "Eh_FdZ_nO2Xx"
|
167 | 168 | },
|
168 | 169 | "outputs": [],
|
169 | 170 | "source": [
|
170 |
| - "from pathlib import Path\n", |
171 |
| - "\n", |
172 | 171 | "def image_format(image_path):\n",
|
173 | 172 | " img = Path(image_path)\n",
|
174 | 173 | "\n",
|
|
195 | 194 | },
|
196 | 195 | {
|
197 | 196 | "cell_type": "code",
|
198 |
| - "execution_count": 7, |
| 197 | + "execution_count": 53, |
199 | 198 | "metadata": {
|
200 | 199 | "id": "gkDwnC9NQKSd"
|
201 | 200 | },
|
|
218 | 217 | },
|
219 | 218 | {
|
220 | 219 | "cell_type": "code",
|
221 |
| - "execution_count": 8, |
| 220 | + "execution_count": 54, |
222 | 221 | "metadata": {},
|
223 | 222 | "outputs": [],
|
224 | 223 | "source": [
|
225 |
| - "pdf_path = \"pdfs/boundaried/sku_list_2.pdf\" ## replace PDF to parse\n", |
| 224 | + "pdf_path = \"pdfs/boundaried/sku_list_2.pdf\" # replace PDF to parse\n", |
226 | 225 | "pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]\n",
|
227 |
| - "os.makedirs(pdf_name, exist_ok=True)" |
| 226 | + "export_path = os.path.join(\"exported-jsons\", pdf_name)\n", |
| 227 | + "os.makedirs(export_path, exist_ok=True)" |
228 | 228 | ]
|
229 | 229 | },
|
230 | 230 | {
|
|
236 | 236 | },
|
237 | 237 | {
|
238 | 238 | "cell_type": "code",
|
239 |
| - "execution_count": 9, |
| 239 | + "execution_count": 55, |
240 | 240 | "metadata": {},
|
241 | 241 | "outputs": [],
|
242 | 242 | "source": [
|
243 |
| - "images = convert_from_path(pdf_path, first_page=2, last_page=10) ## set the exact pages to parse" |
| 243 | + "images = convert_from_path(pdf_path, first_page=2, last_page=3) ## set the exact pages to parse" |
244 | 244 | ]
|
245 | 245 | },
|
246 | 246 | {
|
|
252 | 252 | },
|
253 | 253 | {
|
254 | 254 | "cell_type": "code",
|
255 |
| - "execution_count": 10, |
| 255 | + "execution_count": 56, |
256 | 256 | "metadata": {},
|
257 | 257 | "outputs": [],
|
258 | 258 | "source": [
|
|
274 | 274 | },
|
275 | 275 | {
|
276 | 276 | "cell_type": "code",
|
277 |
| - "execution_count": 11, |
| 277 | + "execution_count": 57, |
278 | 278 | "metadata": {},
|
279 | 279 | "outputs": [],
|
280 | 280 | "source": [
|
281 |
| - "all_json_outputs = [] \n", |
| 281 | + "images_dir = Path(\"exported-jsons\") / pdf_name / \"images\"\n", |
| 282 | + "images_dir.mkdir(parents=True, exist_ok=True) # This creates the directory if it doesn't exist\n", |
282 | 283 | "\n",
|
283 |
| - "for i, image in enumerate(images):\n", |
284 |
| - " image_path = os.path.join(pdf_name, f\"output_image_{i}.png\")\n", |
285 |
| - " image.save(image_path, \"PNG\")\n", |
| 284 | + "all_json_outputs = []\n", |
286 | 285 | "\n",
|
287 |
| - " response_text = gemini_output(image_path, system_prompt, user_prompt)\n", |
| 286 | + "for i, image in enumerate(images):\n", |
| 287 | + " image_path = images_dir / f\"output_image_{i}.png\"\n", |
| 288 | + " image.save(image_path.as_posix(), \"PNG\") # Save the image to the specified path\n", |
| 289 | + " response_text = gemini_output(image_path.as_posix(), system_prompt, user_prompt)\n", |
288 | 290 | "\n",
|
289 | 291 | " try:\n",
|
290 | 292 | " json_output = json.loads(response_text)\n",
|
|
303 | 305 | },
|
304 | 306 | {
|
305 | 307 | "cell_type": "code",
|
306 |
| - "execution_count": 12, |
| 308 | + "execution_count": 58, |
307 | 309 | "metadata": {},
|
308 | 310 | "outputs": [
|
309 | 311 | {
|
310 | 312 | "name": "stdout",
|
311 | 313 | "output_type": "stream",
|
312 | 314 | "text": [
|
313 |
| - "Final JSON saved to: sku_list_2/sku_list_2.json.\n" |
| 315 | + "Final JSON saved to: exported-jsons/sku_list_2.json.\n" |
314 | 316 | ]
|
315 | 317 | }
|
316 | 318 | ],
|
317 | 319 | "source": [
|
318 |
| - "final_json_path = os.path.join(pdf_name, f\"{pdf_name}.json\")\n", |
319 |
| - "with open(final_json_path, 'w') as f:\n", |
| 320 | + "export_dir = Path(\"exported-jsons\")\n", |
| 321 | + "export_json_path = export_dir / f\"{pdf_name}.json\"\n", |
| 322 | + "with open(export_json_path, 'w') as f:\n", |
320 | 323 | " json.dump(all_json_outputs, f)\n",
|
321 | 324 | "\n",
|
322 |
| - "print(f\"Final JSON saved to: {final_json_path}.\")" |
| 325 | + "print(f\"Final JSON saved to: {export_json_path}.\")" |
323 | 326 | ]
|
324 | 327 | }
|
325 | 328 | ],
|
|
0 commit comments