Skip to content

Commit

Permalink
pdf_read_bytes(bytea) (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
Florents-Tselai authored Sep 23, 2024
1 parent 02d88c4 commit cfca484
Show file tree
Hide file tree
Showing 5 changed files with 365 additions and 28 deletions.
31 changes: 30 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,37 @@

## Usage

```sh
wget https://wiki.postgresql.org/images/e/ea/PostgreSQL_Introduction.pdf -O /tmp/pgintro.pdf
```

You can pass the absolute path of a PDF file as a `text` argument:

```tsql
select pdf_read_file('/tmp/pgintro.pdf');
```
```tsql
pdf_read_file
----------------------------------------------------------------------------------
PostgreSQL Introduction +
Digoal.Zhou +
7/20/2011Catalog +
 PostgreSQL Origin +
```

If you don't have the PDF file on your filesystem but have already stored its contents in a `bytea` column, pass the bytes to `pdf_read_bytes` instead:

```tsql
select pdf_read_file('/abs/path/to/test/pgintro.pdf');
select pdf_read_bytes(pg_read_binary_file('/tmp/pgintro.pdf'));
```
```tsql
pdf_read_bytes
----------------------------------------------------------------------------------
PostgreSQL Introduction +
Digoal.Zhou +
7/20/2011Catalog +
 PostgreSQL Origin +
```

## Installation
Expand Down
71 changes: 44 additions & 27 deletions pgpdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "poppler/glib/poppler.h"

#if PG_VERSION_NUM >= 160000

#include <glib.h>
#include "varatt.h"

#endif
Expand All @@ -18,38 +18,16 @@

PG_MODULE_MAGIC;

#include <poppler/glib/poppler.h>
#include <glib.h>
#include <stdio.h>


PG_FUNCTION_INFO_V1(pdf_read_file);

Datum pdf_read_file(PG_FUNCTION_ARGS) {
char *pdf_file = text_to_cstring(PG_GETARG_TEXT_PP(0));
GError *error = NULL;
PopplerDocument *document;
gchar *page_text;
int num_pages; /* Get the number of pages in the PDF */
StringInfo strinfo = makeStringInfo();

/* Open the PDF document */
document = poppler_document_new_from_file(g_strdup_printf("file://%s", pdf_file), NULL, &error);
if (!document)
elog(ERROR, "Error opening PDF document: %s\n", error->message);


num_pages = poppler_document_get_n_pages(document);

/* Iterate through pages and extract text */
static void extract_text_from_pdf(PopplerDocument *document, StringInfo strinfo) {
int num_pages = poppler_document_get_n_pages(document);
for (int i = 0; i < num_pages; i++) {
PopplerPage *page = poppler_document_get_page(document, i);
if (!page) {
elog(WARNING, "Failed to get page %d\n", i);
continue;
}

page_text = poppler_page_get_text(page);
gchar *page_text = poppler_page_get_text(page);
if (page_text) {
appendStringInfo(strinfo, "%s", page_text);
g_free(page_text);
Expand All @@ -59,8 +37,47 @@ Datum pdf_read_file(PG_FUNCTION_ARGS) {

g_object_unref(page);
}
}

/*
 * Open the PDF located at file_path.
 *
 * file_path must be an absolute filesystem path; it is converted to a
 * properly escaped file:// URI before being handed to Poppler.
 *
 * Returns a new PopplerDocument that the caller must release with
 * g_object_unref(), or NULL on failure, in which case *error is set by
 * GLib (URI conversion) or Poppler (document loading).
 */
static PopplerDocument *open_pdf_file(const char *file_path, GError **error) {
    PopplerDocument *document;
    /*
     * g_filename_to_uri() percent-escapes characters such as spaces, '#'
     * and '%' that a plain "file://" + path concatenation would pass
     * through unescaped.
     */
    gchar *uri = g_filename_to_uri(file_path, NULL, error);

    if (!uri) {
        return NULL;
    }

    document = poppler_document_new_from_file(uri, NULL, error);

    /* The URI string is ours to free whether or not loading succeeded. */
    g_free(uri);

    return document;
}

/*
 * Open a PDF whose raw content is held in a bytea value.
 *
 * The payload is copied into a GBytes, so the returned document does not
 * point into the bytea's memory.
 *
 * Returns a new PopplerDocument that the caller must release with
 * g_object_unref(), or NULL on failure with *error set by Poppler.
 */
static PopplerDocument *open_pdf_bytes(bytea *bytes, GError **error) {
    GBytes *pdf_data = g_bytes_new(VARDATA_ANY(bytes), VARSIZE_ANY_EXHDR(bytes));
    PopplerDocument *document = poppler_document_new_from_bytes(pdf_data, NULL, error);

    /*
     * Release the reference created by g_bytes_new(). Per the Poppler GLib
     * API the bytes argument is not transferred, so without this unref the
     * copied buffer would leak on every call.
     */
    g_bytes_unref(pdf_data);

    return document;
}

PG_FUNCTION_INFO_V1(pdf_read_file);

Datum pdf_read_file(PG_FUNCTION_ARGS) {
char *pdf_file = text_to_cstring(PG_GETARG_TEXT_PP(0));
GError *error = NULL;
StringInfo strinfo = makeStringInfo();

PopplerDocument *document = open_pdf_file(pdf_file, &error);
if (!document) {
elog(ERROR, "Error opening PDF document: %s\n", error->message);
}

extract_text_from_pdf(document, strinfo);
g_object_unref(document);

PG_RETURN_TEXT_P(cstring_to_text(strinfo->data));
}

PG_FUNCTION_INFO_V1(pdf_read_bytes);

/*
 * pdf_read_bytes(bytea) -> text
 *
 * Treats argument 0 as the raw content of a PDF document and returns the
 * text collected by extract_text_from_pdf().
 *
 * Raises an ERROR if the document cannot be opened.
 */
Datum pdf_read_bytes(PG_FUNCTION_ARGS) {
bytea *bytes = PG_GETARG_BYTEA_PP(0);
GError *error = NULL;
StringInfo strinfo = makeStringInfo();

/* Build a Poppler document from the bytea payload */
PopplerDocument *document = open_pdf_bytes(bytes, &error);
if (!document) {
/* NOTE(review): elog(ERROR) does not return, so the GError is never freed here; */
/* error is also assumed to be non-NULL whenever opening fails -- confirm. */
elog(ERROR, "Error opening PDF document: %s\n", error->message);
}

/* Append the text of the document's pages to strinfo */
extract_text_from_pdf(document, strinfo);
/* Close the PDF document */
g_object_unref(document);

/* Hand the accumulated text back as a PostgreSQL text value */
PG_RETURN_TEXT_P(cstring_to_text(strinfo->data));
Expand Down
4 changes: 4 additions & 0 deletions sql/pgpdf--0.1.0.sql
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
-- pdf_read_file(text): extract the text of the PDF at the given path.
-- STRICT: the C implementation reads argument 0 without a NULL check, so
-- let the executor return NULL for NULL input instead of calling it.
CREATE FUNCTION pdf_read_file(text) returns text AS 'MODULE_PATHNAME',
'pdf_read_file'
LANGUAGE C STRICT;

-- pdf_read_bytes(bytea): extract the text of a PDF supplied as raw bytes.
-- STRICT: the C implementation reads argument 0 without a NULL check, so
-- let the executor return NULL for NULL input instead of calling it.
CREATE FUNCTION pdf_read_bytes(bytea) returns text AS 'MODULE_PATHNAME',
'pdf_read_bytes'
LANGUAGE C STRICT;
Loading

0 comments on commit cfca484

Please sign in to comment.