Skip to content

Commit

Permalink
Working tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Florents-Tselai committed Nov 5, 2024
1 parent 6f82436 commit ad1351c
Show file tree
Hide file tree
Showing 5 changed files with 288 additions and 366 deletions.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ TESTS = $(wildcard test/sql/*.sql)
REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION)

TEST_FILES = /tmp/pgintro.pdf /tmp/bad.pdf
/tmp/pgintro.pdf:
cp test/pgintro.pdf $@

/tmp/bad.pdf:
echo 'not a pdf' >> $@

installcheck: /tmp/pgintro.pdf /tmp/bad.pdf
installcheck: $(TEST_FILES)

EXTRA_CLEAN = /tmp/pgintro.pdf /tmp/bad.pdf
EXTRA_CLEAN = $(TEST_FILES)

PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
Expand Down
205 changes: 155 additions & 50 deletions pgpdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
#include "utils/builtins.h"
#include "utils/jsonb.h"
#include "poppler.h"
#include <sys/stat.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

#if PG_VERSION_NUM >= 160000
#include <glib.h>
Expand All @@ -24,58 +28,99 @@ typedef struct varlena pdftype;
#define DatumGetPdfPP(X) ((pdftype *) PG_DETOAST_DATUM_PACKED(X))
#define PdfPGetDatum(X) PointerGetDatum(X)

/*
 * Argument fetchers for the pdf type.
 *
 * PG_GETARG_PDF_P(n) goes through DatumGetPdfP (defined above this hunk;
 * presumably a full detoast -- confirm), PG_GETARG_PDF_PP(n) goes through
 * DatumGetPdfPP, which uses PG_DETOAST_DATUM_PACKED and may therefore yield
 * a value with a short (1-byte) varlena header.
 *
 * Only one definition of each macro is kept: the earlier pair expanded to
 * DatumGetUriP/DatumGetUriPP, which do not exist in this file.
 */
#define PG_GETARG_PDF_P(n) DatumGetPdfP(PG_GETARG_DATUM(n))
#define PG_GETARG_PDF_PP(n) DatumGetPdfPP(PG_GETARG_DATUM(n))
#define PG_RETURN_PDF_P(x) PG_RETURN_POINTER(x)


/*
 * Open the PDF stored at file_path with Poppler.
 *
 * file_path is prefixed with "file://" verbatim (no URI escaping is done).
 * Returns the parsed document, or NULL with *error set by Poppler on failure.
 * The caller owns the returned document and releases it with g_object_unref().
 */
static PopplerDocument* parse_pdf_file(const char* file_path, GError** error)
{
    gchar *uri = g_strdup_printf("file://%s", file_path);
    PopplerDocument *doc = poppler_document_new_from_file(uri, NULL, error);

    /* The URI string is only needed for the call above; free it so it does not leak. */
    g_free(uri);
    return doc;
}

/*
 * Parse an in-memory PDF held in a bytea value with Poppler.
 *
 * Returns the parsed document, or NULL with *error set by Poppler on failure.
 * The caller owns the returned document and releases it with g_object_unref().
 */
static PopplerDocument* parse_pdf_bytes(bytea* bytes, GError** error)
{
    GBytes *data = g_bytes_new(VARDATA_ANY(bytes), VARSIZE_ANY_EXHDR(bytes));
    PopplerDocument *doc = poppler_document_new_from_bytes(data, NULL, error);

    /* Drop our reference to the GBytes wrapper so it is not leaked. */
    g_bytes_unref(data);
    return doc;
}
/*
 * PG_GETARG_POPPLER_DOCUMENT(X)
 *
 * Fetch function argument X as a pdf value and parse its payload with
 * Poppler.  Evaluates to a PopplerDocument* owned by the caller, who must
 * release it with g_object_unref().  Raises ERROR when Poppler cannot parse
 * the data.
 *
 * Written as a GCC/Clang statement expression.  The locals carry a pgd_
 * prefix so they cannot shadow variables (doc, error, ...) at the call site.
 * The Poppler message is copied into palloc'd memory and the GError freed
 * *before* elog(ERROR), because elog(ERROR) does not return and anything
 * placed after it would never run.
 */
#define PG_GETARG_POPPLER_DOCUMENT(X) ({ \
    pdftype *pgd_pdf_ = PG_GETARG_PDF_P(X); \
    GError *pgd_err_ = NULL; \
    GBytes *pgd_bytes_ = g_bytes_new(VARDATA_ANY(pgd_pdf_), VARSIZE_ANY_EXHDR(pgd_pdf_)); \
    PopplerDocument *pgd_doc_ = poppler_document_new_from_bytes(pgd_bytes_, NULL, &pgd_err_); \
    g_bytes_unref(pgd_bytes_); \
    if (!pgd_doc_) { \
        char *pgd_msg_ = pstrdup(pgd_err_ ? pgd_err_->message : "unknown error"); \
        g_clear_error(&pgd_err_); \
        elog(ERROR, "Error parsing PDF document: %s", pgd_msg_); \
    } \
    pgd_doc_; \
})

PG_FUNCTION_INFO_V1(pdf_in);

Datum
pdf_in(PG_FUNCTION_ARGS)
{
char* s = PG_GETARG_CSTRING(0);
pdftype* vardata;
char* file_path = PG_GETARG_CSTRING(0);
pdftype* result;
PopplerDocument* doc = NULL;
GError* error = NULL;
int fd;
struct stat file_info;
ssize_t bytes_read;
GBytes* pdf_data;

// Open file and check if accessible
fd = open(file_path, O_RDONLY);
if (fd == -1)
{
elog(ERROR, "Could not open PDF file: %s", file_path);
}

doc = parse_pdf_file(s, &error);
if (!doc)
elog(ERROR, "Error parsing PDF document: %s\n", error->message);
// Get file size
if (fstat(fd, &file_info) == -1)
{
close(fd);
elog(ERROR, "Could not retrieve file information for: %s", file_path);
}

// Allocate pdftype struct with space for PDF data
int32 data_size = file_info.st_size;
result = (pdftype*)palloc(VARHDRSZ + data_size);
SET_VARSIZE(result, VARHDRSZ + data_size);

// Read file contents into pdftype->data
bytes_read = read(fd, VARDATA(result), data_size);
if (bytes_read != data_size)
{
close(fd);
elog(ERROR, "Could not read entire PDF file: %s", file_path);
}

// Close file
close(fd);

// Create GBytes from the PDF data for validation
pdf_data = g_bytes_new(VARDATA(result), data_size);

// Validate PDF using Poppler
doc = poppler_document_new_from_bytes(pdf_data, NULL, &error);
g_bytes_unref(pdf_data); // Free GBytes after use
if (!doc)
{
elog(ERROR, "Error parsing PDF document: %s", error->message);
pfree(result);
g_clear_error(&error);
PG_RETURN_NULL();
}
g_object_unref(doc);

vardata = (pdftype*)cstring_to_text(s);
PG_RETURN_PDF_P(vardata);
PG_RETURN_POINTER(result);
}

PG_FUNCTION_INFO_V1(pdf_out);

Datum
pdf_out(PG_FUNCTION_ARGS)
{
Datum arg = PG_GETARG_DATUM(0);

PG_RETURN_CSTRING(TextDatumGetCString(arg));
}

PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0);
StringInfo strinfo = makeStringInfo();

static void extract_text_from_pdf(PopplerDocument* document, StringInfo strinfo)
{
int num_pages = poppler_document_get_n_pages(document);
int num_pages = poppler_document_get_n_pages(doc);
for (int i = 0; i < num_pages; i++)
{
PopplerPage* page = poppler_document_get_page(document, i);
PopplerPage* page = poppler_document_get_page(doc, i);
if (!page)
{
elog(WARNING, "Failed to get page %d\n", i);
Expand All @@ -95,51 +140,111 @@ static void extract_text_from_pdf(PopplerDocument* document, StringInfo strinfo)

g_object_unref(page);
}

PG_RETURN_CSTRING(strinfo->data);
}

PG_FUNCTION_INFO_V1(pdf_read_file);

Datum pdf_read_file(PG_FUNCTION_ARGS)
PG_FUNCTION_INFO_V1(pdf_title);

Datum
pdf_title(PG_FUNCTION_ARGS)
{
char* pdf_file = text_to_cstring(PG_GETARG_TEXT_PP(0));
GError* error = NULL;
StringInfo strinfo = makeStringInfo();
PopplerDocument* document = parse_pdf_file(pdf_file, &error);
if (!document)
elog(ERROR, "Error opening PDF document: %s\n", error->message);
PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0);
PG_RETURN_TEXT_P(cstring_to_text(poppler_document_get_title(doc)));
}

extract_text_from_pdf(document, strinfo);
g_object_unref(document);
PG_FUNCTION_INFO_V1(pdf_author);

PG_RETURN_TEXT_P(cstring_to_text(strinfo->data));
Datum
pdf_author(PG_FUNCTION_ARGS)
{
PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0);
PG_RETURN_TEXT_P(cstring_to_text(poppler_document_get_author(doc)));
}

PG_FUNCTION_INFO_V1(pdf_read_bytes);
PG_FUNCTION_INFO_V1(pdf_num_pages);

Datum pdf_read_bytes(PG_FUNCTION_ARGS)
Datum
pdf_num_pages(PG_FUNCTION_ARGS)
{
bytea* bytes = PG_GETARG_BYTEA_PP(0);
GError* error = NULL;
StringInfo strinfo = makeStringInfo();
PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0);
PG_RETURN_INT32(poppler_document_get_n_pages(doc));
}

PopplerDocument* document = parse_pdf_bytes(bytes, &error);
if (!document)
elog(ERROR, "Error opening PDF document: %s\n", error->message);
PG_FUNCTION_INFO_V1(pdf_page);

Datum
pdf_page(PG_FUNCTION_ARGS)
{
PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0);
int32 i = PG_GETARG_INT32(1);
PopplerPage* page = poppler_document_get_page(doc, i);
PG_RETURN_TEXT_P(cstring_to_text(poppler_page_get_text(page)));
}

extract_text_from_pdf(document, strinfo);
g_object_unref(document);

PG_RETURN_TEXT_P(cstring_to_text(strinfo->data));
PG_FUNCTION_INFO_V1(pdf_from_bytea);

/*
 * pdf_from_bytea(bytea) -> pdf
 *
 * Copies the payload of a bytea value into a freshly allocated pdf varlena.
 * The bytes are copied as-is; no PDF validation happens here.
 */
Datum
pdf_from_bytea(PG_FUNCTION_ARGS)
{
    bytea *src = PG_GETARG_BYTEA_P(0);
    int32 payload_len = VARSIZE_ANY_EXHDR(src);
    int32 total_len = VARHDRSZ + payload_len;
    pdftype *pdf = (pdftype *) palloc(total_len);

    /* Stamp the header, then copy the payload right behind it. */
    SET_VARSIZE(pdf, total_len);
    memcpy(VARDATA(pdf), VARDATA_ANY(src), payload_len);

    PG_RETURN_POINTER(pdf);
}


PG_FUNCTION_INFO_V1(pdf_to_bytea);

Datum pdf_to_bytea(PG_FUNCTION_ARGS)
/*
 * pdf_to_bytea(pdf) -> bytea
 *
 * Copies the raw bytes of a pdf value into a freshly allocated bytea.
 */
Datum
pdf_to_bytea(PG_FUNCTION_ARGS)
{
    pdftype *src = PG_GETARG_PDF_P(0);
    int32 payload_len = VARSIZE_ANY_EXHDR(src);
    int32 total_len = VARHDRSZ + payload_len;
    bytea *out = (bytea *) palloc(total_len);

    /* Stamp the header, then copy the payload right behind it. */
    SET_VARSIZE(out, total_len);
    memcpy(VARDATA(out), VARDATA(src), payload_len);

    PG_RETURN_BYTEA_P(out);
}

PG_FUNCTION_INFO_V1(bytea_to_pdf);

Datum bytea_to_pdf(PG_FUNCTION_ARGS)

Datum
bytea_to_pdf(PG_FUNCTION_ARGS)
{
bytea *bytes = PG_GETARG_BYTEA_PP(0);
pdftype *result;
GError *error = NULL;

GBytes *pdf_data = g_bytes_new(VARDATA_ANY(bytes), VARSIZE_ANY_EXHDR(bytes));

PopplerDocument *doc = poppler_document_new_from_bytes(pdf_data, NULL, &error);
if (!doc) {
g_bytes_unref(pdf_data);
elog(ERROR, "Error parsing PDF document: %s", error->message);
}

result = (pdftype*) palloc(VARHDRSZ + VARSIZE_ANY_EXHDR(bytes));
SET_VARSIZE(result, VARHDRSZ + VARSIZE_ANY_EXHDR(bytes));

memcpy(VARDATA(result), g_bytes_get_data(pdf_data, NULL), VARSIZE_ANY_EXHDR(bytes));

g_object_unref(doc);
g_bytes_unref(pdf_data);

PG_RETURN_POINTER(result);
}
75 changes: 64 additions & 11 deletions sql/pgpdf--0.1.0.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,18 @@ CREATE FUNCTION pdf_in(cstring) RETURNS pdf
IMMUTABLE
STRICT
LANGUAGE C
AS 'MODULE_PATHNAME';
AS
'MODULE_PATHNAME';

CREATE FUNCTION pdf_out(pdf) RETURNS cstring
IMMUTABLE
STRICT
LANGUAGE C
AS 'MODULE_PATHNAME';
AS
'MODULE_PATHNAME';

CREATE TYPE pdf (
CREATE TYPE pdf
(
INTERNALLENGTH = -1,
INPUT = pdf_in,
OUTPUT = pdf_out
Expand All @@ -21,26 +24,76 @@ CREATE TYPE pdf (
CREATE CAST (pdf AS text) WITH INOUT AS ASSIGNMENT;
CREATE CAST (text AS pdf) WITH INOUT AS ASSIGNMENT;


/* API */

-- pdf_title(pdf): title from the document metadata, as reported by
-- Poppler's poppler_document_get_title().
CREATE FUNCTION pdf_title(pdf) RETURNS TEXT
IMMUTABLE
STRICT
LANGUAGE C
AS
'MODULE_PATHNAME';

-- pdf_author(pdf): author from the document metadata, as reported by
-- Poppler's poppler_document_get_author().
CREATE FUNCTION pdf_author(pdf) RETURNS TEXT
IMMUTABLE
STRICT
LANGUAGE C
AS
'MODULE_PATHNAME';

-- pdf_num_pages(pdf): page count, as reported by
-- Poppler's poppler_document_get_n_pages().
CREATE FUNCTION pdf_num_pages(pdf) RETURNS INTEGER
IMMUTABLE
STRICT
LANGUAGE C
AS
'MODULE_PATHNAME';

-- pdf_page(pdf, integer): text of a single page. The integer is passed
-- unchanged to poppler_document_get_page() as the page index
-- (presumably 0-based -- confirm against the Poppler documentation).
CREATE FUNCTION pdf_page(pdf, integer) RETURNS TEXT
IMMUTABLE
STRICT
LANGUAGE C
AS
'MODULE_PATHNAME';





CREATE FUNCTION bytea_to_pdf(bytea) RETURNS pdf
LANGUAGE C
IMMUTABLE
STRICT
AS 'MODULE_PATHNAME';
AS
'MODULE_PATHNAME';


CREATE FUNCTION pdf_to_bytea(pdf) RETURNS bytea
LANGUAGE C
IMMUTABLE
STRICT
AS 'MODULE_PATHNAME';
AS
'MODULE_PATHNAME';

CREATE CAST (bytea AS pdf) WITH FUNCTION bytea_to_pdf(bytea) AS ASSIGNMENT;
CREATE CAST (pdf AS bytea) WITH FUNCTION pdf_to_bytea(pdf) AS ASSIGNMENT;

CREATE FUNCTION pdf_read_file(text) returns text AS 'MODULE_PATHNAME',
'pdf_read_file'
LANGUAGE C;

CREATE FUNCTION pdf_read_bytes(bytea) returns text AS 'MODULE_PATHNAME',
'pdf_read_bytes'
LANGUAGE C;
--------------------
--
-- CREATE FUNCTION pdf_read_file(text) returns text AS
-- 'MODULE_PATHNAME',
-- 'pdf_read_file'
-- LANGUAGE C;
--
-- CREATE FUNCTION pdf_read_bytes(bytea) returns text AS
-- 'MODULE_PATHNAME',
-- 'pdf_read_bytes'
-- LANGUAGE C;
--
--
-- CREATE FUNCTION pdf_get_num_pages(pdf) RETURNS integer
-- LANGUAGE C
-- IMMUTABLE
-- STRICT
-- AS
-- 'MODULE_PATHNAME';
Loading

0 comments on commit ad1351c

Please sign in to comment.