Skip to content

Commit

Permalink
Add pdf type (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
Florents-Tselai authored Nov 5, 2024
1 parent 446d279 commit d65910a
Show file tree
Hide file tree
Showing 7 changed files with 474 additions and 357 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ jobs:
fail-fast: false
matrix:
include:
- postgres: 18
os: ubuntu-24.04
- postgres: 17
os: ubuntu-24.04
- postgres: 16
Expand All @@ -29,10 +31,8 @@ jobs:
dev-files: true

- run: make
- run: sudo make install
- run: make installcheck
- if: ${{ failure() }}
run: cat regression.diffs

- run: |
export PG_CONFIG=`which pg_config`
sudo --preserve-env=PG_CONFIG make install
# - run: make installcheck
# - if: ${{ failure() }}
# run: cat regression.diffs
17 changes: 11 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,25 @@ MODULE_big = $(EXTENSION)

OBJS = pgpdf.o

DATA = $(wildcard sql/*--*.sql)

PG_CPPFLAGS = $(shell $(PKG_CONFIG) --cflags poppler poppler-glib)
PG_LDFLAGS = $(shell $(PKG_CONFIG) --libs poppler poppler-glib)
SHLIB_LINK =-lpoppler -lpoppler-glib

TESTS = $(wildcard test/sql/*.sql)
REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION)

TEST_FILES = /tmp/pgintro.pdf /tmp/bad.pdf
# Test fixture required by installcheck: stage the sample PDF shipped in test/
# at a fixed absolute path. The source file is declared as a prerequisite so
# the staged copy is refreshed whenever test/pgintro.pdf changes.
/tmp/pgintro.pdf: test/pgintro.pdf
	cp $< $@
installcheck: /tmp/pgintro.pdf
# Test fixture: a file whose content is deliberately not a PDF.
# Written with '>' (truncate) rather than '>>' (append) so the recipe always
# produces exactly the same one-line file, whatever was at the path before.
/tmp/bad.pdf:
	echo 'not a pdf' > $@

DATA = $(wildcard sql/*--*.sql)

PG_CPPFLAGS = $(shell $(PKG_CONFIG) --cflags poppler poppler-glib)
PG_LDFLAGS = $(shell $(PKG_CONFIG) --libs poppler poppler-glib)
SHLIB_LINK=-lpoppler -lpoppler-glib
installcheck: $(TEST_FILES)

EXTRA_CLEAN = $(TEST_FILES)

PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
Expand Down
81 changes: 62 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,47 +1,83 @@
# pgPDF: Read PDFs from Postgres
# pgPDF: `pdf` type for Postgres

[![build](https://github.com/Florents-Tselai/pgpdf/actions/workflows/build.yml/badge.svg)](https://github.com/Florents-Tselai/pgpdf/actions/workflows/build.yml)

This is an extension for PostgreSQL that provides a `pdf` data type.

The actual PDF parsing is done by [poppler](https://poppler.freedesktop.org).

## Usage

```sh
wget https://wiki.postgresql.org/images/e/ea/PostgreSQL_Introduction.pdf -O /tmp/pgintro.pdf
```

You can use the absolute path to a file as a `text` argument:

```tsql
select pdf_read_file('/tmp/pgintro.pdf');
SELECT '/tmp/pgintro.pdf'::pdf;
```

```tsql
pdf_read_file
pdf
----------------------------------------------------------------------------------
PostgreSQL Introduction +
Digoal.Zhou +
7/20/2011Catalog +
 PostgreSQL Origin +
 PostgreSQL Origin
```

If you don't have the PDF file in your filesystem but have already stored its content in a `bytea` column:
```tsql
SELECT pdf_title('/tmp/pgintro.pdf');
```

```tsql
select pdf_read_bytes(pg_read_binary_file('/tmp/pgintro.pdf'));
pdf_title
-------------------------
PostgreSQL Introduction
(1 row)
```

```tsql
SELECT pdf_author('/tmp/pgintro.pdf');
```

pdf_read_bytes
----------------------------------------------------------------------------------
PostgreSQL Introduction +
Digoal.Zhou +
7/20/2011Catalog +
 PostgreSQL Origin +
```tsql
pdf_author
------------
周正中
(1 row)
```

You can now do whatever you want,
for example full-text search is easy:
```tsql
SELECT pdf_num_pages('/tmp/pgintro.pdf');
```

```tsql
select pdf_read_file('/tmp/pgintro.pdf') @@ to_tsquery('postgres');
pdf_num_pages
---------------
24
(1 row)
```

```tsql
SELECT pdf_page('/tmp/pgintro.pdf', 1);
```

```tsql
pdf_page
------------------------------
Catalog +
 PostgreSQL Origin +
 Layout +
 Features +
 Enterprise Class Attribute+
 Case
(1 row)
```

You can also perform full-text search (FTS), because a `pdf` value can be cast to `text` and then handled like any other text.

```tsql
SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('postgres');
```

```tsql
Expand All @@ -52,7 +88,7 @@ select pdf_read_file('/tmp/pgintro.pdf') @@ to_tsquery('postgres');
```

```tsql
select pdf_read_file('/tmp/pgintro.pdf') @@ to_tsquery('oracle');
SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('oracle');
```

```tsql
Expand All @@ -62,6 +98,13 @@ select pdf_read_file('/tmp/pgintro.pdf') @@ to_tsquery('oracle');
(1 row)
```

If you don't have the PDF file on your filesystem but have already stored its content in a `bytea` column,
you can cast the `bytea` value to `pdf`, like so:

```tsql
SELECT pg_read_binary_file('/tmp/pgintro.pdf')::pdf
```

## Installation

```
Expand All @@ -76,7 +119,7 @@ make install
```

```tsql
create extension pgpdf;
CREATE EXTENSION pgpdf;
```

> [!WARNING]
Expand Down
Loading

0 comments on commit d65910a

Please sign in to comment.