Add pdf type (#4)

Florents-Tselai · Nov 5, 2024 · d65910a · d65910a
1 parent 446d279
commit d65910a
Show file tree

Hide file tree

Showing 7 changed files with 474 additions and 357 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -7,6 +7,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - postgres: 18
+            os: ubuntu-24.04
           - postgres: 17
             os: ubuntu-24.04
           - postgres: 16
@@ -29,10 +31,8 @@ jobs:
           dev-files: true
 
       - run: make
+      - run: sudo make install
+      - run: make installcheck
+      - if: ${{ failure() }}
+        run: cat regression.diffs
 
-      - run: |
-          export PG_CONFIG=`which pg_config`
-          sudo --preserve-env=PG_CONFIG make install
-#      - run: make installcheck
-#      - if: ${{ failure() }}
-#        run: cat regression.diffs
diff --git a/Makefile b/Makefile
@@ -8,20 +8,25 @@ MODULE_big = $(EXTENSION)
 
 OBJS = pgpdf.o
 
+DATA = $(wildcard sql/*--*.sql)
+
+PG_CPPFLAGS = $(shell $(PKG_CONFIG) --cflags poppler poppler-glib)
+PG_LDFLAGS = $(shell $(PKG_CONFIG) --libs poppler poppler-glib)
+SHLIB_LINK =-lpoppler -lpoppler-glib
+
 TESTS = $(wildcard test/sql/*.sql)
 REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
 REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION)
 
+TEST_FILES = /tmp/pgintro.pdf /tmp/bad.pdf
 /tmp/pgintro.pdf:
 	cp test/pgintro.pdf $@
-installcheck: /tmp/pgintro.pdf
+/tmp/bad.pdf:
+	echo 'not a pdf' >> $@
 
-DATA = $(wildcard sql/*--*.sql)
-
-PG_CPPFLAGS = $(shell $(PKG_CONFIG) --cflags poppler poppler-glib)
-PG_LDFLAGS = $(shell $(PKG_CONFIG) --libs poppler poppler-glib)
-SHLIB_LINK=-lpoppler -lpoppler-glib
+installcheck: $(TEST_FILES)
 
+EXTRA_CLEAN = $(TEST_FILES)
 
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)

diff --git a/README.md b/README.md
@@ -1,47 +1,83 @@
-# pgPDF: Read PDFs from Postgres
+# pgPDF: `pdf` type for Postgres
 
 [![build](https://github.com/Florents-Tselai/pgpdf/actions/workflows/build.yml/badge.svg)](https://github.com/Florents-Tselai/pgpdf/actions/workflows/build.yml)
 
+This is an extension for PostgreSQL that provides a `pdf` data type.
+
+The actual PDF parsing is done by [poppler](https://poppler.freedesktop.org).
+
 ## Usage
 
 ```sh
 wget https://wiki.postgresql.org/images/e/ea/PostgreSQL_Introduction.pdf -O /tmp/pgintro.pdf
 ```
 
-You can use an absolute path to file as a `text` argument
-
 ```tsql
-select pdf_read_file('/tmp/pgintro.pdf');
+SELECT '/tmp/pgintro.pdf'::pdf;
 ```
+
 ```tsql
-                                  pdf_read_file                                   
+                                       pdf                                        
 ----------------------------------------------------------------------------------
  PostgreSQL Introduction                                                         +
  Digoal.Zhou                                                                     +
  7/20/2011Catalog                                                                +
-  PostgreSQL Origin                                                             +
+  PostgreSQL Origin 
 ```
 
-If you don't have the PDF file in your filesystem but have already stored its content in a `bytea` column:
+```tsql
+SELECT pdf_title('/tmp/pgintro.pdf');
+```
 
 ```tsql
-select pdf_read_bytes(pg_read_binary_file('/tmp/pgintro.pdf'));
+        pdf_title        
+-------------------------
+ PostgreSQL Introduction
+(1 row)
 ```
+
 ```tsql
+SELECT pdf_author('/tmp/pgintro.pdf');
+```
 
-                                  pdf_read_bytes                                  
-----------------------------------------------------------------------------------
- PostgreSQL Introduction                                                         +
- Digoal.Zhou                                                                     +
- 7/20/2011Catalog                                                                +
-  PostgreSQL Origin                                                             +
+```tsql
+ pdf_author 
+------------
+ 周正中
+(1 row)
 ```
 
-You can now do whatever you want,
-for example full-text search is easy:
+```tsql
+SELECT pdf_num_pages('/tmp/pgintro.pdf');
+```
 
 ```tsql
-select pdf_read_file('/tmp/pgintro.pdf') @@ to_tsquery('postgres');
+ pdf_num_pages 
+---------------
+            24
+(1 row)
+```
+
+```tsql
+SELECT pdf_page('/tmp/pgintro.pdf', 1);
+```
+
+```tsql
+           pdf_page           
+------------------------------
+ Catalog                     +
+  PostgreSQL Origin         +
+  Layout                    +
+  Features                  +
+  Enterprise Class Attribute+
+  Case
+(1 row)
+```
+
+You can also perform full-text search (FTS): casting a `pdf` value to `text` lets you work with its contents like normal text.
+
+```tsql
+SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('postgres');
 ```
 
 ```tsql
@@ -52,7 +88,7 @@ select pdf_read_file('/tmp/pgintro.pdf') @@ to_tsquery('postgres');
 ```
 
 ```tsql
-select pdf_read_file('/tmp/pgintro.pdf') @@ to_tsquery('oracle');
+SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('oracle');
 ```
 
 ```tsql
@@ -62,6 +98,13 @@ select pdf_read_file('/tmp/pgintro.pdf') @@ to_tsquery('oracle');
 (1 row)
 ```
 
+If you don't have the PDF file on your filesystem but have already stored its content in a `bytea` column,
+you can cast the `bytea` value to `pdf`, like so:
+
+```tsql
+SELECT pg_read_binary_file('/tmp/pgintro.pdf')::pdf;
+```
+
 ## Installation
 
 ```
@@ -76,7 +119,7 @@ make install
 ```
 
 ```tsql
-create extension pgpdf;
+CREATE EXTENSION pgpdf;
 ```
 
 > [!WARNING]