Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for BYTEA/BLOB #511

Merged
merged 5 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/pgduckdb_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ FilterOperationSwitch(const Datum &value, const duckdb::Value &constant, Oid typ
case TEXTOID:
case VARCHAROID:
return StringFilterOperation<OP>(value, constant, type_oid == BPCHAROID);
case BYTEAOID:
return StringFilterOperation<OP>(value, constant, false);
default:
throw duckdb::InvalidTypeException(
duckdb::string("(DuckDB/FilterOperationSwitch) Unsupported duckdb type: " + std::to_string(type_oid)));
Expand Down
49 changes: 49 additions & 0 deletions src/pgduckdb_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "duckdb/common/shared_ptr.hpp"
#include "duckdb/common/extra_type_info.hpp"
#include "duckdb/common/types/uuid.hpp"
#include "duckdb/common/types/blob.hpp"

#include "pgduckdb/pgduckdb_types.hpp"
#include "pgduckdb/pgduckdb_utils.hpp"
Expand Down Expand Up @@ -199,6 +200,17 @@ ConvertVarCharDatum(const duckdb::Value &value) {
return PointerGetDatum(result);
}

// Converts a DuckDB BLOB value into a freshly palloc'd Postgres bytea Datum.
//
// The payload is read with GetValueUnsafe<duckdb::string_t>() so that the raw
// bytes are copied unchanged. GetValue<duckdb::string>() must not be used
// here: it returns the blob's escaped text form (a single zero byte becomes
// the four characters `\x00`), which made every non-empty bytea come back
// hex-encoded a second time.
//
// NOTE(review): assumes `value` is non-NULL; NULL handling is presumably done
// by the callers before a Datum is requested -- confirm.
static Datum
ConvertBinaryDatum(const duckdb::Value &value) {
	// `raw` only borrows the bytes (or inlines short ones); it stays valid
	// for as long as `value` and `raw` itself are alive.
	auto raw = value.GetValueUnsafe<duckdb::string_t>();
	auto blob_len = raw.GetSize();

	// varlena layout: VARHDRSZ bytes of length header followed by the payload.
	bytea *result = (bytea *)palloc0(blob_len + VARHDRSZ);
	SET_VARSIZE(result, blob_len + VARHDRSZ);
	memcpy(VARDATA(result), raw.GetData(), blob_len);
	return PointerGetDatum(result);
}

inline Datum
ConvertDateDatum(const duckdb::Value &value) {
duckdb::date_t date = value.GetValue<duckdb::date_t>();
Expand Down Expand Up @@ -505,6 +517,19 @@ struct PostgresTypeTraits<VARCHAROID> {
}
};

// Type traits for Postgres BYTEA, the counterpart of DuckDB's BLOB.
template <>
struct PostgresTypeTraits<BYTEAOID> {
	// Storage attributes of the type.
	// NOTE(review): presumably these mirror pg_type's typlen/typbyval/typalign
	// entries for bytea -- confirm against the catalog.
	static constexpr int16_t typlen = -1; // variable-length
	static constexpr bool typbyval = false;
	static constexpr char typalign = 'i';

	// Produces the Datum for `val` by delegating to ConvertBinaryDatum.
	static inline Datum
	ToDatum(const duckdb::Value &val) {
		return ConvertBinaryDatum(val);
	}
};

template <int32_t OID>
struct PostgresOIDMapping {
static constexpr int32_t postgres_oid = OID;
Expand Down Expand Up @@ -545,6 +570,7 @@ using TimestampArray = PODArray<PostgresOIDMapping<TIMESTAMPOID>>;
using UUIDArray = PODArray<PostgresOIDMapping<UUIDOID>>;
using VarCharArray = PODArray<PostgresOIDMapping<VARCHAROID>>;
using NumericArray = PODArray<PostgresOIDMapping<NUMERICOID>>;
using ByteArray = PODArray<PostgresOIDMapping<BYTEAOID>>;

static idx_t
GetDuckDBListDimensionality(const duckdb::LogicalType &list_type, idx_t depth = 0) {
Expand Down Expand Up @@ -733,6 +759,10 @@ ConvertDuckToPostgresValue(TupleTableSlot *slot, duckdb::Value &value, idx_t col
slot->tts_values[col] = ConvertUUIDDatum(value);
break;
}
case BYTEAOID: {
slot->tts_values[col] = ConvertBinaryDatum(value);
break;
}
case BOOLARRAYOID: {
ConvertDuckToPostgresArray<BoolArray>(slot, value, col);
break;
Expand Down Expand Up @@ -784,6 +814,10 @@ ConvertDuckToPostgresValue(TupleTableSlot *slot, duckdb::Value &value, idx_t col
ConvertDuckToPostgresArray<UUIDArray>(slot, value, col);
break;
}
case BYTEAARRAYOID: {
ConvertDuckToPostgresArray<ByteArray>(slot, value, col);
break;
}
default:
elog(WARNING, "(PGDuckDB/ConvertDuckToPostgresValue) Unsuported pgduckdb type: %d", oid);
return false;
Expand Down Expand Up @@ -866,6 +900,9 @@ ConvertPostgresToBaseDuckColumnType(Form_pg_attribute &attribute) {
case REGCLASSOID:
case REGCLASSARRAYOID:
return duckdb::LogicalTypeId::UINTEGER;
case BYTEAOID:
case BYTEAARRAYOID:
return duckdb::LogicalTypeId::BLOB;
default:
return duckdb::LogicalType::USER("UnsupportedPostgresType (Oid=" + std::to_string(attribute->atttypid) + ")");
}
Expand Down Expand Up @@ -920,6 +957,8 @@ GetPostgresArrayDuckDBType(const duckdb::LogicalType &type) {
return NUMERICARRAYOID;
case duckdb::LogicalTypeId::UUID:
return UUIDARRAYOID;
case duckdb::LogicalTypeId::BLOB:
return BYTEAARRAYOID;
default: {
elog(WARNING, "(PGDuckDB/GetPostgresDuckDBType) Unsupported `LIST` subtype %d to Postgres type",
static_cast<uint8_t>(type.id()));
Expand Down Expand Up @@ -974,6 +1013,8 @@ GetPostgresDuckDBType(const duckdb::LogicalType &type) {
}
return GetPostgresArrayDuckDBType(*duck_type);
}
case duckdb::LogicalTypeId::BLOB:
return BYTEAOID;
default: {
elog(WARNING, "(PGDuckDB/GetPostgresDuckDBType) Could not convert DuckDB type: %s to Postgres type",
type.ToString().c_str());
Expand Down Expand Up @@ -1222,6 +1263,14 @@ ConvertPostgresToDuckValue(Oid attr_type, Datum value, duckdb::Vector &result, i
Append(result, duckdb_uuid, offset);
break;
}
case duckdb::LogicalTypeId::BLOB: {
const char *bytea_data = VARDATA_ANY(value);
size_t bytea_length = VARSIZE_ANY_EXHDR(value);
const duckdb::string_t s(bytea_data, bytea_length);
auto data = duckdb::FlatVector::GetData<duckdb::string_t>(result);
data[offset] = duckdb::StringVector::AddString(result, s);
break;
}
case duckdb::LogicalTypeId::LIST: {
// Convert Datum to ArrayType
auto array = DatumGetArrayTypeP(value);
Expand Down
13 changes: 13 additions & 0 deletions test/regression/expected/array_type_support.out
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,19 @@ SELECT * FROM varchar_array_2d;
{{some,strings},{NULL,last}}
(5 rows)

-- BYTEA (single dimension)
CREATE TABLE bytea_array_1d (a bytea[]);
INSERT INTO bytea_array_1d (a)
VALUES
(ARRAY[decode('01020304', 'hex'), decode('aabbccdd', 'hex')]),
(ARRAY[decode('11223344', 'hex'), decode('55667788', 'hex')]);
SELECT * FROM bytea_array_1d;
a
-------------------------------------------------------------------------------
{"\\x5c7830315c7830325c7830335c783034","\\x5c7841415c7842425c7843435c784444"}
{"\\x5c7831315c7832323344","\\x5566775c783838"}
(2 rows)

-- TIMESTAMP (two dimensions)
CREATE TABLE timestamp_array_2d(a TIMESTAMP[][]);
INSERT INTO timestamp_array_2d VALUES
Expand Down
22 changes: 22 additions & 0 deletions test/regression/expected/type_support.out
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,27 @@ SELECT * FROM json_tbl;
{}
(4 rows)

-- BLOB
CREATE TABLE blob_tbl(a bytea);
INSERT INTO blob_tbl SELECT CAST(a as bytea) FROM (VALUES
('\x'),
('\x110102030405060708090a0b0c0d0e0f'),
(''),
('\x00'),
('\x07'),
(NULL)
) t(a);
SELECT * from blob_tbl;
a
------------------------------------------------------------------------------------------------------------------------------------
\x
\x5c7831315c7830315c7830325c7830335c7830345c7830355c7830365c7830375c7830385c7830395c7830415c7830425c7830435c7830445c7830455c783046
\x
\x5c783030
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently the output of `SELECT * FROM blob_tbl` is an escaped version of the inserted rows. Is this fine, or do we need to change that?

Okay, this needs to change. All the non-empty bytea results are wrong. When using Postgres execution, it instead gives the following rows:

 \x
 \x110102030405060708090a0b0c0d0e0f
 \x
 \x00
 \x07

It seems like you're hex-encoding the string an additional time somewhere, because 5c783030 is \x00 in ASCII, i.e. including the \ and x characters. So I think that when converting from the DuckDB type to the PG type you're encoding the string representation of the value instead of the raw bytes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found it: converting the value to a string was turning the raw bytes into their escaped string form.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed `GetValue` to `GetValueUnsafe` to get the raw bytes as a `string_t`.

\x5c783037

(6 rows)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comparison test too? Something like:

SELECT * FROM blob_tbl WHERE a = '\x00';

-- REGCLASSOID
CREATE TABLE regclass_tbl (a REGCLASS);
INSERT INTO regclass_tbl VALUES (42), (3000000000);
Expand Down Expand Up @@ -337,4 +358,5 @@ DROP TABLE bigint_numeric;
DROP TABLE hugeint_numeric;
DROP TABLE uuid_tbl;
DROP TABLE json_tbl;
DROP TABLE blob_tbl;
DROP TABLE regclass_tbl;
9 changes: 9 additions & 0 deletions test/regression/sql/array_type_support.sql
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,15 @@ INSERT INTO varchar_array_2d VALUES
('{{"some","strings"},{NULL,"last"}}');
SELECT * FROM varchar_array_2d;

-- BYTEA (single dimension)
CREATE TABLE bytea_array_1d (a bytea[]);

INSERT INTO bytea_array_1d (a)
VALUES
(ARRAY[decode('01020304', 'hex'), decode('aabbccdd', 'hex')]),
(ARRAY[decode('11223344', 'hex'), decode('55667788', 'hex')]);
SELECT * FROM bytea_array_1d;

-- TIMESTAMP (two dimensions)
CREATE TABLE timestamp_array_2d(a TIMESTAMP[][]);
INSERT INTO timestamp_array_2d VALUES
Expand Down
13 changes: 13 additions & 0 deletions test/regression/sql/type_support.sql
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,18 @@ INSERT INTO json_tbl SELECT CAST(a as JSON) FROM (VALUES
) t(a);
SELECT * FROM json_tbl;

-- BLOB
CREATE TABLE blob_tbl(a bytea);
INSERT INTO blob_tbl SELECT CAST(a as bytea) FROM (VALUES
('\x'),
('\x110102030405060708090a0b0c0d0e0f'),
(''),
('\x00'),
('\x07'),
(NULL)
) t(a);
SELECT * from blob_tbl;

-- REGCLASSOID
CREATE TABLE regclass_tbl (a REGCLASS);
INSERT INTO regclass_tbl VALUES (42), (3000000000);
Expand All @@ -178,4 +190,5 @@ DROP TABLE bigint_numeric;
DROP TABLE hugeint_numeric;
DROP TABLE uuid_tbl;
DROP TABLE json_tbl;
DROP TABLE blob_tbl;
DROP TABLE regclass_tbl;