32
32
33
33
namespace iceberg ::avro {
34
34
35
// Forward declaration of the function under test. Its definition is not in
// this test file (NOTE(review): presumably in the Avro schema utility sources
// -- confirm, and prefer including a header if one exports it).
bool validAvroName(const std::string& name);
37
+
35
38
namespace {
36
39
37
40
void CheckCustomLogicalType (const ::avro::NodePtr& node, const std::string& type_name) {
@@ -47,8 +50,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id,
47
50
ASSERT_EQ (attrs.getAttribute (key), std::make_optional (std::to_string (field_id)));
48
51
}
49
52
53
+ // Helper function to check if a custom attribute exists for a field name preservation
54
+ void CheckIcebergFieldName (const ::avro::NodePtr& node, size_t index,
55
+ const std::string& original_name) {
56
+ ASSERT_LT (index, node->customAttributes ());
57
+ const auto & attrs = node->customAttributesAt (index);
58
+ ASSERT_EQ (attrs.getAttribute (" iceberg-field-name" ), std::make_optional (original_name));
59
+ }
60
+
50
61
} // namespace
51
62
63
+ TEST (ValidAvroNameTest, ValidNames) {
64
+ // Valid field names should return true
65
+ EXPECT_TRUE (validAvroName (" valid_field" ));
66
+ EXPECT_TRUE (validAvroName (" field123" ));
67
+ EXPECT_TRUE (validAvroName (" _private" ));
68
+ EXPECT_TRUE (validAvroName (" CamelCase" ));
69
+ EXPECT_TRUE (validAvroName (" field_with_underscores" ));
70
+ }
71
+
72
+ TEST (ValidAvroNameTest, InvalidNames) {
73
+ // Names starting with numbers should return false
74
+ EXPECT_FALSE (validAvroName (" 123field" ));
75
+ EXPECT_FALSE (validAvroName (" 0value" ));
76
+
77
+ // Names with special characters should return false
78
+ EXPECT_FALSE (validAvroName (" field-name" ));
79
+ EXPECT_FALSE (validAvroName (" field.name" ));
80
+ EXPECT_FALSE (validAvroName (" field name" ));
81
+ EXPECT_FALSE (validAvroName (" field@name" ));
82
+ EXPECT_FALSE (validAvroName (" field#name" ));
83
+ }
84
+
85
+ TEST (ValidAvroNameTest, EmptyName) {
86
+ // Empty name should throw an exception
87
+ EXPECT_THROW (validAvroName (" " ), std::runtime_error);
88
+ }
89
+
90
+ TEST (SanitizeFieldNameTest, ValidFieldNames) {
91
+ // Valid field names should remain unchanged
92
+ EXPECT_EQ (SanitizeFieldName (" valid_field" ), " valid_field" );
93
+ EXPECT_EQ (SanitizeFieldName (" field123" ), " field123" );
94
+ EXPECT_EQ (SanitizeFieldName (" _private" ), " _private" );
95
+ EXPECT_EQ (SanitizeFieldName (" CamelCase" ), " CamelCase" );
96
+ EXPECT_EQ (SanitizeFieldName (" field_with_underscores" ), " field_with_underscores" );
97
+ }
98
+
99
+ TEST (SanitizeFieldNameTest, InvalidFieldNames) {
100
+ // Field names starting with numbers should be prefixed with underscore
101
+ EXPECT_EQ (SanitizeFieldName (" 123field" ), " _123field" );
102
+ EXPECT_EQ (SanitizeFieldName (" 0value" ), " _0value" );
103
+
104
+ // Field names with special characters should be encoded with hex values
105
+ EXPECT_EQ (SanitizeFieldName (" field-name" ), " field_x2Dname" );
106
+ EXPECT_EQ (SanitizeFieldName (" field.name" ), " field_x2Ename" );
107
+ EXPECT_EQ (SanitizeFieldName (" field name" ), " field_x20name" );
108
+ EXPECT_EQ (SanitizeFieldName (" field@name" ), " field_x40name" );
109
+ EXPECT_EQ (SanitizeFieldName (" field#name" ), " field_x23name" );
110
+
111
+ // Complex field names with multiple issues
112
+ EXPECT_EQ (SanitizeFieldName (" 1field-with.special@chars" ),
113
+ " _1field_x2Dwith_x2Especial_x40chars" );
114
+ EXPECT_EQ (SanitizeFieldName (" user-email" ), " user_x2Demail" );
115
+ }
116
+
117
+ TEST (SanitizeFieldNameTest, EdgeCases) {
118
+ // Empty field name
119
+ EXPECT_EQ (SanitizeFieldName (" " ), " _x0" );
120
+
121
+ // Field name with only special characters
122
+ EXPECT_EQ (SanitizeFieldName (" @#$" ), " _x40_x23_x24" );
123
+
124
+ // Field name starting with special character
125
+ EXPECT_EQ (SanitizeFieldName (" -field" ), " _x2Dfield" );
126
+ EXPECT_EQ (SanitizeFieldName (" .field" ), " _x2Efield" );
127
+ }
128
+
52
129
TEST (ToAvroNodeVisitorTest, BooleanType) {
53
130
::avro::NodePtr node;
54
131
EXPECT_THAT (ToAvroNodeVisitor{}.Visit (BooleanType{}, &node), IsOk ());
@@ -181,6 +258,69 @@ TEST(ToAvroNodeVisitorTest, StructType) {
181
258
EXPECT_EQ (node->leafAt (1 )->leafAt (1 )->type (), ::avro::AVRO_INT);
182
259
}
183
260
261
+ TEST (ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
262
+ // Test struct with field names that require sanitization
263
+ StructType struct_type{
264
+ {SchemaField{/* field_id=*/ 1 , " user-name" , iceberg::string (),
265
+ /* optional=*/ false },
266
+ SchemaField{/* field_id=*/ 2 , " email.address" , iceberg::string (),
267
+ /* optional=*/ true },
268
+ SchemaField{/* field_id=*/ 3 , " 123field" , iceberg::int32 (),
269
+ /* optional=*/ false },
270
+ SchemaField{/* field_id=*/ 4 , " field with spaces" , iceberg::boolean (),
271
+ /* optional=*/ true }}};
272
+
273
+ ::avro::NodePtr node;
274
+ EXPECT_THAT (ToAvroNodeVisitor{}.Visit (struct_type, &node), IsOk ());
275
+ EXPECT_EQ (node->type (), ::avro::AVRO_RECORD);
276
+
277
+ // Check that field names are sanitized
278
+ ASSERT_EQ (node->names (), 4 );
279
+ EXPECT_EQ (node->nameAt (0 ), " user_x2Dname" ); // "user-name" -> "user_x2Dname"
280
+ EXPECT_EQ (node->nameAt (1 ),
281
+ " email_x2Eaddress" ); // "email.address" -> "email_x2Eaddress"
282
+ EXPECT_EQ (node->nameAt (2 ), " _123field" ); // "123field" -> "_123field"
283
+ EXPECT_EQ (
284
+ node->nameAt (3 ),
285
+ " field_x20with_x20spaces" ); // "field with spaces" -> "field_x20with_x20spaces"
286
+
287
+ // Check that field IDs are correctly applied
288
+ // Each field has 1 custom attribute: field-id
289
+ ASSERT_EQ (node->customAttributes (), 4 );
290
+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 0 , /* field_id=*/ 1 ));
291
+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 1 , /* field_id=*/ 2 ));
292
+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 2 , /* field_id=*/ 3 ));
293
+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 3 , /* field_id=*/ 4 ));
294
+ }
295
+
296
+ TEST (ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) {
297
+ // Test struct with field names that don't require sanitization
298
+ StructType struct_type{{SchemaField{/* field_id=*/ 1 , " valid_field" , iceberg::string (),
299
+ /* optional=*/ false },
300
+ SchemaField{/* field_id=*/ 2 , " AnotherField" , iceberg::int32 (),
301
+ /* optional=*/ true }}};
302
+
303
+ ::avro::NodePtr node;
304
+ EXPECT_THAT (ToAvroNodeVisitor{}.Visit (struct_type, &node), IsOk ());
305
+ EXPECT_EQ (node->type (), ::avro::AVRO_RECORD);
306
+
307
+ // Check that field names remain unchanged
308
+ ASSERT_EQ (node->names (), 2 );
309
+ EXPECT_EQ (node->nameAt (0 ), " valid_field" );
310
+ EXPECT_EQ (node->nameAt (1 ), " AnotherField" );
311
+
312
+ // Check that field IDs are correctly applied
313
+ ASSERT_EQ (node->customAttributes (), 2 );
314
+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 0 , /* field_id=*/ 1 ));
315
+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 1 , /* field_id=*/ 2 ));
316
+
317
+ // For valid field names, there should be no iceberg-field-name attributes
318
+ const auto & attrs0 = node->customAttributesAt (0 );
319
+ const auto & attrs1 = node->customAttributesAt (1 );
320
+ EXPECT_FALSE (attrs0.getAttribute (" iceberg-field-name" ).has_value ());
321
+ EXPECT_FALSE (attrs1.getAttribute (" iceberg-field-name" ).has_value ());
322
+ }
323
+
184
324
TEST (ToAvroNodeVisitorTest, ListType) {
185
325
ListType list_type{SchemaField{/* field_id=*/ 5 , " element" , iceberg::string (),
186
326
/* optional=*/ true }};
@@ -1436,5 +1576,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
1436
1576
auto result = MakeAvroNodeWithFieldIds (avro_schema.root (), *name_mapping);
1437
1577
ASSERT_THAT (result, IsError (ErrorKind::kInvalidSchema ));
1438
1578
}
1439
-
1440
1579
} // namespace iceberg::avro
0 commit comments