Skip to content

Commit 51f205c

Browse files
feat: avro schema add sanitize field name
1 parent 9f13bac commit 51f205c

File tree

3 files changed

+218
-3
lines changed

3 files changed

+218
-3
lines changed

src/iceberg/avro/avro_schema_util.cc

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
5858

5959
} // namespace
6060

61+
/// \brief Check whether a name already satisfies the Avro name grammar.
///
/// A valid name matches [A-Za-z_][A-Za-z0-9_]*. The check uses explicit ASCII ranges
/// instead of std::isalpha/std::isalnum: those are undefined for negative `char`
/// values (any byte >= 0x80 on platforms where `char` is signed, e.g. UTF-8 input)
/// and their result depends on the current C locale.
///
/// \param name The candidate field name.
/// \return true if the name can be used as-is, false if it has to be sanitized.
/// \throws std::runtime_error if `name` is empty.
bool validAvroName(const std::string& name) {
  if (name.empty()) {
    throw std::runtime_error("Empty name");
  }

  const auto is_name_start = [](char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
  };
  const auto is_name_part = [&is_name_start](char c) {
    return is_name_start(c) || (c >= '0' && c <= '9');
  };

  // The first character has a stricter rule than the rest: digits are not allowed.
  if (!is_name_start(name.front())) {
    return false;
  }

  for (size_t i = 1; i < name.size(); ++i) {
    if (!is_name_part(name[i])) {
      return false;
    }
  }
  return true;
}
79+
80+
/// \brief Encode one character that is not allowed at its position in an Avro name.
///
/// ASCII digits become '_' followed by the digit. Every other character becomes "_x"
/// followed by the uppercase hexadecimal value of the byte, without zero padding
/// (e.g. '-' -> "_x2D", '\t' -> "_x9").
///
/// The character is converted to `unsigned char` first. Otherwise bytes >= 0x80 would
/// be sign-extended on signed-char platforms (producing "_xFFFFFFC3" instead of "_xC3").
///
/// \param c The character to encode.
/// \return The replacement text, which only contains [A-Za-z0-9_].
std::string SanitizeChar(char c) {
  const auto byte = static_cast<unsigned char>(c);
  if (byte >= '0' && byte <= '9') {
    return std::string("_") + c;
  }

  static constexpr char kHexDigits[] = "0123456789ABCDEF";
  std::string escaped = "_x";
  if (byte >= 0x10) {
    escaped.push_back(kHexDigits[byte >> 4]);
  }
  escaped.push_back(kHexDigits[byte & 0x0F]);
  return escaped;
}

/// \brief Turn an arbitrary field name into a valid Avro name.
///
/// The first character is kept if it is an ASCII letter or '_'; later characters are
/// kept if they are ASCII letters, digits or '_'. Everything else is replaced by the
/// output of SanitizeChar().
///
/// An empty input yields "_x0". The previous implementation produced that value by
/// reading one element past the end of the view; it is now returned explicitly so the
/// result stays the same without the out-of-bounds read.
///
/// \param field_name The original field name.
/// \return The sanitized name.
std::string SanitizeFieldName(std::string_view field_name) {
  if (field_name.empty()) {
    return "_x0";
  }

  // ASCII-only classification: locale independent and well-defined for any byte.
  const auto is_letter_or_underscore = [](char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
  };
  const auto is_digit = [](char c) { return c >= '0' && c <= '9'; };

  std::string result;
  result.reserve(field_name.size());

  const char first = field_name.front();
  if (is_letter_or_underscore(first)) {
    result.push_back(first);
  } else {
    result.append(SanitizeChar(first));
  }

  for (const char c : field_name.substr(1)) {
    if (is_letter_or_underscore(c) || is_digit(c)) {
      result.push_back(c);
    } else {
      result.append(SanitizeChar(c));
    }
  }

  return result;
}
110+
61111
std::string ToString(const ::avro::NodePtr& node) {
62112
std::stringstream ss;
63113
ss << *node;
@@ -181,8 +231,10 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
181231
::avro::NodePtr field_node;
182232
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));
183233

184-
// TODO(gangwu): sanitize field name
185-
(*node)->addName(std::string(sub_field.name()));
234+
bool isValidFieldName = validAvroName(std::string(sub_field.name()));
235+
std::string fieldName = isValidFieldName ? std::string(sub_field.name())
236+
: SanitizeFieldName(sub_field.name());
237+
(*node)->addName(fieldName);
186238
(*node)->addLeaf(field_node);
187239
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
188240
}

src/iceberg/avro/avro_schema_util_internal.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,4 +156,28 @@ bool HasMapLogicalType(const ::avro::NodePtr& node);
156156
Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node,
157157
const NameMapping& mapping);
158158

159+
/// \brief Sanitize a field name to make it compatible with Avro field name requirements.
160+
///
161+
/// Converts names that are not valid Avro names to valid Avro names.
162+
/// Conversion rules:
163+
/// 1. If the first character is not a letter or underscore, it is replaced:
164+
/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
165+
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
166+
/// representation of the character code, without zero padding
167+
/// (e.g., '$' -> '_x24')
168+
/// 2. For characters other than the first:
169+
/// - If it's a letter, digit, or underscore, it remains unchanged
170+
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
171+
/// representation of the character code, without zero padding
172+
///
173+
/// Examples:
174+
/// - "123field" -> "_123field"
175+
/// - "user-name" -> "user_x2Dname"
176+
/// - "$price" -> "_x24price"
177+
/// - "valid_name_123" -> "valid_name_123" (no conversion needed)
178+
///
/// \note The mapping is not injective: different inputs can produce the same output
/// (e.g. "3" and "_3" both yield "_3"), so the original name cannot be recovered from
/// the sanitized name alone.
///
179+
/// \param field_name The original field name to sanitize.
180+
/// \return A sanitized field name that follows Avro naming conventions.
181+
std::string SanitizeFieldName(std::string_view field_name);
182+
159183
} // namespace iceberg::avro

test/avro_schema_test.cc

Lines changed: 140 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232

3333
namespace iceberg::avro {
3434

35+
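// validAvroName is defined in avro_schema_util.cc but is not declared in
// avro_schema_util_internal.h, so it is forward-declared here for the tests below.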
36+
bool validAvroName(const std::string& name);
37+
3538
namespace {
3639

3740
void CheckCustomLogicalType(const ::avro::NodePtr& node, const std::string& type_name) {
@@ -47,8 +50,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id,
4750
ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id)));
4851
}
4952

53+
// Helper function to check if a custom attribute exists for a field name preservation
54+
void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index,
55+
const std::string& original_name) {
56+
ASSERT_LT(index, node->customAttributes());
57+
const auto& attrs = node->customAttributesAt(index);
58+
ASSERT_EQ(attrs.getAttribute("iceberg-field-name"), std::make_optional(original_name));
59+
}
60+
5061
} // namespace
5162

63+
TEST(ValidAvroNameTest, ValidNames) {
  // Each of these names already matches the Avro name grammar and must be accepted.
  for (const char* name : {"valid_field", "field123", "_private", "CamelCase",
                           "field_with_underscores"}) {
    SCOPED_TRACE(name);
    EXPECT_TRUE(validAvroName(name));
  }
}
71+
72+
TEST(ValidAvroNameTest, InvalidNames) {
  // A leading digit is not allowed.
  for (const char* name : {"123field", "0value"}) {
    SCOPED_TRACE(name);
    EXPECT_FALSE(validAvroName(name));
  }

  // Anything other than a letter, digit or underscore is not allowed anywhere.
  for (const char* name :
       {"field-name", "field.name", "field name", "field@name", "field#name"}) {
    SCOPED_TRACE(name);
    EXPECT_FALSE(validAvroName(name));
  }
}
84+
85+
TEST(ValidAvroNameTest, EmptyName) {
  // validAvroName rejects an empty name by throwing rather than returning false.
  const std::string empty_name;
  EXPECT_THROW(validAvroName(empty_name), std::runtime_error);
}
89+
90+
TEST(SanitizeFieldNameTest, ValidFieldNames) {
  // Sanitizing a name that is already valid must return it unchanged.
  for (const char* name : {"valid_field", "field123", "_private", "CamelCase",
                           "field_with_underscores"}) {
    SCOPED_TRACE(name);
    EXPECT_EQ(SanitizeFieldName(name), name);
  }
}
98+
99+
TEST(SanitizeFieldNameTest, InvalidFieldNames) {
  struct Case {
    const char* input;
    const char* expected;
  };

  const Case kCases[] = {
      // A leading digit is prefixed with an underscore.
      {"123field", "_123field"},
      {"0value", "_0value"},
      // Special characters are replaced by "_x" plus their uppercase hex code.
      {"field-name", "field_x2Dname"},
      {"field.name", "field_x2Ename"},
      {"field name", "field_x20name"},
      {"field@name", "field_x40name"},
      {"field#name", "field_x23name"},
      // Names that combine several of the problems above.
      {"1field-with.special@chars", "_1field_x2Dwith_x2Especial_x40chars"},
      {"user-email", "user_x2Demail"},
  };

  for (const auto& [input, expected] : kCases) {
    SCOPED_TRACE(input);
    EXPECT_EQ(SanitizeFieldName(input), expected);
  }
}
116+
117+
TEST(SanitizeFieldNameTest, EdgeCases) {
  struct Case {
    const char* input;
    const char* expected;
  };

  const Case kCases[] = {
      // Empty field name.
      {"", "_x0"},
      // Only special characters: every character is escaped.
      {"@#$", "_x40_x23_x24"},
      // A special character in the leading position.
      {"-field", "_x2Dfield"},
      {".field", "_x2Efield"},
  };

  for (const auto& [input, expected] : kCases) {
    SCOPED_TRACE(input);
    EXPECT_EQ(SanitizeFieldName(input), expected);
  }
}
128+
52129
TEST(ToAvroNodeVisitorTest, BooleanType) {
53130
::avro::NodePtr node;
54131
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(BooleanType{}, &node), IsOk());
@@ -181,6 +258,69 @@ TEST(ToAvroNodeVisitorTest, StructType) {
181258
EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT);
182259
}
183260

261+
TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
  // Four fields (ids 1..4) whose names all need sanitizing.
  StructType struct_type{
      {SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
                   /*optional=*/false},
       SchemaField{/*field_id=*/2, "email.address", iceberg::string(),
                   /*optional=*/true},
       SchemaField{/*field_id=*/3, "123field", iceberg::int32(),
                   /*optional=*/false},
       SchemaField{/*field_id=*/4, "field with spaces", iceberg::boolean(),
                   /*optional=*/true}}};

  // Sanitized names expected at positions 0..3, in declaration order.
  const char* const kSanitizedNames[] = {
      "user_x2Dname",             // "user-name"
      "email_x2Eaddress",         // "email.address"
      "_123field",                // "123field"
      "field_x20with_x20spaces",  // "field with spaces"
  };

  ::avro::NodePtr node;
  EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
  EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);

  // The record must expose the sanitized names.
  ASSERT_EQ(node->names(), 4);
  for (size_t i = 0; i < 4; ++i) {
    EXPECT_EQ(node->nameAt(i), kSanitizedNames[i]);
  }

  // Sanitizing must not disturb the field ids: field i carries id i + 1.
  ASSERT_EQ(node->customAttributes(), 4);
  for (size_t i = 0; i < 4; ++i) {
    ASSERT_NO_FATAL_FAILURE(
        CheckFieldIdAt(node, /*index=*/i, /*field_id=*/static_cast<int32_t>(i + 1)));
  }
}
295+
296+
TEST(ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) {
  // Two fields (ids 1..2) whose names are already valid Avro names.
  StructType struct_type{{SchemaField{/*field_id=*/1, "valid_field", iceberg::string(),
                                      /*optional=*/false},
                          SchemaField{/*field_id=*/2, "AnotherField", iceberg::int32(),
                                      /*optional=*/true}}};

  const char* const kExpectedNames[] = {"valid_field", "AnotherField"};

  ::avro::NodePtr node;
  EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
  EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);

  // Valid names must pass through untouched.
  ASSERT_EQ(node->names(), 2);
  for (size_t i = 0; i < 2; ++i) {
    EXPECT_EQ(node->nameAt(i), kExpectedNames[i]);
  }

  // Field i carries id i + 1.
  ASSERT_EQ(node->customAttributes(), 2);
  for (size_t i = 0; i < 2; ++i) {
    ASSERT_NO_FATAL_FAILURE(
        CheckFieldIdAt(node, /*index=*/i, /*field_id=*/static_cast<int32_t>(i + 1)));
  }

  // Neither field may carry an "iceberg-field-name" attribute.
  for (size_t i = 0; i < 2; ++i) {
    const auto& attrs = node->customAttributesAt(i);
    EXPECT_FALSE(attrs.getAttribute("iceberg-field-name").has_value());
  }
}
323+
184324
TEST(ToAvroNodeVisitorTest, ListType) {
185325
ListType list_type{SchemaField{/*field_id=*/5, "element", iceberg::string(),
186326
/*optional=*/true}};
@@ -1436,5 +1576,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
14361576
auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping);
14371577
ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema));
14381578
}
1439-
14401579
} // namespace iceberg::avro

0 commit comments

Comments
 (0)