Commit 209aa47

chore: work in progress
1 parent e9df3b7 · commit 209aa47

File tree

4 files changed: +624 −77 lines changed

pyproject.toml

Lines changed: 0 additions & 1 deletion

@@ -403,7 +403,6 @@ Available scripts:
 - `spark-tests` - run the Spark test suite.
 - `coverage` or `cov` - run the test suite with coverage.
 """
-path = ".venv"
 python = "3.12"
 template = "default"
 features = [

src/koheesio/spark/transformations/camel_to_snake.py

Lines changed: 22 additions & 76 deletions

@@ -2,89 +2,35 @@
 Class for converting DataFrame column names from camel case to snake case.
 """
 
-from typing import Optional
 import re
 
-from koheesio.models import Field, ListOfColumns
-from koheesio.spark.transformations import ColumnsTransformation
-from koheesio.spark.utils import SPARK_MINOR_VERSION
+from collections.abc import Callable
 
-camel_to_snake_re = re.compile("([a-z0-9])([A-Z])")
+from pydantic import Field
 
+from koheesio.spark.transformations.rename_columns import RenameColumns
 
-def convert_camel_to_snake(name: str) -> str:
-    """
-    Converts a string from camelCase to snake_case.
-
-    Parameters
-    ----------
-    name : str
-        The string to be converted.
-
-    Returns
-    -------
-    str
-        The converted string in snake_case.
-    """
-    return camel_to_snake_re.sub(r"\1_\2", name).lower()
-
-
-class CamelToSnakeTransformation(ColumnsTransformation):
-    """
-    Converts column names from camel case to snake case.
-
-    Parameters
-    ----------
-    columns : Optional[ListOfColumns], optional, default=None
-        The column or columns to convert. If no columns are specified, all columns
-        will be converted. A list of columns or a single column can be specified.
-        For example: `["column1", "column2"]` or `"column1"`
-
-    Example
-    -------
-    __input_df:__
 
-    | camelCaseColumn | snake_case_column |
-    |-----------------|-------------------|
-    | ...             | ...               |
 
-    ```python
-    output_df = CamelToSnakeTransformation(
-        column="camelCaseColumn"
-    ).transform(input_df)
-    ```
+def camel_to_snake(name: str) -> str:
+    """Convert a camelCase string to snake_case.
 
-    __output_df:__
-
-    | camel_case_column | snake_case_column |
-    |-------------------|-------------------|
-    | ...               | ...               |
-
-    In this example, the column `camelCaseColumn` is converted to `camel_case_column`.
-
-    > Note: the data in the columns is not changed, only the column names.
+    Args:
+        name: The camelCase string to be converted.
 
+    Returns:
+        str: The converted snake_case string.
     """
-
-    def execute(self) -> ColumnsTransformation.Output:
-        _df = self.df
-
-        # Prepare columns input:
-        columns = list(self.get_columns())
-
-        if SPARK_MINOR_VERSION < 3.4:
-            for column in columns:
-                _df = _df.withColumnRenamed(column, convert_camel_to_snake(column))
-
-        else:
-            # Rename columns using toDF for Spark versions >= 3.4
-            # Note: toDF requires all column names to be specified
-            new_column_names = []
-            for column in _df.columns:
-                if column in columns:
-                    new_column_names.append(convert_camel_to_snake(column))
-                    continue
-                new_column_names.append(column)
-            _df = _df.toDF(*new_column_names)
-
-        self.output.df = _df
+    # Insert an underscore before an uppercase letter that is followed by
+    # lowercase letters (splits boundaries such as "HTTPResponse" -> "HTTP_Response")
+    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
+    # Insert an underscore between a lowercase letter or digit and an
+    # uppercase letter, then convert the result to lowercase
+    s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
+    # Collapse any runs of consecutive underscores into a single one
+    res = re.sub("_+", "_", s2)
+    return res
+
+
+class CamelToSnakeTransformation(RenameColumns):
+    rename_func: Callable[[str], str] | None = Field(  # type: ignore
+        default=camel_to_snake, description="Function to convert camelCase to snake_case"
+    )
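
As a quick sanity check of the new `camel_to_snake` helper, the sketch below exercises the three regex passes on a few representative names. The function body is copied verbatim from the diff above; the expected values are worked out by hand, so treat them as illustrative rather than as committed test cases.

```python
import re


def camel_to_snake(name: str) -> str:
    # Verbatim copy of the helper added in this commit
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
    return re.sub("_+", "_", s2)


assert camel_to_snake("camelCaseColumn") == "camel_case_column"
assert camel_to_snake("HTTPResponse") == "http_response"   # first pass splits the acronym boundary
assert camel_to_snake("columnA") == "column_a"             # second pass handles a trailing capital
assert camel_to_snake("already_snake") == "already_snake"  # snake_case input passes through unchanged
```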
src/koheesio/spark/transformations/rename_columns.py

Lines changed: 75 additions & 0 deletions

@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+from typing import Callable
+
+from pydantic import Field
+
+from pyspark.sql import functions as F
+from pyspark.sql.types import ArrayType, StructField, StructType
+
+from koheesio.spark import DataFrame
+from koheesio.spark.transformations import ColumnsTransformation
+
+
+class RenameColumns(ColumnsTransformation):
+    rename_func: Callable[[str], str] = Field(..., description="Function to rename columns")
+
+    def rename_schema(self, schema: StructType) -> StructType:
+        """Rename the fields of a given schema using the configured renaming function.
+
+        Args:
+            schema: The schema whose fields need to be renamed.
+
+        Returns:
+            StructType: A new schema with renamed fields.
+
+        Notes:
+            - A ValueError is raised if no renaming function is provided.
+            - Nested StructTypes, and ArrayTypes containing StructTypes, are handled recursively.
+
+        Steps:
+            1. Initialize an empty list to hold the new fields.
+            2. Raise a ValueError if no renaming function is provided.
+            3. Iterate over each field in the provided schema.
+            4. Rename each selected field using the renaming function.
+            5. If the field's data type is a StructType, recursively rename the nested schema.
+            6. If the field's data type is an ArrayType containing a StructType, recursively
+               rename the schema of the nested StructType within the ArrayType.
+            7. For other data types, simply rename the field.
+            8. Append the newly created field to the list of new fields.
+            9. Return a new StructType constructed from the list of new fields.
+        """
+        new_fields = []
+        _columns = list(self.get_columns())
+
+        if self.rename_func is None:
+            raise ValueError("rename_func must be provided")
+
+        for field in schema.fields:
+            if field.name in _columns:
+                new_name = self.rename_func(field.name)
+
+                if isinstance(field.dataType, StructType):
+                    new_field = StructField(new_name, self.rename_schema(field.dataType), field.nullable)
+                elif isinstance(field.dataType, ArrayType) and isinstance(field.dataType.elementType, StructType):
+                    new_field = StructField(
+                        new_name, ArrayType(self.rename_schema(field.dataType.elementType)), field.nullable
+                    )
+                else:
+                    new_field = StructField(new_name, field.dataType, field.nullable)
+                new_fields.append(new_field)
+            else:
+                # If the field is not in the columns to be renamed, keep it as-is
+                new_fields.append(field)
+
+        return StructType(new_fields)
+
+    def execute(self):
+        self.df: DataFrame
+        new_schema = self.rename_schema(self.df.schema)  # pylint: disable=E1102
+        _columns = list(self.get_columns())
+        _not_renamed = set(self.df.columns) - set(_columns)
+        renamed_select = [
+            F.col(c).cast(new_schema[self.rename_func(c)].dataType).alias(self.rename_func(c))  # pylint: disable=E1102
+            for c in _columns
+        ]
+        not_renamed_select = [F.col(c).alias(c) for c in _not_renamed]
+
+        # Apply the new schema by casting each column to the new type under its new name
+        self.output.df = self.df.select(renamed_select + not_renamed_select)

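A minimal usage sketch of the new `RenameColumns` transformation, following the `.transform(input_df)` pattern from the docstring that was removed from `camel_to_snake.py`. The DataFrame, column names, and the uppercasing `rename_func` are hypothetical; this assumes a live SparkSession and that `columns` accepts a list, as the old docstring describes.

```python
from pyspark.sql import SparkSession

from koheesio.spark.transformations.rename_columns import RenameColumns

spark = SparkSession.builder.getOrCreate()

# Hypothetical input: two camelCase columns, only one of which is renamed
input_df = spark.createDataFrame([(1, "EU"), (2, "US")], ["customerId", "regionCode"])

output_df = RenameColumns(
    columns=["customerId"],                  # only this column is renamed
    rename_func=lambda name: name.upper(),   # any str -> str callable
).transform(input_df)

# `customerId` becomes `CUSTOMERID`; `regionCode` is kept as-is. Note that
# execute() selects renamed columns first and derives the untouched columns
# from a Python set, so the order of untouched columns is not guaranteed.
print(output_df.columns)
```

Since `CamelToSnakeTransformation` is now just `RenameColumns` with `rename_func` defaulting to `camel_to_snake`, the same pattern without a `rename_func` argument reproduces the old camel-to-snake behavior.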