|
2 | 2 | Class for converting DataFrame column names from camel case to snake case.
|
3 | 3 | """
|
4 | 4 |
|
5 |
| -from typing import Optional |
6 | 5 | import re
|
7 | 6 |
|
8 |
| -from koheesio.models import Field, ListOfColumns |
9 |
| -from koheesio.spark.transformations import ColumnsTransformation |
10 |
| -from koheesio.spark.utils import SPARK_MINOR_VERSION |
| 7 | +from collections.abc import Callable |
11 | 8 |
|
12 |
| -camel_to_snake_re = re.compile("([a-z0-9])([A-Z])") |
| 9 | +from pydantic import Field |
13 | 10 |
|
| 11 | +from koheesio.spark.transformations.rename_columns import RenameColumns |
14 | 12 |
|
15 |
| -def convert_camel_to_snake(name: str) -> str: |
16 |
| - """ |
17 |
| - Converts a string from camelCase to snake_case. |
18 |
| -
|
19 |
| - Parameters: |
20 |
| - ---------- |
21 |
| - name : str |
22 |
| - The string to be converted. |
23 |
| -
|
24 |
| - Returns: |
25 |
| - -------- |
26 |
| - str |
27 |
| - The converted string in snake_case. |
28 |
| - """ |
29 |
| - return camel_to_snake_re.sub(r"\1_\2", name).lower() |
30 |
| - |
31 |
| - |
32 |
| -class CamelToSnakeTransformation(ColumnsTransformation): |
33 |
| - """ |
34 |
| - Converts column names from camel case to snake cases |
35 |
| -
|
36 |
| - Parameters |
37 |
| - ---------- |
38 |
| - columns : Optional[ListOfColumns], optional, default=None |
39 |
| - The column or columns to convert. If no columns are specified, all columns will be converted. A list of columns |
40 |
| - or a single column can be specified. |
41 |
| - For example: `["column1", "column2"]` or `"column1"` |
42 |
| -
|
43 |
| - Example |
44 |
| - ------- |
45 |
| - __input_df:__ |
46 | 13 |
|
47 |
| - | camelCaseColumn | snake_case_column | |
48 |
| - |--------------------|-------------------| |
49 |
| - | ... | ... | |
50 | 14 |
|
51 |
| - ```python |
52 |
| - output_df = CamelToSnakeTransformation( |
53 |
| - column="camelCaseColumn" |
54 |
| - ).transform(input_df) |
55 |
| - ``` |
# Inserts "_" ahead of a capitalised word (one uppercase letter followed by one
# or more lowercase letters) when it is preceded by any character,
# e.g. "HTTPResponse" -> "HTTP_Response".
_CAPITALISED_WORD_RE = re.compile(r"(.)([A-Z][a-z]+)")
# Inserts "_" between a lowercase letter or digit and the uppercase letter
# that directly follows it, e.g. "getHTTP" -> "get_HTTP".
_LOWER_TO_UPPER_RE = re.compile(r"([a-z0-9])([A-Z])")
# Matches a run of one or more underscores so it can be squeezed to a single one.
_UNDERSCORE_RUN_RE = re.compile(r"_+")


def camel_to_snake(name: str) -> str:
    """Convert a camelCase string to snake_case.

    The conversion runs in three passes: split in front of capitalised words,
    split every remaining lowercase/digit-to-uppercase boundary, lowercase the
    result, and finally collapse runs of underscores into one.

    Args:
        name: The camelCase string to be converted.

    Returns:
        str: The converted snake_case string.

    Examples:
        >>> camel_to_snake("camelCaseColumn")
        'camel_case_column'
        >>> camel_to_snake("HTTPResponseCode")
        'http_response_code'
        >>> camel_to_snake("already_snake_case")
        'already_snake_case'
    """
    # Pass 1: matches are non-overlapping, so back-to-back capitalised words
    # ("ResponseCode") are only split once here; pass 2 catches the rest.
    capitalised_split = _CAPITALISED_WORD_RE.sub(r"\1_\2", name)
    lowered = _LOWER_TO_UPPER_RE.sub(r"\1_\2", capitalised_split).lower()
    # Pass 1 also fires when the preceding character is already "_"
    # ("snake_Case" -> "snake__Case"), so squeeze every underscore run down to
    # one. Note that this also shortens underscore runs present in the input.
    return _UNDERSCORE_RUN_RE.sub("_", lowered)
| 33 | + |
| 34 | + |
class CamelToSnakeTransformation(RenameColumns):
    """Rename DataFrame columns from camelCase to snake_case.

    A `RenameColumns` specialisation whose only change is that `rename_func`
    defaults to `camel_to_snake`. How the function is applied, and to which
    columns, is decided by `RenameColumns` (not visible in this module).
    """

    # Name-conversion callable; `camel_to_snake` is used when none is supplied.
    rename_func: Callable[[str], str] | None = Field(  # type: ignore
        default=camel_to_snake,
        description="Function to convert camelCase to snake_case",
    )
0 commit comments