From 1f48f9f05246461f135d41dfb3a45c94dd2f762f Mon Sep 17 00:00:00 2001 From: MooooCat Date: Tue, 5 Mar 2024 16:44:02 +0800 Subject: [PATCH 01/19] add data processor logs --- sdgx/synthesizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdgx/synthesizer.py b/sdgx/synthesizer.py index d17b34f5..61f10ba4 100644 --- a/sdgx/synthesizer.py +++ b/sdgx/synthesizer.py @@ -288,8 +288,10 @@ def fit( self.metadata = metadata # Ensure update metadata logger.info("Fitting data processors...") + start_time = time.time() for d in self.data_processors: d.fit(metadata) + logger.info(f"Fitted {len(self.data_processors)} data processors in {time.time() - start_time}s.") def chunk_generator() -> Generator[pd.DataFrame, None, None]: for chunk in self.dataloader.iter(): From 2eab25caf7ddec187bf266e2c4d5de078084285f Mon Sep 17 00:00:00 2001 From: MooooCat Date: Tue, 5 Mar 2024 16:44:20 +0800 Subject: [PATCH 02/19] intro NaN processor --- sdgx/data_processors/transformers/nan.py | 47 ++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 sdgx/data_processors/transformers/nan.py diff --git a/sdgx/data_processors/transformers/nan.py b/sdgx/data_processors/transformers/nan.py new file mode 100644 index 00000000..05ce98a9 --- /dev/null +++ b/sdgx/data_processors/transformers/nan.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from pandas import DataFrame +from sdgx.data_models.metadata import Metadata +from sdgx.data_processors.transformers.base import Transformer +from sdgx.data_processors.extension import hookimpl +from sdgx.utils import logger + +class NonValueTransformer(Transformer): + ''' + Transformer class for handling missing values in data. + + This Transformer is mainly used as a reference for Transformer to facilitate developers to quickly understand the role of Transformer. + ''' + + def fit(self, metadata: Metadata | None = None): + ''' + Fit method for the transformer. + + Does not require any action. + ''' + logger.info("Fitting NonValueTransformer") + + return + + def convert(self, raw_data: DataFrame) -> DataFrame: + ''' + Convert method to handle missing values in the input data. + ''' + logger.info("Converting data using NonValueTransformer") + + return raw_data.fillna() + + def reverse_convert(self, processed_data: DataFrame) -> DataFrame: + ''' + Reverse_convert method for the transformer. + + Does not require any action. + ''' + return processed_data + + pass + +@hookimpl +def register(manager): + manager.register("NonValueTransformer", NonValueTransformer) + From 6712ae91ee3f2be154d7d9eb90164a6a6bd8a1c5 Mon Sep 17 00:00:00 2001 From: MooooCat Date: Wed, 6 Mar 2024 11:48:49 +0800 Subject: [PATCH 03/19] fix fill_na typo in NanValueTransformer --- sdgx/data_processors/transformers/nan.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdgx/data_processors/transformers/nan.py b/sdgx/data_processors/transformers/nan.py index 05ce98a9..731c8ee7 100644 --- a/sdgx/data_processors/transformers/nan.py +++ b/sdgx/data_processors/transformers/nan.py @@ -13,6 +13,8 @@ class NonValueTransformer(Transformer): This Transformer is mainly used as a reference for Transformer to facilitate developers to quickly understand the role of Transformer. ''' + fill_na_value = 0 + def fit(self, metadata: Metadata | None = None): ''' Fit method for the transformer. @@ -29,7 +31,7 @@ def convert(self, raw_data: DataFrame) -> DataFrame: ''' logger.info("Converting data using NonValueTransformer") - return raw_data.fillna() + return raw_data.fillna(value= self.fill_na_value) def reverse_convert(self, processed_data: DataFrame) -> DataFrame: ''' From f16d12ba1a17ae2541e884d8714468823cba30bb Mon Sep 17 00:00:00 2001 From: MooooCat Date: Wed, 6 Mar 2024 11:49:16 +0800 Subject: [PATCH 04/19] add default_processors in manager --- sdgx/data_processors/manager.py | 63 ++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/sdgx/data_processors/manager.py b/sdgx/data_processors/manager.py index 3a2e8f8f..50df94ef 100644 --- a/sdgx/data_processors/manager.py +++ b/sdgx/data_processors/manager.py @@ -8,21 +8,82 @@ from sdgx.data_processors.extension import project_name as PROJECT_NAME from sdgx.manager import Manager - class DataProcessorManager(Manager): + """ + This is a plugin management class for data processing components. + + Properties: + - register_type: Specifies the type of data processors to register. + - project_name: Stores the project name from the extension module. + - hookspecs_model: Stores the hook specifications model from the extension module. + - preset_default_processors: Stores a list of default processor names in lowercase. + - registed_data_processors: Property that returns the registered data processors. + - registed_default_processor_list: Property that returns the registered default data processors. + + Methods: + - load_all_local_model: Loads all local models for formatters, generators, samplers, and transformers. + - init_data_processor: Initializes a data processor with the given name and keyword arguments. + - init_all_processors: Initializes all registered data processors with optional keyword arguments. + - init_default_processors: Initializes default processors that are both registered and preset. + + """ register_type = DataProcessor + project_name = PROJECT_NAME + hookspecs_model = extension + preset_defalut_processors = [ p.lower() for p in ["NonValueTransformer"]] + @property def registed_data_processors(self): + ''' + This property returns all registered data processors + ''' return self.registed_cls + + @property + def registed_default_processor_list(self): + ''' + This property returns all registered default data processors + ''' + registed_processor_list = self.registed_data_processors.keys() + + # calculate intersection + target_processors = list(set(registed_processor_list).intersection(self.preset_defalut_processors)) + + return target_processors def load_all_local_model(self): + ''' + loads all local models + ''' self._load_dir(data_processors.formatters) self._load_dir(data_processors.generators) self._load_dir(data_processors.samplers) self._load_dir(data_processors.transformers) def init_data_processor(self, processor_name, **kwargs: dict[str, Any]) -> DataProcessor: + ''' + Initializes a data processor with the given name and parameters + ''' return self.init(processor_name, **kwargs) + + def init_all_processors(self, **kwargs: Any) -> list[DataProcessor]: + ''' + Initializes all registered data processors + ''' + return [ + self.init(processor_name, **kwargs) + for processor_name in self.registed_data_processors.keys() + ] + + def init_default_processors(self, **kwargs: Any) -> list[DataProcessor]: + ''' + Initializes all default data processors + ''' + + return [ + self.init(processor_name, **kwargs) + for processor_name in self.registed_default_processor_list + ] \ No newline at end of file From 4510d8572b35f2d396e0b418e69bdc3735a7496c Mon Sep 17 00:00:00 2001 From: MooooCat Date: Wed, 6 Mar 2024 11:49:28 +0800 Subject: [PATCH 05/19] add default_processors in synthesizer --- sdgx/synthesizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdgx/synthesizer.py b/sdgx/synthesizer.py index 61f10ba4..0f38c72b 100644 --- a/sdgx/synthesizer.py +++ b/sdgx/synthesizer.py @@ -99,9 +99,9 @@ def __init__( self.dataloader = None # Init data processors - if not data_processors: - data_processors = [] self.data_processors_manager = DataProcessorManager() + if not data_processors: + data_processors = self.data_processors_manager.registed_default_processor_list self.data_processors = [ ( d From 9afe27a5fc13522865019550198d928b28cca4ee Mon Sep 17 00:00:00 2001 From: MooooCat Date: Wed, 6 Mar 2024 11:52:25 +0800 Subject: [PATCH 06/19] update log in nan --- sdgx/data_processors/transformers/nan.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sdgx/data_processors/transformers/nan.py b/sdgx/data_processors/transformers/nan.py index 731c8ee7..f75c8c7f 100644 --- a/sdgx/data_processors/transformers/nan.py +++ b/sdgx/data_processors/transformers/nan.py @@ -21,7 +21,7 @@ def fit(self, metadata: Metadata | None = None): Does not require any action. ''' - logger.info("Fitting NonValueTransformer") + logger.info("NonValueTransformer Fitted.") return @@ -29,9 +29,14 @@ def convert(self, raw_data: DataFrame) -> DataFrame: ''' Convert method to handle missing values in the input data. ''' - logger.info("Converting data using NonValueTransformer") - return raw_data.fillna(value= self.fill_na_value) + logger.info("Converting data using NonValueTransformer...") + + res = raw_data.fillna(value= self.fill_na_value) + + logger.info("Converting data using NonValueTransformer... Finished.") + + return res def reverse_convert(self, processed_data: DataFrame) -> DataFrame: ''' @@ -39,6 +44,8 @@ def reverse_convert(self, processed_data: DataFrame) -> DataFrame: Does not require any action. ''' + logger.info("Data reverse-converted by NonValueTransformer (No Action).") + return processed_data pass From da08ff1b7b04c1fb0442b0d3193518b6d6f56e7c Mon Sep 17 00:00:00 2001 From: MooooCat Date: Wed, 6 Mar 2024 15:23:03 +0800 Subject: [PATCH 07/19] update formatter base --- sdgx/data_processors/formatters/base.py | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/sdgx/data_processors/formatters/base.py b/sdgx/data_processors/formatters/base.py index 52cdf320..201df19d 100644 --- a/sdgx/data_processors/formatters/base.py +++ b/sdgx/data_processors/formatters/base.py @@ -1,3 +1,6 @@ +from __future__ import annotations + +import pandas as pd from sdgx.data_processors.base import DataProcessor @@ -16,3 +19,28 @@ class Formatter(DataProcessor): - :ref:`Transformer` sometimes implements some functions with the help of Formatter. """ + + def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame: + """Convert processed data into raw data. + + Args: + processed_data (pd.DataFrame): Processed data + + Returns: + pd.DataFrame: Raw data + """ + return self.post_processing(processed_data) + + + def post_processing(self, processed_data: pd.DataFrame) -> pd.DataFrame: + ''' + For formatter, please rewrite this method. + + Args: + processed_data (pd.DataFrame): Processed data + + Returns: + pd.DataFrame: Raw data + ''' + + return processed_data From e5ea5ab6bcf0d8f58ebbdf76b04adfaa39d50aed Mon Sep 17 00:00:00 2001 From: MooooCat Date: Wed, 6 Mar 2024 15:23:13 +0800 Subject: [PATCH 08/19] add int formatter --- sdgx/data_processors/formatters/int.py | 58 ++++++++++++++++++++++++++ sdgx/data_processors/manager.py | 2 +- 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 sdgx/data_processors/formatters/int.py diff --git a/sdgx/data_processors/formatters/int.py b/sdgx/data_processors/formatters/int.py new file mode 100644 index 00000000..62f19645 --- /dev/null +++ b/sdgx/data_processors/formatters/int.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import pandas as pd +from sdgx.data_models.metadata import Metadata +from sdgx.data_processors.formatters.base import Formatter +from sdgx.data_processors.extension import hookimpl +from sdgx.utils import logger + +class IntValueFormatter(Formatter): + ''' + Formatter class for handling Int values in pd.DataFrame. + ''' + + int_columns = None + + def fit(self, metadata: Metadata | None = None): + ''' + Fit method for the formatter. + + Formatter need to use metadata to record which columns belong to the int type, and convert them back to the int type during post-processing. + ''' + + # get from metadata + self.int_columns = metadata.get("int_columns") + + logger.info("IntValueFormatter Fitted.") + + return + + def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame: + ''' + Convert method to handle missing values in the input data. + ''' + + logger.info("Converting data using IntValueFormatter... Finished (No Action).") + + return raw_data + + def post_processing(self, processed_data: pd.DataFrame) -> pd.DataFrame: + ''' + post_processing method for the formatter. + + Does not require any action. + ''' + + for col in self.int_columns: + processed_data[col] = processed_data[col].astype(int) + + logger.info("Data reverse-converted by IntValueFormatter.") + + return processed_data + + pass + +@hookimpl +def register(manager): + manager.register("IntValueFormatter", IntValueFormatter) + diff --git a/sdgx/data_processors/manager.py b/sdgx/data_processors/manager.py index 50df94ef..4c75cdfc 100644 --- a/sdgx/data_processors/manager.py +++ b/sdgx/data_processors/manager.py @@ -33,7 +33,7 @@ class DataProcessorManager(Manager): hookspecs_model = extension - preset_defalut_processors = [ p.lower() for p in ["NonValueTransformer"]] + preset_defalut_processors = [ p.lower() for p in ["NonValueTransformer",'IntValueFormatter']] @property def registed_data_processors(self): From 6b25bc1fe9227e31b97cf718ca758d1e11a69ba2 Mon Sep 17 00:00:00 2001 From: MooooCat Date: Wed, 6 Mar 2024 16:44:52 +0800 Subject: [PATCH 09/19] Update nan.py --- sdgx/data_processors/transformers/nan.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sdgx/data_processors/transformers/nan.py b/sdgx/data_processors/transformers/nan.py index f75c8c7f..8f52c4bb 100644 --- a/sdgx/data_processors/transformers/nan.py +++ b/sdgx/data_processors/transformers/nan.py @@ -14,6 +14,8 @@ class NonValueTransformer(Transformer): ''' fill_na_value = 0 + + drop_na = False def fit(self, metadata: Metadata | None = None): ''' @@ -32,7 +34,10 @@ def convert(self, raw_data: DataFrame) -> DataFrame: logger.info("Converting data using NonValueTransformer...") - res = raw_data.fillna(value= self.fill_na_value) + if self.drop: + res = raw_data.dropna() + else: + res = raw_data.fillna(value= self.fill_na_value) logger.info("Converting data using NonValueTransformer... Finished.") From c9f70248eccffe8a1b8978ae09ac1055fa73e3dd Mon Sep 17 00:00:00 2001 From: MooooCat Date: Wed, 6 Mar 2024 18:19:49 +0800 Subject: [PATCH 10/19] Create discrete.py still draft --- sdgx/data_processors/transformers/discrete.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 sdgx/data_processors/transformers/discrete.py diff --git a/sdgx/data_processors/transformers/discrete.py b/sdgx/data_processors/transformers/discrete.py new file mode 100644 index 00000000..d91e14e0 --- /dev/null +++ b/sdgx/data_processors/transformers/discrete.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from pandas import DataFrame +from sdgx.data_models.metadata import Metadata +from sdgx.data_processors.transformers.base import Transformer +from sdgx.data_processors.extension import hookimpl +from sdgx.utils import logger + +class DiscreteTransformer(Transformer): + ''' + DiscreteTransformer is an important component of sdgx, used to handle discrete columns. + + By default, DiscreteTransformer will perform one-hot encoding of discrete columns, and issue a warning message when dimensionality explosion occurs. + ''' + + discrete_columns = None + ''' + Record which columns are of discrete type. + ''' + + def fit(self, metadata: Metadata | None = None): + ''' + Fit method for the transformer. + ''' + + self.discrete_columns = metadata.get('discrete_columns') + + logger.info("DiscreteTransformer Fitted.") + return + + def convert(self, raw_data: DataFrame) -> DataFrame: + ''' + Convert method to handle discrete values in the input data. + ''' + + logger.info("Converting data using DiscreteTransformer...") + + if self.drop: + res = raw_data.dropna() + else: + res = raw_data.fillna(value= self.fill_na_value) + + logger.info("Converting data using DiscreteTransformer... Finished.") + + return res + + def reverse_convert(self, processed_data: DataFrame) -> DataFrame: + ''' + Reverse_convert method for the transformer. + + + ''' + + logger.info("Data reverse-converted by DiscreteTransformer.") + + return processed_data + + + + + pass \ No newline at end of file From 57b5a3e13e6f8e0f249da4aaecaa6017b68ca8ab Mon Sep 17 00:00:00 2001 From: MooooCat Date: Fri, 8 Mar 2024 11:50:44 +0800 Subject: [PATCH 11/19] update method typing --- sdgx/data_processors/formatters/int.py | 4 +++- sdgx/data_processors/transformers/base.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/sdgx/data_processors/formatters/int.py b/sdgx/data_processors/formatters/int.py index 62f19645..0ed4d35d 100644 --- a/sdgx/data_processors/formatters/int.py +++ b/sdgx/data_processors/formatters/int.py @@ -1,6 +1,8 @@ from __future__ import annotations import pandas as pd +from typing import Any + from sdgx.data_models.metadata import Metadata from sdgx.data_processors.formatters.base import Formatter from sdgx.data_processors.extension import hookimpl @@ -13,7 +15,7 @@ class IntValueFormatter(Formatter): int_columns = None - def fit(self, metadata: Metadata | None = None): + def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]): ''' Fit method for the formatter. diff --git a/sdgx/data_processors/transformers/base.py b/sdgx/data_processors/transformers/base.py index 0f4c1f96..cc39a47c 100644 --- a/sdgx/data_processors/transformers/base.py +++ b/sdgx/data_processors/transformers/base.py @@ -1,4 +1,8 @@ +import pandas as pd from sdgx.data_processors.base import DataProcessor +from sdgx.data_models.metadata import Metadata +from sdgx.data_loader import DataLoader +from sdgx.models.components.optimize.ndarray_loader import NDArrayLoader class Transformer(DataProcessor): @@ -10,3 +14,13 @@ class Transformer(DataProcessor): To achieve that, Transformer can use :ref:`Formatter` and :ref:`Inspector` to help. """ + + def fit(self, metadata: Metadata | None = None, tabular_data: DataLoader | pd.DataFrame = None): + ''' + Fit method for the transformer. + + 由于 ,Transformer 可能需要更多参数。 + ''' + + + return \ No newline at end of file From e12f1af203017f5a403e39b094ae2a5ccee7a37b Mon Sep 17 00:00:00 2001 From: MooooCat Date: Fri, 8 Mar 2024 13:12:40 +0800 Subject: [PATCH 12/19] Update discrete.py --- sdgx/data_processors/transformers/discrete.py | 70 ++++++++++++++++--- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/sdgx/data_processors/transformers/discrete.py b/sdgx/data_processors/transformers/discrete.py index d91e14e0..1d6cbfe0 100644 --- a/sdgx/data_processors/transformers/discrete.py +++ b/sdgx/data_processors/transformers/discrete.py @@ -1,10 +1,14 @@ from __future__ import annotations -from pandas import DataFrame +import pandas as pd +from sklearn.preprocessing import LabelEncoder, OneHotEncoder + from sdgx.data_models.metadata import Metadata from sdgx.data_processors.transformers.base import Transformer from sdgx.data_processors.extension import hookimpl from sdgx.utils import logger +from sdgx.data_loader import DataLoader +from sdgx.models.components.optimize.ndarray_loader import NDArrayLoader class DiscreteTransformer(Transformer): ''' @@ -13,38 +17,82 @@ class DiscreteTransformer(Transformer): By default, DiscreteTransformer will perform one-hot encoding of discrete columns, and issue a warning message when dimensionality explosion occurs. ''' - discrete_columns = None + discrete_columns: list = None ''' Record which columns are of discrete type. ''' - def fit(self, metadata: Metadata | None = None): + encoders: dict = {} + + onehot_encoder_handle_unknown='ignore' + + def fit(self, metadata: Metadata, tabular_data: DataLoader | pd.DataFrame): ''' Fit method for the transformer. ''' + logger.info("Fitting using DiscreteTransformer...") + self.discrete_columns = metadata.get('discrete_columns') + + # no discrete columns + if len(self.discrete_columns) == 0 : + logger.info("Fitting using DiscreteTransformer... Finished (No Columns).") + return + + # then, there are >= 1 discrete colums + for each_col in self.discrete_columns: + # fit each column + self._fit_column(tabular_data[[each_col]]) - logger.info("DiscreteTransformer Fitted.") + logger.info("Fitting using DiscreteTransformer... Finished.") + return + + def _fit_column(self, column_name: str, column_data: pd.DataFrame): + ''' + Fit every discrete columns in `_fit_column`. + + Args: + - column_data (pd.DataFrame): A dataframe containing a column. + - column_name: str: column name. - def convert(self, raw_data: DataFrame) -> DataFrame: + ''' + + self.encoders[column_name] = OneHotEncoder(handle_unknown= self.onehot_encoder_handle_unknown) + # fit the column data + self.encoders[column_name].fit(column_data) + + logger.info(f"Discrete column {column_name} fitted.") + + + def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame: ''' Convert method to handle discrete values in the input data. ''' logger.info("Converting data using DiscreteTransformer...") - if self.drop: - res = raw_data.dropna() - else: - res = raw_data.fillna(value= self.fill_na_value) + # TODO + # transform every discrete column into logger.info("Converting data using DiscreteTransformer... Finished.") + + # return the result + return + + def _transform_column(self, column_name: str, column_data: pd.DataFrame | pd.Series): + ''' + Transform every single discrete columns in `_transform_column`. - return res + Args: + - column_data (pd.DataFrame): A dataframe containing a column. + - column_name: str: column name. + + ''' + pass - def reverse_convert(self, processed_data: DataFrame) -> DataFrame: + def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame: ''' Reverse_convert method for the transformer. From 251f782b8f63f629a5d0b6ff7a65f2c514f265b9 Mon Sep 17 00:00:00 2001 From: MoooCat Date: Mon, 11 Mar 2024 10:44:55 +0800 Subject: [PATCH 13/19] update fit method kargs --- sdgx/data_processors/transformers/nan.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdgx/data_processors/transformers/nan.py b/sdgx/data_processors/transformers/nan.py index 8f52c4bb..53a3f012 100644 --- a/sdgx/data_processors/transformers/nan.py +++ b/sdgx/data_processors/transformers/nan.py @@ -1,6 +1,8 @@ from __future__ import annotations from pandas import DataFrame +from typing import Any + from sdgx.data_models.metadata import Metadata from sdgx.data_processors.transformers.base import Transformer from sdgx.data_processors.extension import hookimpl @@ -17,7 +19,7 @@ class NonValueTransformer(Transformer): drop_na = False - def fit(self, metadata: Metadata | None = None): + def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]): ''' Fit method for the transformer. From 52244b78432ddcfeb48c0cead4e69c733ce7e8e0 Mon Sep 17 00:00:00 2001 From: MoooCat Date: Mon, 11 Mar 2024 10:49:44 +0800 Subject: [PATCH 14/19] fix typo in nan.py --- sdgx/data_processors/transformers/nan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdgx/data_processors/transformers/nan.py b/sdgx/data_processors/transformers/nan.py index 53a3f012..ff07f8c1 100644 --- a/sdgx/data_processors/transformers/nan.py +++ b/sdgx/data_processors/transformers/nan.py @@ -36,7 +36,7 @@ def convert(self, raw_data: DataFrame) -> DataFrame: logger.info("Converting data using NonValueTransformer...") - if self.drop: + if self.drop_na: res = raw_data.dropna() else: res = raw_data.fillna(value= self.fill_na_value) From e1f5d737073fd8920c85f7fe579bcaa2e5290db7 Mon Sep 17 00:00:00 2001 From: MoooCat Date: Mon, 11 Mar 2024 10:50:16 +0800 Subject: [PATCH 15/19] intro col order, update base transformer --- sdgx/data_processors/transformers/base.py | 14 ++++- .../transformers/column_order.py | 58 +++++++++++++++++++ 2 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 sdgx/data_processors/transformers/column_order.py diff --git a/sdgx/data_processors/transformers/base.py b/sdgx/data_processors/transformers/base.py index cc39a47c..cd1860e2 100644 --- a/sdgx/data_processors/transformers/base.py +++ b/sdgx/data_processors/transformers/base.py @@ -18,9 +18,17 @@ class Transformer(DataProcessor): def fit(self, metadata: Metadata | None = None, tabular_data: DataLoader | pd.DataFrame = None): ''' Fit method for the transformer. - - 由于 ,Transformer 可能需要更多参数。 ''' - return \ No newline at end of file + return + + @staticmethod + def delete_column(tabular_data, column_name): + + pass + + @staticmethod + def attach_columns(tabular_data, new_columns): + + pass \ No newline at end of file diff --git a/sdgx/data_processors/transformers/column_order.py b/sdgx/data_processors/transformers/column_order.py new file mode 100644 index 00000000..82e6fbdb --- /dev/null +++ b/sdgx/data_processors/transformers/column_order.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from pandas import DataFrame +from sdgx.data_models.metadata import Metadata +from sdgx.data_processors.transformers.base import Transformer +from sdgx.data_processors.extension import hookimpl +from sdgx.utils import logger + +class ColumnOrderTransformer(Transformer): + ''' + Transformer class for handling missing values in data. + + This Transformer is mainly used as a reference for Transformer to facilitate developers to quickly understand the role of Transformer. + ''' + + column_list: list = None + ''' + The list of tabular data's columns. + ''' + + def fit(self, metadata: Metadata | None = None): + ''' + Fit method for the transformer. + + Remember the order of the columns. + ''' + + self.column_list = list(metadata.column_list) + + logger.info("ColumnOrderTransformer Fitted.") + + return + + def convert(self, raw_data: DataFrame) -> DataFrame: + ''' + Convert method to handle missing values in the input data. + ''' + logger.info("Converting data using ColumnOrderTransformer...") + logger.info("Converting data using ColumnOrderTransformer... Finished (No action).") + + return raw_data + + def reverse_convert(self, processed_data: DataFrame) -> DataFrame: + ''' + Reverse_convert method for the transformer. + ''' + + + logger.info("Data reverse-converted by ColumnOrderTransformer.") + + return processed_data + + pass + +@hookimpl +def register(manager): + manager.register("ColumnOrderTransformer", ColumnOrderTransformer) + From 655e60f134b413527a73565c7ac67a632791e1dd Mon Sep 17 00:00:00 2001 From: MoooCat Date: Mon, 11 Mar 2024 10:50:24 +0800 Subject: [PATCH 16/19] Update discrete.py --- sdgx/data_processors/transformers/discrete.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/sdgx/data_processors/transformers/discrete.py b/sdgx/data_processors/transformers/discrete.py index 1d6cbfe0..b5e8e617 100644 --- a/sdgx/data_processors/transformers/discrete.py +++ b/sdgx/data_processors/transformers/discrete.py @@ -1,7 +1,7 @@ from __future__ import annotations import pandas as pd -from sklearn.preprocessing import LabelEncoder, OneHotEncoder +from sklearn.preprocessing import OneHotEncoder from sdgx.data_models.metadata import Metadata from sdgx.data_processors.transformers.base import Transformer @@ -28,7 +28,7 @@ class DiscreteTransformer(Transformer): def fit(self, metadata: Metadata, tabular_data: DataLoader | pd.DataFrame): ''' - Fit method for the transformer. + Fit method for the DiscreteTransformer. ''' logger.info("Fitting using DiscreteTransformer...") @@ -43,7 +43,7 @@ def fit(self, metadata: Metadata, tabular_data: DataLoader | pd.DataFrame): # then, there are >= 1 discrete colums for each_col in self.discrete_columns: # fit each column - self._fit_column(tabular_data[[each_col]]) + self._fit_column(each_col, tabular_data[[each_col]]) logger.info("Fitting using DiscreteTransformer... Finished.") @@ -56,7 +56,6 @@ def _fit_column(self, column_name: str, column_data: pd.DataFrame): Args: - column_data (pd.DataFrame): A dataframe containing a column. - column_name: str: column name. - ''' self.encoders[column_name] = OneHotEncoder(handle_unknown= self.onehot_encoder_handle_unknown) @@ -75,6 +74,15 @@ def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame: # TODO # transform every discrete column into + if len(self.discrete_columns) == 0: + logger.info("Converting data using DiscreteTransformer... Finished (No column).") + return + + for each_col in self.discrete_columns: + new_onehot_column_set = self.encoders[each_col].transform(raw_data[[each_col]]) + # TODO 1- add new_onehot_column_set into the original dataframe + # TODO 2- delete the original column + logger.info(f"Column {each_col} converted.") logger.info("Converting data using DiscreteTransformer... Finished.") @@ -103,7 +111,4 @@ def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame: return processed_data - - - pass \ No newline at end of file From 57237d1d50d79757d92fdaa8d00a8a5fa09c1a0a Mon Sep 17 00:00:00 2001 From: MooooCat Date: Mon, 11 Mar 2024 11:55:14 +0800 Subject: [PATCH 17/19] intro ChinaMainlandAddressInspector --- sdgx/data_models/inspectors/personal.py | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sdgx/data_models/inspectors/personal.py b/sdgx/data_models/inspectors/personal.py index fb8d2454..30b1ef97 100644 --- a/sdgx/data_models/inspectors/personal.py +++ b/sdgx/data_models/inspectors/personal.py @@ -73,6 +73,30 @@ def domain_verification(self, each_sample): return False return True +class ChinaMainlandAddressInspector(RegexInspector): + + # This regular expression does not take effect and is only for reference by developers. + # pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市)|[\u4e00-\u9fa5]{2,}(市|区|县|自治州|自治县|县级市|地区|盟|林区)?|[\u4e00-\u9fa5]{0,}(街道|镇|乡)?|[\u4e00-\u9fa5]{0,}(路|街|巷|弄)?|[\u4e00-\u9fa5]{0,}(号|弄)?$" + + pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市|县|村|弄|乡|路|街)" + + pii = True + + data_type_name = "china_mainland_address" + + _inspect_level = 30 + + def domain_verification(self, each_sample): + # CHN address should be between 8 - 30 characters + if len(each_sample) < 8: + return False + if len(each_sample) > 30: + return False + # notice to distinguishing from the company name + if each_sample.endswith('公司'): + return False + return True + @hookimpl def register(manager): @@ -85,3 +109,6 @@ def register(manager): manager.register("ChinaMainlandPostCode", ChinaMainlandPostCode) manager.register("ChinaMainlandUnifiedSocialCreditCode", ChinaMainlandUnifiedSocialCreditCode) + + manager.register("ChinaMainlandAddressInspector", ChinaMainlandAddressInspector) + From 90510d93851fdf377bd593f9f9fb7c4337c9397d Mon Sep 17 00:00:00 2001 From: MooooCat Date: Mon, 11 Mar 2024 11:55:22 +0800 Subject: [PATCH 18/19] add test cases --- tests/data_models/inspector/test_personal.py | 29 ++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/data_models/inspector/test_personal.py b/tests/data_models/inspector/test_personal.py index c7d27490..1477c87d 100644 --- a/tests/data_models/inspector/test_personal.py +++ b/tests/data_models/inspector/test_personal.py @@ -1,21 +1,21 @@ +import pytest import datetime import random import string - import pandas as pd -import pytest + from faker import Faker +fake = Faker(locale="zh_CN") from sdgx.data_models.inspectors.personal import ( + EmailInspector, ChinaMainlandIDInspector, ChinaMainlandMobilePhoneInspector, ChinaMainlandPostCode, ChinaMainlandUnifiedSocialCreditCode, - EmailInspector, + ChinaMainlandAddressInspector ) -fake = Faker(locale="zh_CN") - def generate_uniform_credit_code(): # generate china mainland 统一社会信用代码 for test @@ -193,6 +193,25 @@ def test_chn_uscc_inspector_generated_data(chn_personal_test_df: pd.DataFrame): assert inspector_USCC.inspect_level == 30 assert inspector_USCC.pii is True +# CHN address +def test_chn_address_inspector_demo_data(raw_data): + inspector_CHN_Address = ChinaMainlandAddressInspector() + inspector_CHN_Address.fit(raw_data) + assert not inspector_CHN_Address.regex_columns + assert sorted(inspector_CHN_Address.inspect()["china_mainland_address_columns"]) == sorted([]) + assert inspector_CHN_Address.inspect_level == 30 + assert inspector_CHN_Address.pii is True + + +def test_chn_address_inspector_generated_data(chn_personal_test_df: pd.DataFrame): + inspector_CHN_Address = ChinaMainlandAddressInspector() + inspector_CHN_Address.fit(chn_personal_test_df) + # assert inspector_CHN_Address.regex_columns + assert sorted(inspector_CHN_Address.inspect()["china_mainland_address_columns"]) == sorted( + ["chn_address"] + ) + assert inspector_CHN_Address.inspect_level == 30 + assert inspector_CHN_Address.pii is True if __name__ == "__main__": pytest.main(["-vv", "-s", __file__]) From 4af24aba8ed3c525fbee5686c80dab00dadb8b77 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Mar 2024 03:56:41 +0000 Subject: [PATCH 19/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sdgx/data_models/inspectors/personal.py | 10 +- sdgx/data_processors/formatters/base.py | 12 +-- sdgx/data_processors/formatters/int.py | 46 ++++----- sdgx/data_processors/manager.py | 52 +++++----- sdgx/data_processors/transformers/base.py | 20 ++-- .../transformers/column_order.py | 41 ++++---- sdgx/data_processors/transformers/discrete.py | 98 ++++++++++--------- sdgx/data_processors/transformers/nan.py | 50 +++++----- sdgx/synthesizer.py | 4 +- tests/data_models/inspector/test_personal.py | 11 ++- 10 files changed, 180 insertions(+), 164 deletions(-) diff --git a/sdgx/data_models/inspectors/personal.py b/sdgx/data_models/inspectors/personal.py index 30b1ef97..3a8e84dd 100644 --- a/sdgx/data_models/inspectors/personal.py +++ b/sdgx/data_models/inspectors/personal.py @@ -73,13 +73,14 @@ def domain_verification(self, each_sample): return False return True + class ChinaMainlandAddressInspector(RegexInspector): - # This regular expression does not take effect and is only for reference by developers. + # This regular expression does not take effect and is only for reference by developers. # pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市)|[\u4e00-\u9fa5]{2,}(市|区|县|自治州|自治县|县级市|地区|盟|林区)?|[\u4e00-\u9fa5]{0,}(街道|镇|乡)?|[\u4e00-\u9fa5]{0,}(路|街|巷|弄)?|[\u4e00-\u9fa5]{0,}(号|弄)?$" pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市|县|村|弄|乡|路|街)" - + pii = True data_type_name = "china_mainland_address" @@ -93,10 +94,10 @@ def domain_verification(self, each_sample): if len(each_sample) > 30: return False # notice to distinguishing from the company name - if each_sample.endswith('公司'): + if each_sample.endswith("公司"): return False return True - + @hookimpl def register(manager): @@ -111,4 +112,3 @@ def register(manager): manager.register("ChinaMainlandUnifiedSocialCreditCode", ChinaMainlandUnifiedSocialCreditCode) manager.register("ChinaMainlandAddressInspector", ChinaMainlandAddressInspector) - diff --git a/sdgx/data_processors/formatters/base.py b/sdgx/data_processors/formatters/base.py index 201df19d..83b810b9 100644 --- a/sdgx/data_processors/formatters/base.py +++ b/sdgx/data_processors/formatters/base.py @@ -1,6 +1,7 @@ from __future__ import annotations -import pandas as pd +import pandas as pd + from sdgx.data_processors.base import DataProcessor @@ -30,17 +31,16 @@ def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame: pd.DataFrame: Raw data """ return self.post_processing(processed_data) - - + def post_processing(self, processed_data: pd.DataFrame) -> pd.DataFrame: - ''' + """ For formatter, please rewrite this method. - + Args: processed_data (pd.DataFrame): Processed data Returns: pd.DataFrame: Raw data - ''' + """ return processed_data diff --git a/sdgx/data_processors/formatters/int.py b/sdgx/data_processors/formatters/int.py index 0ed4d35d..8e6e319f 100644 --- a/sdgx/data_processors/formatters/int.py +++ b/sdgx/data_processors/formatters/int.py @@ -1,49 +1,51 @@ from __future__ import annotations -import pandas as pd from typing import Any -from sdgx.data_models.metadata import Metadata -from sdgx.data_processors.formatters.base import Formatter +import pandas as pd + +from sdgx.data_models.metadata import Metadata from sdgx.data_processors.extension import hookimpl +from sdgx.data_processors.formatters.base import Formatter from sdgx.utils import logger + class IntValueFormatter(Formatter): - ''' + """ Formatter class for handling Int values in pd.DataFrame. - ''' + """ int_columns = None def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]): - ''' - Fit method for the formatter. - + """ + Fit method for the formatter. + Formatter need to use metadata to record which columns belong to the int type, and convert them back to the int type during post-processing. - ''' - - # get from metadata + """ + + # get from metadata self.int_columns = metadata.get("int_columns") - + logger.info("IntValueFormatter Fitted.") - return + return def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame: - ''' + """ Convert method to handle missing values in the input data. - ''' - + """ + logger.info("Converting data using IntValueFormatter... Finished (No Action).") return raw_data - + def post_processing(self, processed_data: pd.DataFrame) -> pd.DataFrame: - ''' - post_processing method for the formatter. - + """ + post_processing method for the formatter. + Does not require any action. - ''' + """ for col in self.int_columns: processed_data[col] = processed_data[col].astype(int) @@ -54,7 +56,7 @@ def post_processing(self, processed_data: pd.DataFrame) -> pd.DataFrame: pass + @hookimpl def register(manager): manager.register("IntValueFormatter", IntValueFormatter) - diff --git a/sdgx/data_processors/manager.py b/sdgx/data_processors/manager.py index 4c75cdfc..34105d00 100644 --- a/sdgx/data_processors/manager.py +++ b/sdgx/data_processors/manager.py @@ -8,11 +8,12 @@ from sdgx.data_processors.extension import project_name as PROJECT_NAME from sdgx.manager import Manager + class DataProcessorManager(Manager): """ This is a plugin management class for data processing components. - - Properties: + + Properties: - register_type: Specifies the type of data processors to register. - project_name: Stores the project name from the extension module. - hookspecs_model: Stores the hook specifications model from the extension module. @@ -27,63 +28,66 @@ class DataProcessorManager(Manager): - init_default_processors: Initializes default processors that are both registered and preset. """ + register_type = DataProcessor - + project_name = PROJECT_NAME - + hookspecs_model = extension - preset_defalut_processors = [ p.lower() for p in ["NonValueTransformer",'IntValueFormatter']] - + preset_defalut_processors = [p.lower() for p in ["NonValueTransformer", "IntValueFormatter"]] + @property def registed_data_processors(self): - ''' + """ This property returns all registered data processors - ''' + """ return self.registed_cls - + @property def registed_default_processor_list(self): - ''' + """ This property returns all registered default data processors - ''' + """ registed_processor_list = self.registed_data_processors.keys() - - # calculate intersection - target_processors = list(set(registed_processor_list).intersection(self.preset_defalut_processors)) - + + # calculate intersection + target_processors = list( + set(registed_processor_list).intersection(self.preset_defalut_processors) + ) + return target_processors def load_all_local_model(self): - ''' + """ loads all local models - ''' + """ self._load_dir(data_processors.formatters) self._load_dir(data_processors.generators) self._load_dir(data_processors.samplers) self._load_dir(data_processors.transformers) def init_data_processor(self, processor_name, **kwargs: dict[str, Any]) -> DataProcessor: - ''' + """ Initializes a data processor with the given name and parameters - ''' + """ return self.init(processor_name, **kwargs) def init_all_processors(self, **kwargs: Any) -> list[DataProcessor]: - ''' + """ Initializes all registered data processors - ''' + """ return [ self.init(processor_name, **kwargs) for processor_name in self.registed_data_processors.keys() ] def init_default_processors(self, **kwargs: Any) -> list[DataProcessor]: - ''' + """ Initializes all default data processors - ''' + """ return [ self.init(processor_name, **kwargs) for processor_name in self.registed_default_processor_list - ] \ No newline at end of file + ] diff --git a/sdgx/data_processors/transformers/base.py b/sdgx/data_processors/transformers/base.py index cd1860e2..5f40ff27 100644 --- a/sdgx/data_processors/transformers/base.py +++ b/sdgx/data_processors/transformers/base.py @@ -1,7 +1,8 @@ -import pandas as pd -from sdgx.data_processors.base import DataProcessor -from sdgx.data_models.metadata import Metadata +import pandas as pd + from sdgx.data_loader import DataLoader +from sdgx.data_models.metadata import Metadata +from sdgx.data_processors.base import DataProcessor from sdgx.models.components.optimize.ndarray_loader import NDArrayLoader @@ -16,19 +17,18 @@ class Transformer(DataProcessor): """ def fit(self, metadata: Metadata | None = None, tabular_data: DataLoader | pd.DataFrame = None): - ''' - Fit method for the transformer. - ''' - + """ + Fit method for the transformer. + """ - return + return @staticmethod - def delete_column(tabular_data, column_name): + def delete_column(tabular_data, column_name): pass @staticmethod def attach_columns(tabular_data, new_columns): - pass \ No newline at end of file + pass diff --git a/sdgx/data_processors/transformers/column_order.py b/sdgx/data_processors/transformers/column_order.py index 82e6fbdb..253dbe47 100644 --- a/sdgx/data_processors/transformers/column_order.py +++ b/sdgx/data_processors/transformers/column_order.py @@ -1,50 +1,51 @@ from __future__ import annotations -from pandas import DataFrame -from sdgx.data_models.metadata import Metadata -from sdgx.data_processors.transformers.base import Transformer +from pandas import DataFrame + +from sdgx.data_models.metadata import Metadata from sdgx.data_processors.extension import hookimpl +from sdgx.data_processors.transformers.base import Transformer from sdgx.utils import logger + class ColumnOrderTransformer(Transformer): - ''' + """ Transformer class for handling missing values in data. This Transformer is mainly used as a reference for Transformer to facilitate developers to quickly understand the role of Transformer. - ''' + """ column_list: list = None - ''' + """ The list of tabular data's columns. - ''' + """ def fit(self, metadata: Metadata | None = None): - ''' - Fit method for the transformer. - + """ + Fit method for the transformer. + Remember the order of the columns. - ''' + """ self.column_list = list(metadata.column_list) logger.info("ColumnOrderTransformer Fitted.") - return + return def convert(self, raw_data: DataFrame) -> DataFrame: - ''' + """ Convert method to handle missing values in the input data. - ''' + """ logger.info("Converting data using ColumnOrderTransformer...") logger.info("Converting data using ColumnOrderTransformer... Finished (No action).") return raw_data - - def reverse_convert(self, processed_data: DataFrame) -> DataFrame: - ''' - Reverse_convert method for the transformer. - ''' + def reverse_convert(self, processed_data: DataFrame) -> DataFrame: + """ + Reverse_convert method for the transformer. + """ logger.info("Data reverse-converted by ColumnOrderTransformer.") @@ -52,7 +53,7 @@ def reverse_convert(self, processed_data: DataFrame) -> DataFrame: pass + @hookimpl def register(manager): manager.register("ColumnOrderTransformer", ColumnOrderTransformer) - diff --git a/sdgx/data_processors/transformers/discrete.py b/sdgx/data_processors/transformers/discrete.py index b5e8e617..ad77621e 100644 --- a/sdgx/data_processors/transformers/discrete.py +++ b/sdgx/data_processors/transformers/discrete.py @@ -1,114 +1,116 @@ from __future__ import annotations -import pandas as pd +import pandas as pd from sklearn.preprocessing import OneHotEncoder -from sdgx.data_models.metadata import Metadata -from sdgx.data_processors.transformers.base import Transformer -from sdgx.data_processors.extension import hookimpl -from sdgx.utils import logger from sdgx.data_loader import DataLoader +from sdgx.data_models.metadata import Metadata +from sdgx.data_processors.extension import hookimpl +from sdgx.data_processors.transformers.base import Transformer from sdgx.models.components.optimize.ndarray_loader import NDArrayLoader +from sdgx.utils import logger + class DiscreteTransformer(Transformer): - ''' + """ DiscreteTransformer is an important component of sdgx, used to handle discrete columns. - + By default, DiscreteTransformer will perform one-hot encoding of discrete columns, and issue a warning message when dimensionality explosion occurs. - ''' + """ discrete_columns: list = None - ''' + """ Record which columns are of discrete type. - ''' + """ encoders: dict = {} - onehot_encoder_handle_unknown='ignore' + onehot_encoder_handle_unknown = "ignore" def fit(self, metadata: Metadata, tabular_data: DataLoader | pd.DataFrame): - ''' - Fit method for the DiscreteTransformer. - ''' + """ + Fit method for the DiscreteTransformer. + """ logger.info("Fitting using DiscreteTransformer...") - self.discrete_columns = metadata.get('discrete_columns') - - # no discrete columns - if len(self.discrete_columns) == 0 : + self.discrete_columns = metadata.get("discrete_columns") + + # no discrete columns + if len(self.discrete_columns) == 0: logger.info("Fitting using DiscreteTransformer... Finished (No Columns).") - return + return # then, there are >= 1 discrete colums for each_col in self.discrete_columns: - # fit each column + # fit each column self._fit_column(each_col, tabular_data[[each_col]]) logger.info("Fitting using DiscreteTransformer... Finished.") - - return - + + return + def _fit_column(self, column_name: str, column_data: pd.DataFrame): - ''' + """ Fit every discrete columns in `_fit_column`. Args: - column_data (pd.DataFrame): A dataframe containing a column. - column_name: str: column name. - ''' + """ - self.encoders[column_name] = OneHotEncoder(handle_unknown= self.onehot_encoder_handle_unknown) + self.encoders[column_name] = OneHotEncoder( + handle_unknown=self.onehot_encoder_handle_unknown + ) # fit the column data self.encoders[column_name].fit(column_data) - + logger.info(f"Discrete column {column_name} fitted.") - def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame: - ''' + """ Convert method to handle discrete values in the input data. - ''' + """ logger.info("Converting data using DiscreteTransformer...") - # TODO - # transform every discrete column into + # TODO + # transform every discrete column into if len(self.discrete_columns) == 0: logger.info("Converting data using DiscreteTransformer... Finished (No column).") - return - + return + for each_col in self.discrete_columns: new_onehot_column_set = self.encoders[each_col].transform(raw_data[[each_col]]) # TODO 1- add new_onehot_column_set into the original dataframe - # TODO 2- delete the original column + # TODO 2- delete the original column logger.info(f"Column {each_col} converted.") - + logger.info("Converting data using DiscreteTransformer... Finished.") - + # return the result - return - + return + def _transform_column(self, column_name: str, column_data: pd.DataFrame | pd.Series): - ''' + """ Transform every single discrete columns in `_transform_column`. Args: - column_data (pd.DataFrame): A dataframe containing a column. - column_name: str: column name. - ''' + """ pass - + def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame: - ''' - Reverse_convert method for the transformer. - - - ''' + """ + Reverse_convert method for the transformer. + + + """ logger.info("Data reverse-converted by DiscreteTransformer.") return processed_data - pass \ No newline at end of file + pass diff --git a/sdgx/data_processors/transformers/nan.py b/sdgx/data_processors/transformers/nan.py index ff07f8c1..053062ab 100644 --- a/sdgx/data_processors/transformers/nan.py +++ b/sdgx/data_processors/transformers/nan.py @@ -1,63 +1,65 @@ from __future__ import annotations -from pandas import DataFrame -from typing import Any +from typing import Any -from sdgx.data_models.metadata import Metadata -from sdgx.data_processors.transformers.base import Transformer +from pandas import DataFrame + +from sdgx.data_models.metadata import Metadata from sdgx.data_processors.extension import hookimpl +from sdgx.data_processors.transformers.base import Transformer from sdgx.utils import logger + class NonValueTransformer(Transformer): - ''' + """ Transformer class for handling missing values in data. This Transformer is mainly used as a reference for Transformer to facilitate developers to quickly understand the role of Transformer. - ''' + """ + + fill_na_value = 0 - fill_na_value = 0 - drop_na = False - def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]): - ''' - Fit method for the transformer. - + def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]): + """ + Fit method for the transformer. + Does not require any action. - ''' + """ logger.info("NonValueTransformer Fitted.") - return + return def convert(self, raw_data: DataFrame) -> DataFrame: - ''' + """ Convert method to handle missing values in the input data. - ''' + """ logger.info("Converting data using NonValueTransformer...") if self.drop_na: res = raw_data.dropna() else: - res = raw_data.fillna(value= self.fill_na_value) - + res = raw_data.fillna(value=self.fill_na_value) + logger.info("Converting data using NonValueTransformer... Finished.") return res - + def reverse_convert(self, processed_data: DataFrame) -> DataFrame: - ''' - Reverse_convert method for the transformer. - + """ + Reverse_convert method for the transformer. + Does not require any action. - ''' + """ logger.info("Data reverse-converted by NonValueTransformer (No Action).") return processed_data pass + @hookimpl def register(manager): manager.register("NonValueTransformer", NonValueTransformer) - diff --git a/sdgx/synthesizer.py b/sdgx/synthesizer.py index 0f38c72b..9b2daab8 100644 --- a/sdgx/synthesizer.py +++ b/sdgx/synthesizer.py @@ -291,7 +291,9 @@ def fit( start_time = time.time() for d in self.data_processors: d.fit(metadata) - logger.info(f"Fitted {len(self.data_processors)} data processors in {time.time() - start_time}s.") + logger.info( + f"Fitted {len(self.data_processors)} data processors in {time.time() - start_time}s." + ) def chunk_generator() -> Generator[pd.DataFrame, None, None]: for chunk in self.dataloader.iter(): diff --git a/tests/data_models/inspector/test_personal.py b/tests/data_models/inspector/test_personal.py index 1477c87d..9f417ac9 100644 --- a/tests/data_models/inspector/test_personal.py +++ b/tests/data_models/inspector/test_personal.py @@ -1,19 +1,20 @@ -import pytest import datetime import random import string -import pandas as pd +import pandas as pd +import pytest from faker import Faker + fake = Faker(locale="zh_CN") from sdgx.data_models.inspectors.personal import ( - EmailInspector, + ChinaMainlandAddressInspector, ChinaMainlandIDInspector, ChinaMainlandMobilePhoneInspector, ChinaMainlandPostCode, ChinaMainlandUnifiedSocialCreditCode, - ChinaMainlandAddressInspector + EmailInspector, ) @@ -193,6 +194,7 @@ def test_chn_uscc_inspector_generated_data(chn_personal_test_df: pd.DataFrame): assert inspector_USCC.inspect_level == 30 assert inspector_USCC.pii is True + # CHN address def test_chn_address_inspector_demo_data(raw_data): inspector_CHN_Address = ChinaMainlandAddressInspector() @@ -213,5 +215,6 @@ def test_chn_address_inspector_generated_data(chn_personal_test_df: pd.DataFrame assert inspector_CHN_Address.inspect_level == 30 assert inspector_CHN_Address.pii is True + if __name__ == "__main__": pytest.main(["-vv", "-s", __file__])