Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CHN address inspector #157

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions sdgx/data_models/inspectors/personal.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,31 @@ def domain_verification(self, each_sample):
return True


class ChinaMainlandAddressInspector(RegexInspector):
    """Inspector describing China-mainland postal addresses.

    The class supplies a regular expression plus an extra per-sample check.
    How ``pattern`` is applied is decided by ``RegexInspector`` (presumably a
    regex match against each sample; confirm against the base class).
    """

    # Fuller reference expression kept for developers only; it is NOT used at runtime.
    # pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市)|[\u4e00-\u9fa5]{2,}(市|区|县|自治州|自治县|县级市|地区|盟|林区)?|[\u4e00-\u9fa5]{0,}(街道|镇|乡)?|[\u4e00-\u9fa5]{0,}(路|街|巷|弄)?|[\u4e00-\u9fa5]{0,}(号|弄)?$"

    # Effective expression: two or more CJK characters followed by an
    # administrative-division or street suffix, anchored at the start.
    pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市|县|村|弄|乡|路|街)"

    pii = True

    data_type_name = "china_mainland_address"

    _inspect_level = 30

    def domain_verification(self, each_sample):
        """Apply address-specific sanity checks to one sample.

        Args:
            each_sample: The candidate value; it must support ``len()`` and
                ``endswith`` (i.e. behave like a string).

        Returns:
            bool: ``False`` when the sample is shorter than 8 or longer than
            30 characters, or when it ends with the company suffix; ``True``
            otherwise.
        """
        # A mainland address is expected to be 8 to 30 characters long.
        if not 8 <= len(each_sample) <= 30:
            return False
        # A trailing company suffix marks a company name, not an address.
        return not each_sample.endswith("公司")


@hookimpl
def register(manager):
manager.register("EmailInspector", EmailInspector)
Expand All @@ -85,3 +110,5 @@ def register(manager):
manager.register("ChinaMainlandPostCode", ChinaMainlandPostCode)

manager.register("ChinaMainlandUnifiedSocialCreditCode", ChinaMainlandUnifiedSocialCreditCode)

manager.register("ChinaMainlandAddressInspector", ChinaMainlandAddressInspector)
28 changes: 28 additions & 0 deletions sdgx/data_processors/formatters/base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
from __future__ import annotations

import pandas as pd

from sdgx.data_processors.base import DataProcessor


Expand All @@ -16,3 +20,27 @@ class Formatter(DataProcessor):
- :ref:`Transformer` sometimes implements some functions with the help of Formatter.

"""

def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
    """Turn processed data back into raw data.

    This is a thin entry point: the actual work is delegated to
    ``self.post_processing``, which concrete formatters override.

    Args:
        processed_data (pd.DataFrame): Processed data

    Returns:
        pd.DataFrame: Whatever ``post_processing`` returns for the input.
    """
    raw_data = self.post_processing(processed_data)
    return raw_data

def post_processing(self, processed_data: pd.DataFrame) -> pd.DataFrame:
    """
    Hook for subclasses: override this method to undo the formatter's conversion.

    The base implementation performs no work and hands the input back unchanged.

    Args:
        processed_data (pd.DataFrame): Processed data

    Returns:
        pd.DataFrame: The very same object that was passed in.
    """
    # Default behaviour is the identity; concrete formatters supply the real logic.
    return processed_data
62 changes: 62 additions & 0 deletions sdgx/data_processors/formatters/int.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

from typing import Any

import pandas as pd

from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.extension import hookimpl
from sdgx.data_processors.formatters.base import Formatter
from sdgx.utils import logger


class IntValueFormatter(Formatter):
    """
    Formatter that casts integer columns back to ``int`` after processing.

    ``fit`` records the column names stored under ``"int_columns"`` in the
    metadata, ``convert`` leaves the data untouched, and ``post_processing``
    casts every recorded column back to ``int``.
    """

    # Names of the columns to cast back to int; ``None`` until ``fit`` has run.
    int_columns = None

    def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
        """
        Record which columns are of int type.

        Args:
            metadata: Object whose ``get("int_columns")`` yields the int column
                names. The signature allows ``None``, but ``metadata.get`` is
                called unconditionally, so a real metadata object is required.
            **kwargs: Accepted and ignored.
        """

        # Remember the int columns so post_processing can restore their dtype.
        self.int_columns = metadata.get("int_columns")

        logger.info("IntValueFormatter Fitted.")

        return

    def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        """
        Forward conversion: a no-op for this formatter.

        Returns:
            pd.DataFrame: ``raw_data`` itself, unmodified.
        """

        logger.info("Converting data using IntValueFormatter... Finished (No Action).")

        return raw_data

    def post_processing(self, processed_data: pd.DataFrame) -> pd.DataFrame:
        """
        Cast every recorded int column of ``processed_data`` back to ``int``.

        The frame is modified in place and also returned. When no int columns
        were recorded (formatter not fitted, or the metadata had none) the
        data is returned unchanged instead of failing on ``None``.

        Returns:
            pd.DataFrame: ``processed_data`` with the recorded columns cast to int.
        """

        # ``or ()`` guards against ``int_columns`` still being None.
        for col in self.int_columns or ():
            # NOTE(review): ``astype(int)`` raises on NaN/inf values; confirm
            # upstream processors never leave missing values in these columns.
            processed_data[col] = processed_data[col].astype(int)

        logger.info("Data reverse-converted by IntValueFormatter.")

        return processed_data


@hookimpl
def register(manager):
    """Plugin hook: register ``IntValueFormatter`` with ``manager`` under its class name."""
    manager.register("IntValueFormatter", IntValueFormatter)
65 changes: 65 additions & 0 deletions sdgx/data_processors/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,84 @@


class DataProcessorManager(Manager):
    """
    This is a plugin management class for data processing components.

    Properties:
        - register_type: Specifies the type of data processors to register.
        - project_name: Stores the project name from the extension module.
        - hookspecs_model: Stores the hook specifications model from the extension module.
        - preset_defalut_processors: Stores a list of default processor names in lowercase.
          (The attribute name keeps its historical spelling so existing references still work.)
        - registed_data_processors: Property that returns the registered data processors.
        - registed_default_processor_list: Property that returns the registered default data processors.

    Methods:
        - load_all_local_model: Loads all local models for formatters, generators, samplers, and transformers.
        - init_data_processor: Initializes a data processor with the given name and keyword arguments.
        - init_all_processors: Initializes all registered data processors with optional keyword arguments.
        - init_default_processors: Initializes default processors that are both registered and preset.

    """

    register_type = DataProcessor

    project_name = PROJECT_NAME

    hookspecs_model = extension

    # Lower-cased names of the processors that are enabled by default, in the
    # order in which they should be initialised.
    preset_defalut_processors = [p.lower() for p in ["NonValueTransformer", "IntValueFormatter"]]

    @property
    def registed_data_processors(self):
        """
        This property returns all registered data processors
        (the ``registed_cls`` mapping inherited from the base manager).
        """
        return self.registed_cls

    @property
    def registed_default_processor_list(self):
        """
        This property returns all registered default data processors.

        Returns:
            list: Names that are both preset as default and actually
            registered, in the order of ``preset_defalut_processors``.
        """
        registered_names = set(self.registed_data_processors.keys())

        # Filter the preset list instead of intersecting two sets: a set
        # intersection yields an arbitrary order that can differ between runs,
        # while processors must be applied in a reproducible sequence.
        # ``dict.fromkeys`` drops duplicate preset entries but keeps their order.
        return [
            name
            for name in dict.fromkeys(self.preset_defalut_processors)
            if name in registered_names
        ]

    def load_all_local_model(self):
        """
        Loads all local models from the formatters, generators, samplers and
        transformers sub-packages of ``data_processors``.
        """
        self._load_dir(data_processors.formatters)
        self._load_dir(data_processors.generators)
        self._load_dir(data_processors.samplers)
        self._load_dir(data_processors.transformers)

    def init_data_processor(self, processor_name, **kwargs: Any) -> DataProcessor:
        """
        Initializes a data processor with the given name and parameters.

        Args:
            processor_name: Name under which the processor was registered.
            **kwargs: Forwarded unchanged to ``self.init``.

        Returns:
            DataProcessor: The object returned by ``self.init``.
        """
        return self.init(processor_name, **kwargs)

    def init_all_processors(self, **kwargs: Any) -> list[DataProcessor]:
        """
        Initializes all registered data processors.

        Every processor receives the same ``kwargs``.
        """
        return [
            self.init(processor_name, **kwargs)
            for processor_name in self.registed_data_processors.keys()
        ]

    def init_default_processors(self, **kwargs: Any) -> list[DataProcessor]:
        """
        Initializes all default data processors.

        Only the names reported by ``registed_default_processor_list`` are
        initialised; every processor receives the same ``kwargs``.
        """

        return [
            self.init(processor_name, **kwargs)
            for processor_name in self.registed_default_processor_list
        ]
22 changes: 22 additions & 0 deletions sdgx/data_processors/transformers/base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import pandas as pd

from sdgx.data_loader import DataLoader
from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.base import DataProcessor
from sdgx.models.components.optimize.ndarray_loader import NDArrayLoader


class Transformer(DataProcessor):
Expand All @@ -10,3 +15,20 @@ class Transformer(DataProcessor):

To achieve that, Transformer can use :ref:`Formatter` and :ref:`Inspector` to help.
"""

def fit(
    self,
    metadata: "Metadata | None" = None,
    tabular_data: "DataLoader | pd.DataFrame | None" = None,
):
    """
    Fit method for the transformer.

    The base implementation does nothing; subclasses override it to learn
    whatever state they need.

    Args:
        metadata: Metadata describing the table, or ``None``.
        tabular_data: The data itself, as a ``DataLoader`` or a DataFrame, or ``None``.

    Returns:
        None
    """

    # The annotations are quoted on purpose: the ``X | None`` union syntax is
    # only evaluable at definition time on Python >= 3.10, and this module
    # does not enable postponed evaluation of annotations.
    return

@staticmethod
def delete_column(tabular_data, column_name):
    """Placeholder for removing a column from ``tabular_data``.

    Not implemented yet: the arguments are ignored and ``None`` is returned.
    """
    return None

@staticmethod
def attach_columns(tabular_data, new_columns):
    """Placeholder for adding columns to ``tabular_data``.

    Not implemented yet: the arguments are ignored and ``None`` is returned.
    """
    return None
59 changes: 59 additions & 0 deletions sdgx/data_processors/transformers/column_order.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from __future__ import annotations

from pandas import DataFrame

from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.extension import hookimpl
from sdgx.data_processors.transformers.base import Transformer
from sdgx.utils import logger


class ColumnOrderTransformer(Transformer):
    """
    Transformer that remembers the column order of the table and restores it
    when data is reverse-converted.

    This Transformer is mainly used as a reference for Transformer to facilitate developers to quickly understand the role of Transformer.
    """

    # The list of tabular data's columns, in their original order.
    # ``None`` until ``fit`` has run.
    column_list: list = None

    def fit(self, metadata: Metadata | None = None, **kwargs):
        """
        Fit method for the transformer.

        Remember the order of the columns.

        Args:
            metadata: Object exposing ``column_list``. The signature allows
                ``None``, but ``metadata.column_list`` is read unconditionally,
                so a real metadata object is required.
            **kwargs: Accepted and ignored, so callers that pass the extra
                arguments of the base ``fit`` signature keep working.
        """

        # Copy into a fresh list so later changes to the metadata do not leak in.
        self.column_list = list(metadata.column_list)

        logger.info("ColumnOrderTransformer Fitted.")

        return

    def convert(self, raw_data: DataFrame) -> DataFrame:
        """
        Forward conversion: a no-op, the data is returned unchanged.
        """
        logger.info("Converting data using ColumnOrderTransformer...")
        logger.info("Converting data using ColumnOrderTransformer... Finished (No action).")

        return raw_data

    def reverse_convert(self, processed_data: DataFrame) -> DataFrame:
        """
        Put the columns of ``processed_data`` back into the remembered order.

        Columns recorded by ``fit`` come first, in their original order.
        Columns unknown to ``fit`` are kept and appended in their current
        order; recorded columns missing from the data are skipped. The data
        is returned untouched when the transformer was never fitted or when
        the frame has duplicate column labels (label-based selection would
        then duplicate data).

        Returns:
            DataFrame: The re-ordered frame (a new object when re-ordering
            happened, otherwise ``processed_data`` itself).
        """

        if self.column_list is not None and processed_data.columns.is_unique:
            present = set(processed_data.columns)
            # ``dict.fromkeys`` removes duplicate recorded names but keeps their order.
            ordered = [col for col in dict.fromkeys(self.column_list) if col in present]
            remembered = set(ordered)
            extras = [col for col in processed_data.columns if col not in remembered]
            processed_data = processed_data[ordered + extras]

        logger.info("Data reverse-converted by ColumnOrderTransformer.")

        return processed_data


@hookimpl
def register(manager):
    """Plugin hook: register ``ColumnOrderTransformer`` with ``manager`` under its class name."""
    manager.register("ColumnOrderTransformer", ColumnOrderTransformer)
Loading
Loading