-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQIDs-Values_Replacements.py
73 lines (72 loc) · 3.88 KB
/
QIDs-Values_Replacements.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
import numpy as np
# Step 1: Read the already clustered data w.r.t. k-anonymity & l-diversity.
df = pd.read_csv('clustered database name.csv')
# In the below code, we will generalize the 'Age', 'Education', 'Workclass', and 'Sex' columns.
# The generalization process is the same for other attributes, just append the correct generalization.
# Some examples of the generalization are given in generalization_mappings.json for understanding.
# Generalize 'Age' into intervals, the interval can be adjusted as per the requirements.
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['age'].fillna(df['age'].mean(), inplace=True)
age_bins = list(range(0, 103, 2)) # Bin edges: [0, 3, 6, ..., 99, 102]
age_labels = [f'{i}-{i+3}' for i in range(0, 100, 2)] + ['100+'] # Labels for bins
df['Age Binned'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=False)
# Generalize 'Education' into first/lower level categories
education_mapping = {
'Bachelors': 'Undergraduate',
'Masters': 'Postgraduate',
'Doctorate': 'Postgraduate',
'Assoc-acdm': 'Associate',
'Assoc-voc': 'Associate',
'10th': 'Junior-Secondry',
'11th': 'Senior-Secondry',
'12th': 'Senior-Secondry',
'HS-grad': 'Senior-Secondry',
'Some-college': 'Undergraduate',
'Preschool': 'Other',
'1st-4th': 'Elementry',
'5th-6th': 'Elementry',
'7th-8th': 'Elementry',
'9th': 'Junior-Secondry'
}
df['Education Generalized'] = df['education'].map(education_mapping).fillna('Other')
workclass_mapping = {
'Private': 'Private',
'Self-emp-not-inc': 'Self-Employed',
'Self-emp-inc': 'Self-Employed',
'Federal-gov': 'Government',
'Local-gov': 'Government',
'State-gov': 'Government',
'Without-pay': 'Others',
'Never-worked': 'Unemployed'
}
df['Workclass Generalized'] = df['workclass'].map(workclass_mapping).fillna('Others')
sex_mapping = {
'Male': 'Male', 'Female': 'Female', 'Other': 'Other' # This is just an example; modify as needed
}
df['Sex Generalized'] = df['sex'].map(sex_mapping).fillna('Other')
# Assuming 'Cluster ID' is the identifier for clusters. In our code, its 'Cluster' only not 'Cluster ID'
def apply_generalization_within_cluster(df, quasi_identifiers):
# Read clusters & apply generalizations
anonymized_data = []
for cluster_id, cluster_data in df.groupby('Cluster'):
print(f"Processing Cluster ID: {cluster_id}")
for column in quasi_identifiers:
if cluster_data[column].nunique() == 1:
# If there is only one unique value, skip the generalization for this column, later convert to real values also.
#Uncomment to see results, if desirable. print(f"Skipping generalization for column '{column}' in Cluster ID {cluster_id} (only one unique value).")
# Keep original value
df.loc[cluster_data.index, column] = cluster_data[column]
else:
# If there are multiple values, apply generalization.
#Uncomment to see results, if desirable.print(f"Generalizing column '{column}' for Cluster ID {cluster_id}.")
anonymized_data.append(cluster_data)
# Combine modified data and add it to the DataFrame
anonymized_df = pd.concat(anonymized_data, ignore_index=True)
return anonymized_df
# Add labels for the generalized columns. They can also be named as same like real data.
quasi_identifiers = ['Age Binned', 'Education Generalized', 'Workclass Generalized','Sex Generalized']
anonymized_df = apply_generalization_within_cluster(df,quasi_identifiers)
anonymized_df = anonymized_df[['Age Binned', 'Education Generalized', 'Workclass Generalized', 'Sex Generalized', 'income', 'Cluster']]
# Save results to external file for downstream tasks.
anonymized_df.to_csv('generalized_clustered_adult_data.csv', index=False)