
Commit 29d3d0a

Fix compatibility issues with pandas 2.0
A new pandas release (2.0) broke the library's test cases, which started failing with errors like "TypeError: DataFrame.drop() takes from 1 to 2 positional arguments but 3 were given". Investigation traced the root cause to pandas 2.0 enforcing earlier deprecations: several methods, including drop(), no longer accept non-keyword arguments, and Series.append() has been removed. This commit fixes the failures by:

1. passing keyword arguments to the drop() method
2. replacing Series.append() with pandas.concat()

Internal tracking ticket: https://tiny.amazon.com/kzvlpcax
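As an illustration (not part of the commit), here is a minimal sketch of the drop() breakage and its fix, using a made-up toy DataFrame:

import pandas as pd

df = pd.DataFrame({"label": [0, 1], "feature": [3.0, 4.0]})

# Pre-2.0 call style; under pandas 2.0 this raises
# "TypeError: DataFrame.drop() takes from 1 to 2 positional arguments but 3 were given"
# df = df.drop("label", 1)

# Fixed: everything after `labels` must now be passed by keyword.
df = df.drop(labels="label", axis=1)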
1 parent 71fd1bd commit 29d3d0a

File tree

1 file changed: +6 -6 lines changed

src/smclarify/bias/report.py

Lines changed: 6 additions & 6 deletions
@@ -207,7 +207,7 @@ def _positive_predicted_index(
         raise ValueError("Predicted Label Column series datatype is not the same as Label Column series")
     if predicted_label_datatype == common.DataType.CONTINUOUS:
         predicted_label_data = predicted_label_data.astype(label_data.dtype)
-        data_interval_indices = _interval_index(label_data.append(predicted_label_data), positive_label_values)
+        data_interval_indices = _interval_index(pd.concat([label_data, predicted_label_data]), positive_label_values)
         positive_predicted_index = _continuous_data_idx(predicted_label_data, data_interval_indices)
     elif predicted_label_datatype == common.DataType.CATEGORICAL and positive_label_values:
         positive_predicted_index = _categorical_data_idx(predicted_label_data, positive_label_values)
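For context (an illustration on toy Series, not from the commit): pd.concat() is the drop-in replacement for the removed Series.append(); by default both preserve the original indices of their inputs:

import pandas as pd

label_data = pd.Series([0.1, 0.9])
predicted_label_data = pd.Series([0.2, 0.8])

# Removed in pandas 2.0:
# combined = label_data.append(predicted_label_data)

# Replacement; pass ignore_index=True if a fresh RangeIndex is wanted instead.
combined = pd.concat([label_data, predicted_label_data])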
@@ -436,7 +436,7 @@ def model_performance_report(df: pd.DataFrame, label_column: LabelColumn, predic
         data=label_data_series, data_type=label_data_type, positive_values=positive_label_values
     )
     if label_column.name in df.columns:
-        df = df.drop(label_column.name, 1)
+        df = df.drop(labels=label_column.name, axis=1)

     predicted_label_data_type, predicted_label_data_series = common.ensure_series_data_type(
         predicted_label_column.series, positive_label_values
@@ -508,7 +508,7 @@ def bias_report(
         metrics_to_run.extend(post_training_metrics)
         predicted_label_series = predicted_label_column.series
         if predicted_label_column.name in df.columns:
-            df = df.drop(predicted_label_column.name, 1)
+            df = df.drop(labels=predicted_label_column.name, axis=1)
     else:
         pre_training_metrics = (
             smclarify.bias.metrics.PRETRAINING_METRICS
@@ -588,15 +588,15 @@ def _report(

     sensitive_facet_values = facet_column.sensitive_values
     facet_data_type, facet_data_series = common.ensure_series_data_type(df[facet_column.name], sensitive_facet_values)
-    df = df.drop(facet_column.name, 1)
+    df = df.drop(labels=facet_column.name, axis=1)

     positive_label_values = label_column.positive_label_values
     label_data_type, label_data_series = common.ensure_series_data_type(label_column.series, positive_label_values)
     positive_label_index, _ = _positive_label_index(
         data=label_data_series, data_type=label_data_type, positive_values=positive_label_values
     )
     if label_column.name in df.columns:
-        df = df.drop(label_column.name, 1)
+        df = df.drop(labels=label_column.name, axis=1)

     positive_predicted_label_index = [None]
     if predicted_label_column:
@@ -612,7 +612,7 @@ def _report(
             positive_label_values=positive_label_values,
         )
         if predicted_label_column.name in df.columns:
-            df = df.drop(predicted_label_column.name, 1)
+            df = df.drop(labels=predicted_label_column.name, axis=1)

     # Above are validations and preprocessing, the real reporting logic is moved to a new method for clarity and
     # to avoid using wrong data by chance (e.g., label_data_series should be used, instead of label_column.data).
