Fix future warnings for pandas>=2.2 (#1494)

This PR fixes #1490.

- Resolves `FutureWarning` messages when running qualification with pandas>=2.2 by using the new form `df.method({col: value}, inplace=True)` for `df.fillna()` calls.
- Resolves `SettingWithCopyWarning` by making an explicit copy of the input dataframe before modifying it.

---------

Signed-off-by: Lee Yang <[email protected]>
NVIDIA · Jan 10, 2025 · bb048d0 · bb048d0
1 parent a215df7
commit bb048d0
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 10 deletions.
diff --git a/user_tools/src/spark_rapids_pytools/rapids/qualification.py b/user_tools/src/spark_rapids_pytools/rapids/qualification.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -555,9 +555,9 @@ def __update_apps_with_prediction_info(self,
                 # Rename the source column to the destination column
                 result_df.rename(columns={src_col: dst_col}, errors='ignore', inplace=True)
             # if the qualx does not have a speedup value, default to 1.0
-            result_df['Estimated GPU Speedup'].fillna(1.0, inplace=True)
+            result_df.fillna({'Estimated GPU Speedup': 1.0}, inplace=True)
             # if the qualx does not have a duration value, default to App Duration
-            result_df['Estimated GPU Duration'].fillna(result_df['App Duration'], inplace=True)
+            result_df.fillna({'Estimated GPU Duration': result_df['App Duration']}, inplace=True)
         # We need to be careful about other columns that depend on remapped columns
         result_df['Estimated GPU Time Saved'] = result_df['App Duration'] - result_df['Estimated GPU Duration']
         return result_df

diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -810,7 +810,7 @@ def scan_tbl(
 
     if not app_info.empty:
         app_info['appName'] = app_name
-        app_info['sparkVersion'].fillna('Unknown', inplace=True)
+        app_info.fillna({'sparkVersion': 'Unknown'}, inplace=True)
 
     # Get jar versions:
     cudf_version = '-'

diff --git a/user_tools/src/spark_rapids_tools/tools/top_candidates.py b/user_tools/src/spark_rapids_tools/tools/top_candidates.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -101,13 +101,14 @@ def _generate_output_table_internal(self, output_df: pd.DataFrame) -> pd.DataFra
         """
         # Create and append 'Speedup Category Order' column to output_df for sorting order
         speedup_category_order = self.props.get('ineligibleCategory') + self.props.get('eligibleCategories')
-        output_df['Speedup Category Order'] = \
-            output_df['Estimated GPU Speedup Category'].map({name: i for i, name in enumerate(speedup_category_order)})
+        df = output_df.copy()
+        df['Speedup Category Order'] = \
+            df['Estimated GPU Speedup Category'].map({name: i for i, name in enumerate(speedup_category_order)})
         # Sort columns and select output columns
         output_columns = self.props.get('outputColumns')
         sorting_columns = self.props.get('sortingColumns')
-        valid_output_columns = list(output_df.columns.intersection(output_columns))
-        res_df = output_df.sort_values(by=sorting_columns, ascending=False)[valid_output_columns]
+        valid_output_columns = list(df.columns.intersection(output_columns))
+        res_df = df.sort_values(by=sorting_columns, ascending=False)[valid_output_columns]
         # this is a bit weird since hardcoding, but we don't want this to have ** for csv output
         if 'Estimated GPU Speedup Category' in res_df:
             res_df.rename(columns={'Estimated GPU Speedup Category': 'Estimated GPU Speedup Category**'},