Merge pull request #168 from ONSdigital/lack-of-back-data-mishandled

lack of back data mishandled
ONSdigital · Jun 23, 2023 · 19d8a7f · 19d8a7f
2 parents 5a77292 + 918bbd5
commit 19d8a7f
Show file tree

Hide file tree

Showing 9 changed files with 70 additions and 39 deletions.
diff --git a/.flake8 b/.flake8
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -9,5 +9,5 @@ Briefly describe the purpose of the pr.
 
 ## Description
 
-Add a more detailed description of the pr including background and ticket
-references if necessary. 
+Add a more detailed description of the pr if necessary (can reference release
+notes if included).
diff --git a/.gitignore b/.gitignore
@@ -121,8 +121,6 @@ venv.bak/
 .devcontainer/
 # Editor backup files
 *~
-# portray generated docs
-site/
+
 # asdf tool versions
 .tool-versions
-
diff --git a/.isort.cfg b/.isort.cfg
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "statistical_methods_library"
-version = "13.1.0"
+version = "13.1.1"
 description = ""
 authors = ["Your Name <[email protected]>"]
 license = "MIT"
@@ -18,6 +18,7 @@ coverage = "^7.2"
 pytest-cov = "^4.1.0"
 pytest-dependency = "^0.5.1"
 pytest-tap = "^3.3"
+flake8-pyproject = "^1.2.3"
 
 [tool.pytest.ini_options]
 junit_suite_name = "statistical_methods_library"
@@ -31,3 +32,8 @@ addopts = [
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
+
+
+[tool.flake8]
+max-line-length = 90 
+exclude = ".venv"
diff --git a/release-notes/13.1.1.md b/release-notes/13.1.1.md
@@ -0,0 +1,18 @@
+# Statistical Methods Library 13.1.1
+
+Release date: 2023-06-23
+
+## Synopsis
+
+This release fixes a crash in imputation when a link filter is passed but
+back data is not.
+
+## Changes
+
+Imputation can now handle a link filter being passed without back data.
+Previously it would incorrectly attempt to use back data in this case and
+crash because the back data frame was None.
+
+## Notes
+
+This change has no impact on existing outputs.
diff --git a/statistical_methods_library/imputation/engine.py b/statistical_methods_library/imputation/engine.py
@@ -270,15 +270,18 @@ def impute(
     }
 
     if link_filter:
-        filtered_refs = (
-            input_df.unionByName(back_data_df, allowMissingColumns=True).select(
-                col(reference_col).alias("ref"),
-                col(period_col).alias("period"),
-                col(grouping_col).alias("grouping"),
-                (
-                    expr(link_filter) if isinstance(link_filter, str) else link_filter
-                ).alias("match"),
-            )
+        if back_data_df:
+            filtered_refs = input_df.unionByName(back_data_df, allowMissingColumns=True)
+        else:
+            filtered_refs = input_df
+
+        filtered_refs = filtered_refs.select(
+            col(reference_col).alias("ref"),
+            col(period_col).alias("period"),
+            col(grouping_col).alias("grouping"),
+            (expr(link_filter) if isinstance(link_filter, str) else link_filter).alias(
+                "match"
+            ),
         ).localCheckpoint(eager=False)
 
     prepared_df = (

diff --git a/tests/imputation/test_scenarios.py b/tests/imputation/test_scenarios.py
@@ -101,21 +101,15 @@ def test_calculations(fxt_load_test_csv, ratio_calculator, scenario_type, scenar
     back_data_df = scenario_expected_output.filter(
         col(fields["period_col"]) < starting_period
     )
-
-    imputation_kwargs["back_data_df"] = back_data_df
+    if back_data_df.count() > 0:
+        imputation_kwargs["back_data_df"] = back_data_df
 
     scenario_input = scenario_input.filter(col(fields["period_col"]) >= starting_period)
 
     scenario_expected_output = scenario_expected_output.filter(
         col(fields["period_col"]) >= starting_period
     )
 
-    # We need to drop our auxiliary column from our output now
-    # we've potentially set up our back data as this must not come out of
-    # imputation.
-    scenario_expected_output = scenario_expected_output.drop(
-        fields["auxiliary_col"],
-    )
     scenario_actual_output = imputation.impute(
         input_df=scenario_input, **imputation_kwargs
     )