Merge pull request #1278: filter: Remove attempt at extracting variables from --query
nextstrain · Aug 14, 2023 · 0a62483 · 0a62483
2 parents 1d92f6d + 58e24a3
commit 0a62483
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 36 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -8,8 +8,13 @@
 * ancestral: add the ability to report mutations relative to a sequence other than the inferred root of the tree. This sequence can be specified via `--root-sequence`, and the differences between this sequence and the inferred root of the tree will be added as mutations to the root node for nucleotides and amino acids. All differences between the specified `root-sequence` and the inferred sequence of the root node of the tree will be added as mutations to the root node. This was previously already possible for `vcf` input via `--vcf-reference`. [#1258][] (@rneher)
 * refine: add `mid_point` as rooting option to `refine`. [#1257][] (@rneher)
 
+### Bug fixes
+
+* filter: In version 22.2.0, `--query` would fail when the `.str` accessor was used on a column. This has been fixed. [#1277][] (@victorlin)
+
 [#1257]: https://github.com/nextstrain/augur/pull/1257
 [#1258]: https://github.com/nextstrain/augur/pull/1258
+[#1277]: https://github.com/nextstrain/augur/issues/1277
 
 ## 22.2.0 (31 July 2023)
 

diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py
@@ -13,12 +13,6 @@
 from augur.utils import read_strains
 from . import constants
 
-try:
-    # python ≥3.8 only
-    from typing import Literal  # type: ignore
-except ImportError:
-    from typing_extensions import Literal  # type: ignore
-
 try:
     # pandas ≥1.5.0 only
     PandasUndefinedVariableError = pd.errors.UndefinedVariableError  # type: ignore
@@ -190,11 +184,14 @@ def filter_by_query(metadata, query) -> FilterFunctionReturn:
     set()
 
     """
-    # Try converting all queried columns to numeric.
-    for column in extract_variables(query).intersection(metadata.columns):
-        metadata[column] = pd.to_numeric(metadata[column], errors='ignore')
+    # Create a copy to prevent modification of the original DataFrame.
+    metadata_copy = metadata.copy()
+
+    # Try converting all columns to numeric.
+    for column in metadata_copy.columns:
+        metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='ignore')
 
-    return set(metadata.query(query).index.values)
+    return set(metadata_copy.query(query).index.values)
 
 
 def filter_by_ambiguous_date(metadata, date_column, ambiguity) -> FilterFunctionReturn:
@@ -810,29 +807,3 @@ def _filter_kwargs_to_str(kwargs: FilterFunctionKwargs):
         kwarg_list.append((key, value))
 
     return json.dumps(kwarg_list)
-
-
-# From https://stackoverflow.com/a/76536356
-def extract_variables(pandas_query: str):
-    """Extract variable names used in a pandas query string."""
-
-    # Track variables in a dictionary to be used as a dictionary of globals.
-    variables: Dict[str, Literal[None]] = {}
-
-    while True:
-        try:
-            # Try creating a Expr object with the query string and dictionary of globals.
-            # This will raise an error as long as the dictionary of globals is incomplete.
-            env = pd.core.computation.scope.ensure_scope(level=0, global_dict=variables)
-            pd.core.computation.expr.Expr(pandas_query, env=env)
-
-            # Exit the loop when evaluation is successful.
-            break
-        except PandasUndefinedVariableError as e:
-            # This relies on the format defined here: https://github.com/pandas-dev/pandas/blob/965ceca9fd796940050d6fc817707bba1c4f9bff/pandas/errors/__init__.py#L401
-            name = re.findall("name '(.+?)' is not defined", str(e))[0]
-
-            # Add the name to the globals dictionary with a dummy value.
-            variables[name] = None
-
-    return set(variables.keys())
diff --git a/tests/functional/filter/cram/filter-query-and-exclude-ambiguous-dates-by.t b/tests/functional/filter/cram/filter-query-and-exclude-ambiguous-dates-by.t
@@ -0,0 +1,26 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Create metadata TSV file for testing.
+
+  $ cat >metadata.tsv <<~~
+  > strain	date	region
+  > SEQ_1	2020	Asia
+  > SEQ_2	2020	Asia
+  > SEQ_3	2020	Asia
+  > SEQ_4	2020	North America
+  > ~~
+
+Confirm that `--exclude-ambiguous-dates-by` works when all dates are ambiguous, year-only dates.
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --query 'region=="Asia"' \
+  >  --exclude-ambiguous-dates-by any \
+  >  --empty-output-reporting silent \
+  >  --output-strains filtered_strains.txt
+  4 strains were dropped during filtering
+  \t1 of these were filtered out by the query: "region=="Asia"" (esc)
+  \t3 of these were dropped because of their ambiguous date in any (esc)
+  0 strains passed all filters
diff --git a/tests/functional/filter/cram/filter-query-str.t b/tests/functional/filter/cram/filter-query-str.t
@@ -0,0 +1,24 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Create metadata file for testing.
+
+  $ cat >metadata.tsv <<~~
+  > strain	column
+  > SEQ_1	value1
+  > SEQ_2	value2
+  > SEQ_3	value3
+  > ~~
+
+'column' should be query-able using the `.str` accessor.
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --query "column.str.startswith('value')" \
+  >  --output-strains filtered_strains.txt > /dev/null
+
+  $ sort filtered_strains.txt
+  SEQ_1
+  SEQ_2
+  SEQ_3