Merge branch 'trs/merge/no-source-columns-by-default'

nextstrain · Sep 17, 2024 · 934588a · 934588a
2 parents db54927 + 67e756a
commit 934588a
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 11 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,9 +2,9 @@
 
 ## __NEXT__
 
-### Features
+### Major Changes
 
-* merge: Generated source columns (e.g. `__source_metadata_{NAME}`) may now have their name template changed with `--source-columns=TEMPLATE` or may be omitted entirely with `--no-source-columns`. [#1625][] (@tsibley)
+* merge: Generated source columns (e.g. `__source_metadata_{NAME}`) are now omitted by default.  They may be explicitly included with `--source-columns=TEMPLATE` or explicitly omitted with `--no-source-columns`.  This may be a breaking change for any existing uses of `augur merge` relying on the generated columns, though as `augur merge` is relatively new we believe usage to be scant if extant at all. [#1625][] [#1632][] (@tsibley)
 
 ### Bug Fixes
 
@@ -13,6 +13,7 @@
 [#1588]: https://github.com/nextstrain/augur/issues/1588
 [#1598]: https://github.com/nextstrain/augur/issues/1598
 [#1625]: https://github.com/nextstrain/augur/issues/1625
+[#1632]: https://github.com/nextstrain/augur/issues/1632
 
 ## 25.4.0 (3 September 2024)
 

diff --git a/augur/merge.py b/augur/merge.py
@@ -16,12 +16,12 @@
 id column name.  Non-id columns in other input tables that would conflict with
 this output id column name are not allowed and if present will cause an error.
 
-One generated column per input table is appended to the end of the output
-table to identify the source of each row's data.  Column names are generated
-as "__source_metadata_{NAME}" where "{NAME}" is the table name given to
---metadata.  Values in each column are 1 or 0 for present or absent in that
-input table.  You may change the generated column names by providing your own
-template with --source-columns or omit these columns entirely with
+One generated column per input table may be optionally appended to the end of
+the output table to identify the source of each row's data.  Column names are
+generated with the template given to --source-columns where "{NAME}" in the
+template is replaced by the table name given to --metadata.  Values in each
+column are 1 or 0 for present or absent in that input table.  By default no
+source columns are generated.  You may make this behaviour explicit with
 --no-source-columns.
 
 Metadata tables of arbitrary size can be handled, limited only by available
@@ -91,8 +91,8 @@ def register_parser(parent_subparsers):
 
     output_group = parser.add_argument_group("outputs", "options related to output")
     output_group.add_argument('--output-metadata', required=True, metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
-    output_group.add_argument('--source-columns', default="__source_metadata_{NAME}", metavar="TEMPLATE", help=f"Template with which to generate names for the columns (described above) identifying the source of each row's data. Must contain a literal placeholder, {{NAME}}, which stands in for the metadata table names assigned in --metadata.")
-    output_group.add_argument('--no-source-columns', dest="source_columns", action="store_const", const=None, help=f"Suppress generated columns (described above) identifying the source of each row's data." + SKIP_AUTO_DEFAULT_IN_HELP)
+    output_group.add_argument('--source-columns', metavar="TEMPLATE", help=f"Template with which to generate names for the columns (described above) identifying the source of each row's data. Must contain a literal placeholder, {{NAME}}, which stands in for the metadata table names assigned in --metadata. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP)
+    output_group.add_argument('--no-source-columns', dest="source_columns", action="store_const", const=None, help=f"Suppress generated columns (described above) identifying the source of each row's data. This is the default behaviour, but it may be made explicit or used to override a previous --source-columns." + SKIP_AUTO_DEFAULT_IN_HELP)
     output_group.add_argument('--quiet', action="store_true", default=False, help="Suppress informational and warning messages normally written to stderr. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP)
 
     return parser

diff --git a/tests/functional/merge/cram/merge.t b/tests/functional/merge/cram/merge.t
@@ -23,6 +23,7 @@ each row's source file(s) in extra columns.
 
   $ ${AUGUR} merge \
   >   --metadata X=x.tsv Y=y.tsv \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   strain  a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one     X1a  X1b  X1c                                   1                    0
@@ -40,6 +41,7 @@ More than two inputs.
 
   $ ${AUGUR} merge \
   >   --metadata X=x.tsv Y=y.tsv Z=z.tsv \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   strain  a    b    c    f    e    d    g    __source_metadata_X  __source_metadata_Y  __source_metadata_Z
   one     X1a  X1b  X1c                 Z1g                    1                    0                    1
@@ -53,6 +55,7 @@ Augur's convention of preserving the input id column name.
   $ sed '1s/^strain/name/g' < x.tsv > x-name-column.tsv
   $ ${AUGUR} merge \
   >   --metadata X=x-name-column.tsv Y=y.tsv \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   name   a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one    X1a  X1b  X1c                                   1                    0
@@ -62,6 +65,7 @@ Augur's convention of preserving the input id column name.
   $ sed '1s/^strain/name/g' < y.tsv > y-name-column.tsv
   $ ${AUGUR} merge \
   >   --metadata X=x.tsv Y=y-name-column.tsv \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   strain  a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one     X1a  X1b  X1c                                   1                    0
@@ -74,6 +78,7 @@ Supports --metadata-id-columns.
   $ ${AUGUR} merge \
   >   --metadata X=x-id-column.tsv Y=y.tsv \
   >   --metadata-id-columns id strain \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   id     a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one    X1a  X1b  X1c                                   1                    0
@@ -85,6 +90,7 @@ Supports table-specific --metadata-id-columns.
   $ ${AUGUR} merge \
   >   --metadata X=x-id-column.tsv Y=y.tsv \
   >   --metadata-id-columns X=id \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   id     a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one    X1a  X1b  X1c                                   1                    0
@@ -94,6 +100,7 @@ Supports table-specific --metadata-id-columns.
   $ ${AUGUR} merge \
   >   --metadata X=x.tsv Y=y-name-column.tsv \
   >   --metadata-id-columns strain Y=name \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   strain  a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one     X1a  X1b  X1c                                   1                    0
@@ -105,6 +112,7 @@ Supports Augur's standard delimiter detection.
   $ sed 's/\t/,/g' < x.tsv > x.csv
   $ ${AUGUR} merge \
   >   --metadata X=x.csv Y=y.tsv \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   strain  a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one     X1a  X1b  X1c                                   1                    0
@@ -117,6 +125,7 @@ Supports --metadata-delimiters.
   $ ${AUGUR} merge \
   >   --metadata X=x.txt Y=y.tsv \
   >   --metadata-delimiters '|' $'\t' \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   strain  a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one     X1a  X1b  X1c                                   1                    0
@@ -128,6 +137,7 @@ Supports table-specific --metadata-delimiters.
   $ ${AUGUR} merge \
   >   --metadata X=x.txt Y=y.tsv \
   >   --metadata-delimiters X='|' \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   strain  a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one     X1a  X1b  X1c                                   1                    0
@@ -137,6 +147,7 @@ Supports table-specific --metadata-delimiters.
   $ ${AUGUR} merge \
   >   --metadata X=x.txt Y=y.tsv \
   >   --metadata-delimiters $'\t' X='|' \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   strain  a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one     X1a  X1b  X1c                                   1                    0
@@ -149,6 +160,7 @@ Supports Augur's standard accepted compression formats.
   $ zstd < y.tsv > y.tsv.zst
   $ ${AUGUR} merge \
   >   --metadata X=x.tsv.xz Y=y.tsv.zst \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
   strain  a    b    c    f    e    d    __source_metadata_X  __source_metadata_Y
   one     X1a  X1b  X1c                                   1                    0
@@ -178,6 +190,7 @@ Metadata field values with metachars (field or record delimiters) are handled pr
   > ~~
   $ ${AUGUR} merge \
   >   --metadata X=x.tsv metachars=metachars.csv \
+  >   --source-columns '__source_metadata_{NAME}' \
   >   --output-metadata - --quiet
   strain	a	b	c	comma	tab	newline	__source_metadata_X	__source_metadata_metachars
   one	X1a	X1b	X1c	x,x	"x	x"	"x
@@ -195,7 +208,7 @@ Source columns template.
   two     X2a  X2b  Y2c  Y2f  Y2e  Y2d         1         1
   three                  Y3f  Y3e  Y3d         0         1
 
-No source columns.
+No source columns (explicitly or by default).
 
   $ ${AUGUR} merge \
   >   --metadata X=x.tsv Y=y.tsv \
@@ -206,6 +219,14 @@ No source columns.
   two     X2a  X2b  Y2c  Y2f  Y2e  Y2d
   three                  Y3f  Y3e  Y3d
 
+  $ ${AUGUR} merge \
+  >   --metadata X=x.tsv Y=y.tsv \
+  >   --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty
+  strain  a    b    c    f    e    d
+  one     X1a  X1b  X1c            
+  two     X2a  X2b  Y2c  Y2f  Y2e  Y2d
+  three                  Y3f  Y3e  Y3d
+
 
 ERROR HANDLING