From 128aad84b5a409e6f155abe8b7f9a9088d3121c1 Mon Sep 17 00:00:00 2001
From: james hadfield <hadfield.james@gmail.com>
Date: Tue, 23 Apr 2024 22:24:12 +1200
Subject: [PATCH 1/3] [export] enforce tree naming standards

Checking for duplicated node names and missing node names is in line
with the schema. Previously some calls to `export v2` would be ok with
missing node names (e.g. see the updated tests in `minify-output.t`) but
any usage with metadata would result in an uncaught error.
---
 augur/export_v2.py                            | 28 +++++++++++++++----
 .../export_v2/cram/duplicate-node-names.t     | 10 +++++++
 .../functional/export_v2/cram/minify-output.t | 25 +++++++++--------
 .../export_v2/cram/missing-internal-nodes.t   | 18 ++++++++++++
 4 files changed, 65 insertions(+), 16 deletions(-)
 create mode 100644 tests/functional/export_v2/cram/duplicate-node-names.t
 create mode 100644 tests/functional/export_v2/cram/missing-internal-nodes.t
diff --git a/augur/export_v2.py b/augur/export_v2.py
index af563bf50..f5aa95890 100644
--- a/augur/export_v2.py
+++ b/augur/export_v2.py
@@ -117,6 +117,24 @@ def order_nodes(node):
     return od
 
 
+def read_tree(fname):
+    tree = Phylo.read(fname, 'newick')
+    # augur export requires unique node names (both terminal and external) as these
+    # are used to associate metadata/node-data with nodes. Any duplication is fatal.
+    # The exception to this is unlabelled node names, which auspice will handle but
+    # won't be associated with any metadata within export.
+    node_names = [clade.name for clade in tree.root.find_clades()]
+    if None in node_names:
+        raise AugurError(f"Tree contains unnamed nodes. If these are internal nodes you may wish to run "+
+                         "`augur refine --tree <newick> --output-tree <newick>` to name them.")
+    if len(set(node_names))!=len(node_names):
+        from collections import Counter
+        dups = [name for name, count in Counter(node_names).items() if count>1]
+        raise AugurError(f"{len(dups)} node names occur multiple times in the tree: " +
+                         ", ".join([f"'{v}'" for v in dups[0:5]]) + ("..." if len(dups)>5 else ""))
+    return (tree, node_names)
+
+
 def node_div(T, node_attrs):
     """
     Scans the provided tree & metadata to see if divergence is defined, and if so returns
@@ -1059,12 +1077,12 @@ def create_branch_labels(branch_attrs, node_data, branch_data):
                 continue
             branch_attrs[node_name]["labels"][label_key] = label_value
 
-def parse_node_data_and_metadata(T, node_data, metadata):
+def parse_node_data_and_metadata(node_names, node_data, metadata):
     node_data_names = set()
     metadata_names = set()
 
     # assign everything to node_attrs, exclusions considered later
-    node_attrs = {clade.name: {} for clade in T.root.find_clades()}
+    node_attrs = {name: {} for name in node_names}
 
     # first pass: metadata
     for metadata_id, node in metadata.items():
@@ -1088,7 +1106,7 @@ def parse_node_data_and_metadata(T, node_data, metadata):
     # third pass: create `branch_attrs`. The data comes from
     # (a) some keys within `node_data['nodes']` (for legacy reasons)
     # (b) the `node_data['branches']` dictionary, which currently only defines labels
-    branch_attrs = {clade.name: defaultdict(dict) for clade in T.root.find_clades()}
+    branch_attrs = {name: defaultdict(dict) for name in node_names}
     create_branch_mutations(branch_attrs, node_data)
     create_branch_labels(branch_attrs, node_data['nodes'], node_data.get('branches', {}))
 
@@ -1173,9 +1191,9 @@ def run(args):
         metadata_file = {}
 
     # parse input files
-    T = Phylo.read(args.tree, 'newick')
+    (T, node_names) = read_tree(args.tree)
     node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \
-            parse_node_data_and_metadata(T, node_data_file, metadata_file)
+            parse_node_data_and_metadata(node_names, node_data_file, metadata_file)
     config = get_config(args)
     additional_metadata_columns = get_additional_metadata_columns(config, args.metadata_columns, metadata_names)
 
diff --git a/tests/functional/export_v2/cram/duplicate-node-names.t b/tests/functional/export_v2/cram/duplicate-node-names.t
new file mode 100644
index 000000000..2a8c8ca77
--- /dev/null
+++ b/tests/functional/export_v2/cram/duplicate-node-names.t
@@ -0,0 +1,10 @@
+  $ source "$TESTDIR"/_setup.sh
+
+  $ echo "(tipA:1,(tipA:1,tipA:1)internalBC:2,(tipD:3,tipE:4,tipF:1)internalDEF:5)ROOT:0;" > tree1.nwk
+
+  $ ${AUGUR} export v2 \
+  >   --tree tree1.nwk \
+  >   --output minimal.json
+  ERROR: 1 node names occur multiple times in the tree: 'tipA'
+  [2]
+
diff --git a/tests/functional/export_v2/cram/minify-output.t b/tests/functional/export_v2/cram/minify-output.t
index 68a573369..a9035a9a0 100644
--- a/tests/functional/export_v2/cram/minify-output.t
+++ b/tests/functional/export_v2/cram/minify-output.t
@@ -16,13 +16,16 @@ names to create larger file sizes with less recursion.
   > }
 
 A small tree is not automatically minified.
-The unminified output is 13KB which is considered acceptable.
+The unminified output is 15KB which is considered acceptable.
+
+  $ echo "$(generate_newick 10);" > small_tree_raw.nwk
+
+Use augur refine to add internal node names
+  $ ${AUGUR} refine --tree small_tree_raw.nwk --output-tree small_tree.nwk  &>/dev/null
 
-  $ echo "$(generate_newick 10);" > small_tree.nwk
 
   $ ${AUGUR} export v2 \
   >   --tree small_tree.nwk \
-  >   --skip-validation \
   >   --output output.json &>/dev/null
 
   $ head -c 20 output.json
@@ -30,13 +33,12 @@ The unminified output is 13KB which is considered acceptable.
     "version": "v2", (no-eol)
 
   $ ls -l output.json | awk '{print $5}'
-  13813
+  15039
 
 It can be forcefully minified with an argument.
 
   $ ${AUGUR} export v2 \
   >   --tree small_tree.nwk \
-  >   --skip-validation \
   >   --minify-json \
   >   --output output.json &>/dev/null
 
@@ -48,21 +50,22 @@ even if it may seem "falsey".
 
   $ AUGUR_MINIFY_JSON=0 ${AUGUR} export v2 \
   >   --tree small_tree.nwk \
-  >   --skip-validation \
   >   --output output.json &>/dev/null
 
   $ head -c 20 output.json
   {"version": "v2", "m (no-eol)
 
 
-A large tree, when forcefully not minified, has an output size of 6MB which is
+A large tree, when forcefully not minified, has an output size of 8MB which is
 considered large.
 
-  $ echo "$(generate_newick 500);" > big_tree.nwk
+  $ echo "$(generate_newick 500);" > big_tree_raw.nwk
+
+Use augur refine to add internal node names
+  $ ${AUGUR} refine --tree big_tree_raw.nwk --output-tree big_tree.nwk  &>/dev/null
 
   $ ${AUGUR} export v2 \
   >   --tree big_tree.nwk \
-  >   --skip-validation \
   >   --no-minify-json \
   >   --output output.json &>/dev/null
 
@@ -71,7 +74,7 @@ considered large.
     "version": "v2", (no-eol)
 
   $ ls -l output.json | awk '{print $5}'
-  6568454
+  8591420
 
 This means it is automatically minified.
 
@@ -84,4 +87,4 @@ This means it is automatically minified.
   {"version": "v2", "m (no-eol)
 
   $ ls -l output.json | awk '{print $5}'
-  561436
+  576414
diff --git a/tests/functional/export_v2/cram/missing-internal-nodes.t b/tests/functional/export_v2/cram/missing-internal-nodes.t
new file mode 100644
index 000000000..3732a9c72
--- /dev/null
+++ b/tests/functional/export_v2/cram/missing-internal-nodes.t
@@ -0,0 +1,18 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Test a tree with unlabelled internal nodes.
+In Augur v24.3.0 and earlier these would work _sometimes_ depending
+on what code paths were hit, but in most real-life cases would raise
+an uncaught error. We now check for these when we parse the tree.
+
+  $ echo "(tipA:1,(tipB:1,tipC:1):2,(tipD:3,tipE:4,tipF:1):5):0;" > tree1.nwk
+
+  $ ${AUGUR} export v2 \
+  >   --tree tree1.nwk \
+  >   --metadata "$TESTDIR/../data/dataset1_metadata_with_name.tsv" \
+  >   --node-data "$TESTDIR/../data/div_node-data.json" \
+  >   --output test1.json
+  ERROR: Tree contains unnamed nodes.+ (re)
+  [2]

From 8baf3f8c45e59215401185cb4b9c62a93910cd41 Mon Sep 17 00:00:00 2001
From: james hadfield <hadfield.james@gmail.com>
Date: Wed, 24 Apr 2024 10:29:53 +1200
Subject: [PATCH 2/3] [export] allow multiple trees
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Multiple trees ("subtrees") have been available in Auspice since late
2021¹ and part of the associated schema since early 2022². Despite this
there was no way to produce such datasets within Augur itself, and
despite the schema changes the associated `augur validate` command was
never updated to allow them.

This commit adds multi-tree inputs to `augur export v2` as well as
allowing them to validate with our associated validation commands.

¹ <https://github.com/nextstrain/auspice/pull/1442>
² <https://github.com/nextstrain/augur/pull/851>
---
 augur/export_v2.py                           | 28 +++++++----
 augur/validate_export.py                     | 36 ++++++++++---
 tests/functional/export_v2/cram/multi-tree.t | 53 ++++++++++++++++++++
 3 files changed, 100 insertions(+), 17 deletions(-)
 create mode 100644 tests/functional/export_v2/cram/multi-tree.t

diff --git a/augur/export_v2.py b/augur/export_v2.py
index f5aa95890..cfb31a53a 100644
--- a/augur/export_v2.py
+++ b/augur/export_v2.py
@@ -117,13 +117,13 @@ def order_nodes(node):
     return od
 
 
-def read_tree(fname):
-    tree = Phylo.read(fname, 'newick')
+def read_trees(filenames):
+    trees = [Phylo.read(fname, 'newick') for fname in filenames]
     # augur export requires unique node names (both terminal and external) as these
     # are used to associate metadata/node-data with nodes. Any duplication is fatal.
     # The exception to this is unlabelled node names, which auspice will handle but
     # won't be associated with any metadata within export.
-    node_names = [clade.name for clade in tree.root.find_clades()]
+    node_names = [clade.name for tree in trees for clade in tree.root.find_clades()]
     if None in node_names:
         raise AugurError(f"Tree contains unnamed nodes. If these are internal nodes you may wish to run "+
                          "`augur refine --tree <newick> --output-tree <newick>` to name them.")
@@ -132,7 +132,7 @@ def read_tree(fname):
         dups = [name for name, count in Counter(node_names).items() if count>1]
         raise AugurError(f"{len(dups)} node names occur multiple times in the tree: " +
                          ", ".join([f"'{v}'" for v in dups[0:5]]) + ("..." if len(dups)>5 else ""))
-    return (tree, node_names)
+    return (trees, node_names)
 
 
 def node_div(T, node_attrs):
@@ -750,7 +750,12 @@ def _recursively_set_data(node):
             node['branch_attrs'] = branch_attrs[node['name']]
         for child in node.get("children", []):
             _recursively_set_data(child)
-    _recursively_set_data(data_json["tree"])
+
+    if isinstance(data_json["tree"], list):
+        for subtree in data_json['tree']:
+            _recursively_set_data(subtree)
+    else:
+        _recursively_set_data(data_json["tree"])
 
 
 def set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns):
@@ -839,7 +844,11 @@ def _recursively_set_data(node):
         for child in node.get("children", []):
             _recursively_set_data(child)
 
-    _recursively_set_data(data_json["tree"])
+    if isinstance(data_json["tree"], list):
+        for subtree in data_json['tree']:
+            _recursively_set_data(subtree)
+    else:
+        _recursively_set_data(data_json["tree"])
 
 def node_data_prop_is_normal_trait(name):
     # those traits / keys / attrs which are not "special" and can be exported
@@ -893,7 +902,7 @@ def register_parser(parent_subparsers):
     required = parser.add_argument_group(
         title="REQUIRED"
     )
-    required.add_argument('--tree','-t', metavar="newick", required=True, help="Phylogenetic tree, usually output from `augur refine`")
+    required.add_argument('--tree','-t', metavar="newick", nargs='+', required=True, help="Phylogenetic tree(s), usually output from `augur refine`")
     required.add_argument('--output', metavar="JSON", required=True, help="Output file (typically for visualisation in auspice)")
 
     config = parser.add_argument_group(
@@ -1191,7 +1200,7 @@ def run(args):
         metadata_file = {}
 
     # parse input files
-    (T, node_names) = read_tree(args.tree)
+    (trees, node_names) = read_trees(args.tree)
     node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \
             parse_node_data_and_metadata(node_names, node_data_file, metadata_file)
     config = get_config(args)
@@ -1223,7 +1232,8 @@ def run(args):
     set_filters(data_json, config)
 
     # set tree structure
-    data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs))
+    trees_json = [convert_tree_to_json_structure(tree.root, node_attrs, node_div(tree, node_attrs)) for tree in trees]
+    data_json["tree"] = trees_json[0] if len(trees_json)==1 else trees_json
     set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns)
     set_branch_attrs_on_tree(data_json, branch_attrs)
 
diff --git a/augur/validate_export.py b/augur/validate_export.py
index 5cd3f4c4c..80da43bfe 100644
--- a/augur/validate_export.py
+++ b/augur/validate_export.py
@@ -7,7 +7,7 @@
 import sys
 from collections import defaultdict
 
-def ensure_no_duplicate_names(root, ValidateError):
+def ensure_no_duplicate_names(tree, ValidateError):
     """
     Check that all node names are identical, which is required for auspice (v2) JSONs.
     """
@@ -18,10 +18,14 @@ def recurse(node):
         names.add(node["name"])
         if "children" in node:
             [recurse(child) for child in node["children"]]
-    recurse(root)
+    if isinstance(tree, list):
+        for subtree in tree:
+            recurse(subtree)
+    else:
+        recurse(tree)
 
 
-def collectTreeAttrsV2(root, warn):
+def collectTreeAttrsV2(tree, warn):
     """
     Collect all keys specified on `node["node_attrs"]` throughout the tree
     and the values associated with them. Note that this will only look at
@@ -47,7 +51,12 @@ def recurse(node):
             [recurse(child) for child in node["children"]]
         else:
             num_terminal += 1
-    recurse(root)
+
+    if isinstance(tree, list):
+        for subtree in tree:
+            recurse(subtree)
+    else:
+        recurse(tree)
 
     for data in seen.values():
         if data["count"] == num_nodes:
@@ -56,7 +65,7 @@ def recurse(node):
     return(seen, num_terminal)
 
 
-def collectMutationGenes(root):
+def collectMutationGenes(tree):
     """
     Returns a set of all genes specified in the tree in the "aa_muts" objects
     """
@@ -67,17 +76,28 @@ def recurse(node):
             genes.update(mutations.keys())
         if "children" in node:
             [recurse(child) for child in node["children"]]
-    recurse(root)
+
+    if isinstance(tree, list):
+        for subtree in tree:
+            recurse(subtree)
+    else:
+        recurse(tree)
+
     genes -= {"nuc"}
     return genes
 
-def collectBranchLabels(root):
+def collectBranchLabels(tree):
     labels = set()
     def recurse(node):
         labels.update(node.get("branch_attrs", {}).get("labels", {}).keys())
         if "children" in node:
             [recurse(child) for child in node["children"]]
-    recurse(root)
+
+    if isinstance(tree, list):
+        for subtree in tree:
+            recurse(subtree)
+    else:
+        recurse(tree)
     return labels
 
 def verifyMainJSONIsInternallyConsistent(data, ValidateError):
diff --git a/tests/functional/export_v2/cram/multi-tree.t b/tests/functional/export_v2/cram/multi-tree.t
new file mode 100644
index 000000000..a7a45b948
--- /dev/null
+++ b/tests/functional/export_v2/cram/multi-tree.t
@@ -0,0 +1,53 @@
+
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Create a small second tree (which has different names/labels than 'tree.nwk')
+  $ cat > tree2.nwk <<~~
+  > (tipG:1,(tipH:1,tipI:1)internalHI:2)SECOND_ROOT:0;
+  > ~~
+
+Minimal export -- no node data, no metadata etc etc
+  $ ${AUGUR} export v2 \
+  >   --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \
+  >   --output minimal.json &> /dev/null
+
+More realistic export - with node_data for all nodes and metadata for some of them
+
+  $ cat > metadata.tsv <<~~
+  > strain	something
+  > tipA	foo
+  > tipB	foo
+  > tipC	foo
+  > tipG	bar
+  > tipH	bar
+  > ~~
+
+
+  $ cat > node-data.json <<~~
+  > {
+  >   "nodes": {
+  >     "ROOT": {"mutation_length": 0},
+  >     "tipA": {"mutation_length": 1},
+  >     "internalBC": {"mutation_length": 2},
+  >     "tipB": {"mutation_length": 1},
+  >     "tipC": {"mutation_length": 1},
+  >     "internalDEF": {"mutation_length": 5},
+  >     "tipD": {"mutation_length": 3},
+  >     "tipE": {"mutation_length": 4},
+  >     "tipF": {"mutation_length": 1},
+  >     "SECOND_ROOT": {"mutation_length": 0},
+  >     "tipG": {"mutation_length": 1},
+  >     "internalHI": {"mutation_length": 2},
+  >     "tipH": {"mutation_length": 1},
+  >     "tipI": {"mutation_length": 1}
+  >   }
+  > }
+  > ~~
+
+  $ ${AUGUR} export v2 \
+  >   --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \
+  >   --metadata metadata.tsv --color-by-metadata something \
+  >   --node-data node-data.json \
+  >   --output output.json &> /dev/null

From 4b6944e3288e89519563ac0bc93ed5c1b41a8aee Mon Sep 17 00:00:00 2001
From: james hadfield <hadfield.james@gmail.com>
Date: Wed, 24 Apr 2024 14:34:03 +1200
Subject: [PATCH 3/3] WIP [refine] unique internal node names

Needed for pipelines which will produce multiple trees via `augur
refine` and then supply these trees to `augur export v2`
---
 augur/refine.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/augur/refine.py b/augur/refine.py
index 95717861f..348898377 100644
--- a/augur/refine.py
+++ b/augur/refine.py
@@ -143,6 +143,7 @@ def register_parser(parent_subparsers):
                         default='mutations-per-site', help='Units in which sequence divergences is exported.')
     parser.add_argument('--seed', type=int, help='seed for random number generation')
     parser.add_argument('--verbosity', type=int, default=1, help='treetime verbosity, between 0 and 6 (higher values more output)')
+    parser.add_argument('--content-aware-internal-node-names', action="store_true")
     parser.set_defaults(covariance=True)
     return parser
 
@@ -342,6 +343,21 @@ def are_sequence_states_different(nuc1, nuc2):
         print("ERROR: divergence unit",args.divergence_units,"not supported!", file=sys.stderr)
         return 1
 
+    if args.content_aware_internal_node_names:
+        terminals = [n.name for n in T.find_clades() if n.is_terminal()]
+        terminals.sort()
+        internals = {n.name for n in T.find_clades() if not n.is_terminal()}
+        assert all(n.startswith('NODE_') for n in internals)
+        # compute hash similar to git short commit hash
+        import hashlib
+        id = hashlib.sha256("".join(terminals).encode('utf-8')).hexdigest()[0:7]
+        def rename(name):
+            if name not in internals: return name
+            return f"NODE_{id}_{name.split('_')[1]}"
+        for n in T.find_clades():
+            n.name = rename(n.name)
+        node_data['nodes'] = {rename(name):data for name,data in node_data['nodes'].items()}
+
     # Export refined tree and node data
     tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f', branch_length_only=True)
     print("updated tree written to",tree_fname, file=sys.stdout)