From 90d1a5fe9d8a8e206d94b15a10a4e728a07a79ab Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Thu, 27 May 2021 21:27:56 +1200
Subject: [PATCH 01/12] Allow branch labels in node-data JSONs

Previously branch labels could not be specified in data passed to
`augur export v2` except for two "special cases":
(i) AA mutations (stored in node-data-json -> nodes) would create branch
labels "aa", if applicable.
(ii) `clade_annotation` (stored in node-data-json -> nodes) was
interpreted to be the "clade" branch label, and exported as such.

Here we extend the allowed node-data structure to include a top-level
key `branches` as described in [1] and the test data added here [2].
This data is exported in the appropriate format for Auspice (unchanged).
This paves the way for pipelines to define a range of branch labels for
export. Currently the only usable key in this dict is 'labels'.

If a branch label (via node-data-json -> branches -> node_name -> label)
is provided for 'aa' or 'clade' then this will overwrite the values
generated above (i, ii).

A side-effect of this work is that the requirement for node-data JSONs
to specify "nodes" has been relaxed (see [2] for an example); however
if neither "nodes" nor "branches" are defined then we raise a validation
error.

[1] https://github.com/nextstrain/augur/issues/720
[2] ./tests/functional/export_v2/branch-labels.json
---
 augur/export_v2.py                            | 129 ++++++++------
 augur/util_support/node_data_file.py          |  16 +-
 tests/functional/export_v2.t                  |  11 ++
 tests/functional/export_v2/aa_muts_1.json     |  33 ++++
 tests/functional/export_v2/branch-labels.json |  37 ++++
 .../export_v2/dataset-with-branch-labels.json | 162 ++++++++++++++++++
 tests/functional/export_v2/nt_muts_1.json     |  21 +++
 tests/util_support/test_node_data_reader.py   |   4 +-
 8 files changed, 361 insertions(+), 52 deletions(-)
 create mode 100644 tests/functional/export_v2/aa_muts_1.json
 create mode 100644 tests/functional/export_v2/branch-labels.json
 create mode 100644 tests/functional/export_v2/dataset-with-branch-labels.json
 create mode 100644 tests/functional/export_v2/nt_muts_1.json

diff --git a/augur/export_v2.py b/augur/export_v2.py
index 3f606c797..4c1988a11 100644
--- a/augur/export_v2.py
+++ b/augur/export_v2.py
@@ -151,23 +151,15 @@ def convert_tree_to_json_structure(node, metadata, div=0):
 
     return node_struct
 
-def are_mutations_defined(node_attrs):
-    for node, data in node_attrs.items():
-        if data.get("aa_muts") or data.get("muts"):
-            return True
-    return False
-
-
-def are_clades_defined(node_attrs):
-    for node, data in node_attrs.items():
-        if data.get("clade_membership") or data.get("clade_annotation"):
+def are_mutations_defined(branch_attrs):
+    for branch in branch_attrs.values():
+        if branch.get("mutations"):
             return True
     return False
 
-
-def are_dates_defined(node_attrs):
+def is_node_attr_defined(node_attrs, attr_name):
     for node, data in node_attrs.items():
-        if data.get("num_date"):
+        if data.get(attr_name):
             return True
     return False
 
@@ -223,7 +215,7 @@ def get_config_colorings_as_dict(config):
     return config_colorings
 
 
-def set_colorings(data_json, config, command_line_colorings, metadata_names, node_data_colorings, provided_colors, node_attrs):
+def set_colorings(data_json, config, command_line_colorings, metadata_names, node_data_colorings, provided_colors, node_attrs, branch_attrs):
 
     def _get_type(key, trait_values):
         # for some keys we know what the type must be
@@ -372,7 +364,7 @@ def _create_coloring(key):
     def _is_valid(coloring):
         key = coloring["key"]
         trait_values = get_values_across_nodes(node_attrs, key) # e.g. list of countries, regions etc
-        if key == "gt" and not are_mutations_defined(node_attrs):
+        if key == "gt" and not are_mutations_defined(branch_attrs):
             warn("[colorings] You asked for mutations (\"gt\"), but none are defined on the tree. They cannot be used as a coloring.")
             return False
         if key != "gt" and not trait_values:
@@ -411,11 +403,11 @@ def _get_colorings():
 
         explicitly_defined_colorings = [x["key"] for x in colorings]
         # add in genotype as a special case if (a) not already set and (b) the data supports it
-        if "gt" not in explicitly_defined_colorings and are_mutations_defined(node_attrs):
+        if "gt" not in explicitly_defined_colorings and are_mutations_defined(branch_attrs):
             colorings.insert(0,{'key':'gt'})
-        if "num_date" not in explicitly_defined_colorings and are_dates_defined(node_attrs):
+        if "num_date" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "num_date"):
             colorings.insert(0,{'key':'num_date'})
-        if "clade_membership" not in explicitly_defined_colorings and are_clades_defined(node_attrs):
+        if "clade_membership" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "clade_membership"):
             colorings.insert(0,{'key':'clade_membership'})
 
         return colorings
@@ -702,10 +694,23 @@ def node_to_author_tuple(data):
 
     return node_author_info
 
+def set_branch_attrs_on_tree(data_json, branch_attrs):
+    """
+    Shifts the provided `branch_attrs` onto the (auspice) `data_json`.
+    Currently all data is transferred, there is no way for (e.g.) the set of exported
+    labels to be restricted by the user in a config.
+    """
+    def _recursively_set_data(node):
+        if branch_attrs.get(node['name'], {}):
+            node['branch_attrs'] = branch_attrs[node['name']]
+        for child in node.get("children", []):
+            _recursively_set_data(child)
+    _recursively_set_data(data_json["tree"])
+
 
 def set_node_attrs_on_tree(data_json, node_attrs):
     '''
-    Assign desired colorings, metadata etc to the tree structure
+    Assign desired colorings, metadata etc to the `node_attrs` of nodes in the tree
 
     Parameters
     ----------
@@ -716,33 +721,10 @@ def set_node_attrs_on_tree(data_json, node_attrs):
 
     author_data = create_author_data(node_attrs)
 
-    def _transfer_mutations(node, raw_data):
-        if "aa_muts" in raw_data or "muts" in raw_data:
-            node["branch_attrs"]["mutations"] = {}
-            if "muts" in raw_data and len(raw_data["muts"]):
-                node["branch_attrs"]["mutations"]["nuc"] = raw_data["muts"]
-            if "aa_muts" in raw_data:
-                aa = {gene:data for gene, data in raw_data["aa_muts"].items() if len(data)}
-                node["branch_attrs"]["mutations"].update(aa)
-                #convert mutations into a label
-                if aa:
-                    aa_lab = '; '.join("{!s}: {!s}".format(key,', '.join(val)) for (key,val) in aa.items())
-                    if 'labels' in node["branch_attrs"]:
-                        node["branch_attrs"]["labels"]["aa"] = aa_lab
-                    else:
-                        node["branch_attrs"]["labels"] = { "aa": aa_lab }
-
     def _transfer_vaccine_info(node, raw_data):
         if raw_data.get("vaccine"):
             node["node_attrs"]['vaccine'] = raw_data['vaccine']
 
-    def _transfer_labels(node, raw_data):
-        if "clade_annotation" in raw_data and is_valid(raw_data["clade_annotation"]):
-            if 'labels' in node["branch_attrs"]:
-                node["branch_attrs"]["labels"]['clade'] = raw_data["clade_annotation"]
-            else:
-                node["branch_attrs"]["labels"] = { "clade": raw_data["clade_annotation"] }
-
     def _transfer_hidden_flag(node, raw_data):
         hidden = raw_data.get("hidden", None)
         if hidden:
@@ -791,9 +773,7 @@ def _recursively_set_data(node):
         # get all the available information for this particular node
         raw_data = node_attrs[node["name"]]
         # transfer "special cases"
-        _transfer_mutations(node, raw_data)
         _transfer_vaccine_info(node, raw_data)
-        _transfer_labels(node, raw_data)
         _transfer_hidden_flag(node, raw_data)
         _transfer_num_date(node, raw_data)
         _transfer_url_accession(node, raw_data)
@@ -810,8 +790,6 @@ def node_data_prop_is_normal_trait(name):
     # those traits / keys / attrs which are not "special" and can be exported
     # as normal attributes on nodes
     excluded = [
-        "clade_annotation", # Clade annotation is label, not colorby!
-        "clade_membership", # will be auto-detected if it is available
         "authors",          # authors are set as a node property, not a trait property
         "author",           # see above
         "vaccine",          # vaccine info is stored as a "special" node prop
@@ -983,6 +961,45 @@ def set_description(data_json, cmd_line_description_file):
     except FileNotFoundError:
         fatal("Provided desciption file {} does not exist".format(cmd_line_description_file))
 
+def create_branch_mutations(branch_attrs, node_data):
+    for node_name, node_info in node_data['nodes'].items():
+        if node_name not in branch_attrs:
+            continue # strain name not in the tree
+        if "aa_muts" not in node_info and "muts" not in node_info:
+            continue
+        branch_attrs[node_name]['mutations'] = {}
+        if "muts" in node_info and len(node_info["muts"]):
+            branch_attrs[node_name]["mutations"]["nuc"] = node_info["muts"]
+        if "aa_muts" in node_info:
+            aa = {gene:data for gene, data in node_info["aa_muts"].items() if len(data)}
+            branch_attrs[node_name]["mutations"].update(aa)
+
+def create_branch_labels(branch_attrs, node_data, branch_data):
+    ## start by creating the 'aa' branch label, summarising any amino acid mutations.
+    ## (We have already set mutations on 'branch_attrs' if they exist, just not the label)
+    ## This is done first so that if the user defines their own 'aa' labels they will
+    ## overwrite the ones created here
+    for branch_info in branch_attrs.values():
+        genes = [gene for gene in branch_info.get('mutations', {}) if gene!='nuc']
+        if len(genes):
+            branch_info['labels']['aa'] = \
+                '; '.join(f"{gene}: {', '.join(branch_info['mutations'][gene])}" for gene in genes)
+
+    ## check for the special key 'clade_annotation' defined via node data.
+    ## For historical reasons, this is interpreted as a branch label 'clade'
+    for node_name, node_info in node_data.items():
+        if node_name in branch_attrs and "clade_annotation" in node_info and is_valid(node_info["clade_annotation"]):
+            branch_attrs[node_name]['labels']['clade'] = node_info["clade_annotation"]
+
+    ## finally transfer any labels defined via <NODE DATA JSON> -> 'branches' -> labels
+    for node_name, branch_info in branch_data.items():
+        if node_name not in branch_attrs:
+            continue
+        for label_key, label_value in branch_info.get('labels', {}).items():
+            if label_key.upper() == "NONE" or not is_valid(label_value):
+                continue
+            branch_attrs[node_name]["labels"][label_key] = label_value
+
 def parse_node_data_and_metadata(T, node_data, metadata):
     node_data_names = set()
     metadata_names = set()
@@ -999,14 +1016,24 @@ def parse_node_data_and_metadata(T, node_data, metadata):
                 metadata_names.add(corrected_key)
 
     # second pass: node data JSONs (overwrites keys of same name found in metadata)
+    node_attrs_which_are_actually_branch_attrs = ["clade_annotation", "aa_muts", "muts"]
     for name, info in node_data['nodes'].items():
         if name in node_attrs: # i.e. this node name is in the tree
             for key, value in info.items():
+                if key in node_attrs_which_are_actually_branch_attrs:
+                    continue # these will be handled below
                 corrected_key = update_deprecated_names(key)
                 node_attrs[name][corrected_key] = value
                 node_data_names.add(corrected_key)
 
-    return (node_data, node_attrs, node_data_names, metadata_names)
+    # third pass: create `branch_attrs`. The data comes from
+    # (a) some keys within `node_data['nodes']` (for legacy reasons)
+    # (b) the `node_data['branches']` dictionary, which currently only defines labels
+    branch_attrs = {clade.name: defaultdict(dict) for clade in T.root.find_clades()}
+    create_branch_mutations(branch_attrs, node_data)
+    create_branch_labels(branch_attrs, node_data['nodes'], node_data.get('branches', {}))
+
+    return (node_data, node_attrs, node_data_names, metadata_names, branch_attrs)
 
 def get_config(args):
     if not args.auspice_config:
@@ -1054,7 +1081,8 @@ def run(args):
 
     # parse input files
     T = Phylo.read(args.tree, 'newick')
-    node_data, node_attrs, node_data_names, metadata_names = parse_node_data_and_metadata(T, node_data_file, metadata_file)
+    node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \
+            parse_node_data_and_metadata(T, node_data_file, metadata_file)
     config = get_config(args)
 
     # set metadata data structures
@@ -1074,7 +1102,8 @@ def run(args):
             metadata_names=metadata_names,
             node_data_colorings=node_data_names,
             provided_colors=read_colors(args.colors),
-            node_attrs=node_attrs
+            node_attrs=node_attrs,
+            branch_attrs=branch_attrs
         )
     except FileNotFoundError as e:
         print(f"ERROR: required file could not be read: {e}")
@@ -1084,6 +1113,8 @@ def run(args):
     # set tree structure
     data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs)
     set_node_attrs_on_tree(data_json, node_attrs)
+    set_branch_attrs_on_tree(data_json, branch_attrs)
+
     set_geo_resolutions(data_json, config, args.geo_resolutions, read_lat_longs(args.lat_longs), node_attrs)
     set_panels(data_json, config, args.panels)
     set_data_provenance(data_json, config)
diff --git a/augur/util_support/node_data_file.py b/augur/util_support/node_data_file.py
index 5d389771e..39479fb09 100644
--- a/augur/util_support/node_data_file.py
+++ b/augur/util_support/node_data_file.py
@@ -27,7 +27,12 @@ def annotations(self):
 
     @property
     def nodes(self):
-        return self.attrs.get("nodes")
+        return self.attrs.get("nodes", {})
+
+    @property
+    def branches(self):
+        # these are optional, so we provide an empty dict as a default
+        return self.attrs.get("branches", {})
 
     @property
     def generated_by(self):
@@ -73,6 +78,15 @@ def validate(self):
                 f"`nodes` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!"
             )
 
+        if not isinstance(self.branches, dict):
+            raise AugurError(
+                f"`branches` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!"            )
+
+        if not self.nodes and not self.branches:
+            raise AugurError(
+                f"{self.fname} did not contain either `nodes` or `branches`. Please check the formatting of this JSON!"
+        )
+
         if self.validation_mode is not ValidationMode.SKIP and self.is_generated_by_incompatible_augur:
             msg = (
                 f"Augur version incompatibility detected: the JSON {self.fname} was generated by "
diff --git a/tests/functional/export_v2.t b/tests/functional/export_v2.t
index b9eb06e94..1d78bb057 100644
--- a/tests/functional/export_v2.t
+++ b/tests/functional/export_v2.t
@@ -166,5 +166,16 @@ Run export with metadata and external colors TSV that contains zero values.
   {}
   $ rm -f "$TMP/dataset6.json"
 
+Test that attributes are correctly exported as branch_attrs. Currently this includes branch labels (node_data→branches),
+mutations (node_data→nodes) and a historical node_data→nodes→<name>→clade_annotation branch label.
+  $ ${AUGUR} export v2 \
+  >   --tree export_v2/tree.nwk \
+  >   --node-data export_v2/div_node-data.json export_v2/nt_muts_1.json export_v2/aa_muts_1.json export_v2/branch-labels.json \
+  >   --maintainers "Nextstrain Team" \
+  >   --output "$TMP/dataset-with-branch-labels.json" > /dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py"  export_v2/dataset-with-branch-labels.json "$TMP/dataset-with-branch-labels.json" \
+  >   --exclude-paths "root['meta']['updated']"
+  {}
 
   $ popd > /dev/null
diff --git a/tests/functional/export_v2/aa_muts_1.json b/tests/functional/export_v2/aa_muts_1.json
new file mode 100644
index 000000000..fe9dc9fa6
--- /dev/null
+++ b/tests/functional/export_v2/aa_muts_1.json
@@ -0,0 +1,33 @@
+{
+  "annotations": {
+    "gene1": {
+      "end": 150,
+      "start": 50,
+      "strand": "+"
+    },
+    "gene2": {
+      "end": 300,
+      "start": 200,
+      "strand": "+"
+    },
+    "nuc": {
+      "end": 500,
+      "start": 1,
+      "strand": "+"
+    }
+  },
+  "nodes": {
+    "internalBC": {
+      "aa_muts": {
+        "gene1": ["S10G", "P20S"],
+        "gene2": []
+      }
+    },
+    "internalDEF": {
+      "aa_muts": {
+        "gene1": ["P20S"],
+        "gene2": []
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/functional/export_v2/branch-labels.json b/tests/functional/export_v2/branch-labels.json
new file mode 100644
index 000000000..d0595832e
--- /dev/null
+++ b/tests/functional/export_v2/branch-labels.json
@@ -0,0 +1,37 @@
+{
+  "nodes": {
+    "tipD": {
+      "clade_membership": "membership D",
+      "clade_annotation": "set via nodes→clade_annotation"
+    },
+    "tipC": {
+      "clade_membership": "membership C",
+      "clade_annotation": "this should be overwritten by custom clade label"
+    }
+  },
+  "branches": {
+    "ROOT": {
+      "labels": {
+        "fruit": "apple"
+      }
+    },
+    "tipA": {
+      "labels": {
+        "fruit": "orange"
+      }
+    },
+    "tipC": {
+      "labels": {
+        "clade": "clade C",
+        "none": "this will not be exported"
+      }
+    },
+    "internalBC": {
+      "labels": {
+        "fruit": "pomegranate",
+        "vegetable": "pumpkin",
+        "aa": "custom aa label"
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/functional/export_v2/dataset-with-branch-labels.json b/tests/functional/export_v2/dataset-with-branch-labels.json
new file mode 100644
index 000000000..d09dfb61a
--- /dev/null
+++ b/tests/functional/export_v2/dataset-with-branch-labels.json
@@ -0,0 +1,162 @@
+{
+  "version": "v2",
+  "meta": {
+    "maintainers": [
+      {
+        "name": "Nextstrain Team"
+      }
+    ],
+    "genome_annotations": {
+      "nuc": {
+        "end": 500,
+        "start": 1,
+        "strand": "+"
+      },
+      "gene1": {
+        "end": 150,
+        "start": 50,
+        "strand": "+"
+      },
+      "gene2": {
+        "end": 300,
+        "start": 200,
+        "strand": "+"
+      }
+    },
+    "colorings": [
+      {
+        "key": "gt",
+        "title": "Genotype",
+        "type": "categorical"
+      },
+      {
+        "key": "clade_membership",
+        "title": "Clade",
+        "type": "categorical"
+      }
+    ],
+    "filters": [
+      "clade_membership"
+    ],
+    "panels": [
+      "tree",
+      "entropy"
+    ],
+    "updated": "2022-09-09"
+  },
+  "tree": {
+    "name": "ROOT",
+    "node_attrs": {
+      "div": 0
+    },
+    "branch_attrs": {
+      "labels": {
+        "fruit": "apple"
+      }
+    },
+    "children": [
+      {
+        "name": "tipA",
+        "branch_attrs": {
+          "labels": {
+            "fruit": "orange"
+          }
+        },
+        "node_attrs": {
+          "div": 1
+        }
+      },
+      {
+        "name": "internalBC",
+        "node_attrs": {
+          "div": 2
+        },
+        "branch_attrs": {
+          "mutations": {
+            "gene1": [
+              "S10G",
+              "P20S"
+            ]
+          },
+          "labels": {
+            "aa": "custom aa label",
+            "fruit": "pomegranate",
+            "vegetable": "pumpkin"
+          }
+        },
+        "children": [
+          {
+            "name": "tipB",
+            "node_attrs": {
+              "div": 3
+            },
+            "branch_attrs": {}
+          },
+          {
+            "name": "tipC",
+            "node_attrs": {
+              "div": 3,
+              "clade_membership": {
+                "value": "membership C"
+              }
+            },
+            "branch_attrs": {
+              "labels": {
+                "clade": "clade C"
+              }
+            }
+          }
+        ]
+      },
+      {
+        "name": "internalDEF",
+        "node_attrs": {
+          "div": 5
+        },
+        "branch_attrs": {
+          "mutations": {
+            "nuc": [
+              "A400T"
+            ],
+            "gene1": [
+              "P20S"
+            ]
+          },
+          "labels": {
+            "aa": "gene1: P20S"
+          }
+        },
+        "children": [
+          {
+            "name": "tipD",
+            "node_attrs": {
+              "div": 8,
+              "clade_membership": {
+                "value": "membership D"
+              }
+            },
+            "branch_attrs": {
+              "labels": {
+                "clade": "set via nodes\u2192clade_annotation"
+              }
+            }
+          },
+          {
+            "name": "tipE",
+            "node_attrs": {
+              "div": 9
+            },
+            "branch_attrs": {}
+          },
+          {
+            "name": "tipF",
+            "node_attrs": {
+              "div": 6
+            },
+            "branch_attrs": {}
+          }
+        ]
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/tests/functional/export_v2/nt_muts_1.json b/tests/functional/export_v2/nt_muts_1.json
new file mode 100644
index 000000000..9f0662309
--- /dev/null
+++ b/tests/functional/export_v2/nt_muts_1.json
@@ -0,0 +1,21 @@
+{
+  "annotations": {
+    "nuc": {
+      "end": 500,
+      "start": 1,
+      "strand": "+"
+    }
+  },
+  "nodes": {
+    "internalDEF": {
+      "muts": [
+        "A400T"
+      ]
+    },
+    "not_in_tree": {
+      "muts": [
+        "A100T"
+      ]
+    }
+  }
+}
diff --git a/tests/util_support/test_node_data_reader.py b/tests/util_support/test_node_data_reader.py
index 889a862b8..837a8d695 100644
--- a/tests/util_support/test_node_data_reader.py
+++ b/tests/util_support/test_node_data_reader.py
@@ -60,7 +60,7 @@ def test_read_dict_nonuniformity(self, tmpdir, prepare_file):
             "file1.json",
             """
             {
-                "nodes": {},
+                "nodes": {"node_name": "some_value"},
                 "a": {}
             }
             """,
@@ -69,7 +69,7 @@ def test_read_dict_nonuniformity(self, tmpdir, prepare_file):
             "file2.json",
             """
             {
-                "nodes": {},
+                "nodes": {"node_name": "some_other_value"},
                 "a": "nah"
             }
             """,

From 5ba7cf10e3742c46410542c264801070c1ca4432 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Fri, 9 Sep 2022 17:02:02 +1200
Subject: [PATCH 02/12] [clades] export labels as specific branch labels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously clade membership (i.e. the coloring) and the branch labels
defining the root of the clade were defined via:
<OUTPUT_NODE_DATA> → nodes → <node_name> → clade_membership, and
<OUTPUT_NODE_DATA> → nodes → <node_name> → clade_annotation.
`augur export` would then convert the clade_annotation into a
branch label named 'clade'.

Here we change the format of augur clade's OUTPUT_NODE_DATA so
that the membership and labels are now stored via:
<OUTPUT_NODE_DATA> → nodes → <node_name> → clade_membership, and
<OUTPUT_NODE_DATA> → branches → <node_name> → labels → clade.
The previous commit modified augur export to handle this format.

Augur pipelines should be fully backwards compatible as long as a new
major version of augur is released, as we ensure that node-data files
are created by the same augur (major) version. Scripts which relied on
the format of this node-data file may be affected.

Note that we keep the key 'clade_membership' deliberately: this is
used in auspice-config JSONs and auspice URLs, and so changing it will
cause lots of downstream issues for a minimal syntax improvement. (The
`clade_annotation` key name was never exported in auspice JSONs.)

This commit paves the way for allowing custom key names.
---
 augur/clades.py                     | 48 ++++++++++++++++++++---------
 tests/functional/clades/clades.json | 26 +++++++++++++---
 2 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/augur/clades.py b/augur/clades.py
index acfb4c301..82048ba28 100644
--- a/augur/clades.py
+++ b/augur/clades.py
@@ -1,5 +1,10 @@
 """
 Assign clades to nodes in a tree based on amino-acid or nucleotide signatures.
+
+Membership of a clade (i.e. the coloring) will be stored in
+<OUTPUT_NODE_DATA> → nodes → <node_name> → clade_membership.
+Branch labels defining the clade will be stored in
+<OUTPUT_NODE_DATA> → branches → <node_name> → labels → clade.
 """
 
 import sys
@@ -152,8 +157,8 @@ def is_node_in_clade(clade_alleles, node, ref):
 def assign_clades(clade_designations, all_muts, tree, ref=None):
     '''
     Ensures all nodes have an entry (or auspice doesn't display nicely), tests each node
-    to see if it's the first member of a clade (assigns 'clade_annotation'), and sets
-    all nodes's clade_membership to the value of their parent. This will change if later found to be
+    to see if it's the first member of a clade (this is the label), and sets the membership of each
+    node to the value of their parent. This will change if later found to be
     the first member of a clade.
 
     Parameters
@@ -169,16 +174,18 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
 
     Returns
     -------
-    dict
-        mapping of node to clades
+    (dict, dict)
+        [0]: mapping of node to clade membership (where applicable)
+        [1]: mapping of node to clade label (where applicable)
     '''
 
     clade_membership = {}
+    clade_labels = {}
     parents = get_parent_name_by_child_name_for_tree(tree)
 
     # first pass to set all nodes to unassigned as precaution to ensure attribute is set
     for node in tree.find_clades(order = 'preorder'):
-        clade_membership[node.name] = {'clade_membership': 'unassigned'}
+        clade_membership[node.name] = 'unassigned'
 
     # count leaves
     for node in tree.find_clades(order = 'postorder'):
@@ -208,7 +215,7 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
                     node.sequences[gene][pos] = d
 
 
-    # second pass to assign 'clade_annotation' to basal nodes within each clade
+    # second pass to assign basal nodes within each clade to the clade_labels dict
     # if multiple nodes match, assign annotation to largest
     # otherwise occasional unwanted cousin nodes get assigned the annotation
     for clade_name, clade_alleles in clade_designations.items():
@@ -219,16 +226,17 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
         sorted_nodes = sorted(node_counts, key=lambda x: x.leaf_count, reverse=True)
         if len(sorted_nodes) > 0:
             target_node = sorted_nodes[0]
-            clade_membership[target_node.name] = {'clade_annotation': clade_name, 'clade_membership': clade_name}
+            clade_membership[target_node.name] = clade_name
+            clade_labels[target_node.name] = clade_name
 
-    # third pass to propagate 'clade_membership'
-    # don't propagate if encountering 'clade_annotation'
+    # third pass to propagate clade_membership to descendant nodes
+    # (until we encounter a node with its own clade_membership)
     for node in tree.find_clades(order = 'preorder'):
         for child in node:
-            if 'clade_annotation' not in clade_membership[child.name]:
-                clade_membership[child.name]['clade_membership'] = clade_membership[node.name]['clade_membership']
+            if child.name not in clade_labels:
+                clade_membership[child.name] = clade_membership[node.name]
 
-    return clade_membership
+    return (clade_membership, clade_labels)
 
 
 def get_reference_sequence_from_root_node(all_muts, root_name):
@@ -279,8 +287,18 @@ def run(args):
 
     clade_designations = read_in_clade_definitions(args.clades)
 
-    clade_membership = assign_clades(clade_designations, all_muts, tree, ref)
+    membership, labels = assign_clades(clade_designations, all_muts, tree, ref)
+
+    membership_key="clade_membership"
+    label_key = "clade"
+
+    node_data_json = {
+        'nodes': {node: {membership_key: clade} for node,clade in membership.items()},
+        'branches': {node: {'labels': {label_key: label}} for node,label in labels.items()}
+    }
 
     out_name = get_json_name(args)
-    write_json({'nodes': clade_membership}, out_name)
-    print("clades written to", out_name, file=sys.stdout)
+    write_json(node_data_json, out_name)
+    print(f"Clades written to {out_name}", file=sys.stdout)
+    print(f"\tClade membership was stored on nodes → <node_name> → {membership_key}", file=sys.stdout)
+    print(f"\tClade labels were stored on branches → <node_name> → labels → {label_key}", file=sys.stdout)
diff --git a/tests/functional/clades/clades.json b/tests/functional/clades/clades.json
index bcb32caa4..3d8eae6ed 100644
--- a/tests/functional/clades/clades.json
+++ b/tests/functional/clades/clades.json
@@ -23,18 +23,15 @@
       "clade_membership": "A.1"
     },
     "NODE_0000001": {
-      "clade_annotation": "B",
       "clade_membership": "B"
     },
     "NODE_0000002": {
       "clade_membership": "A"
     },
     "NODE_0000003": {
-      "clade_annotation": "A",
       "clade_membership": "A"
     },
     "NODE_0000004": {
-      "clade_annotation": "A.1",
       "clade_membership": "A.1"
     },
     "NODE_0000005": {
@@ -47,7 +44,6 @@
       "clade_membership": "B"
     },
     "NODE_0000008": {
-      "clade_annotation": "B.1",
       "clade_membership": "B.1"
     },
     "PAN/CDC_259359_V1_V3/2015": {
@@ -62,5 +58,27 @@
     "ZKC2/2016": {
       "clade_membership": "A"
     }
+  },
+  "branches": {
+    "NODE_0000001": {
+      "labels": {
+        "clade": "B"
+      }
+    },
+    "NODE_0000003": {
+      "labels": {
+        "clade": "A"
+      }
+    },
+    "NODE_0000004": {
+      "labels": {
+        "clade": "A.1"
+      }
+    },
+    "NODE_0000008": {
+      "labels": {
+        "clade": "B.1"
+      }
+    }
   }
 }
\ No newline at end of file

From fd88aa7a891826094a39686a50e82a91f3ef6842 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Fri, 9 Sep 2022 17:43:32 +1200
Subject: [PATCH 03/12] [clades] allow custom membership / label names

These arguments shouldn't need to be used in most cases but are really
useful for pipelines which run `augur clades` multiple times (e.g.
nCoV's emerging lineages). This will allow _n_ node-data files to be
passed to `augur export` with a resulting _n_ colorings and labels.
(Currently you need multiple extra steps: the node-data JSON needs to
have the key names changed, and then you need to manually set branch
labels in the auspice JSON.)
---
 augur/clades.py           | 29 ++++++++++++++++++-----------
 tests/functional/clades.t | 30 +++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/augur/clades.py b/augur/clades.py
index 82048ba28..97f004ef7 100644
--- a/augur/clades.py
+++ b/augur/clades.py
@@ -1,10 +1,14 @@
 """
 Assign clades to nodes in a tree based on amino-acid or nucleotide signatures.
 
-Membership of a clade (i.e. the coloring) will be stored in
-<OUTPUT_NODE_DATA> → nodes → <node_name> → clade_membership.
-Branch labels defining the clade will be stored in
+Nodes which are members of a clade are stored via 
+<OUTPUT_NODE_DATA> → nodes → <node_name> → clade_membership
+and if this file is used in `augur export v2` these will automatically become a coloring.
+
+The basal nodes of each clade are also given a branch label which is stored via
 <OUTPUT_NODE_DATA> → branches → <node_name> → labels → clade.
+
+The keys "clade_membership" and "clade" are customisable via command line arguments.
 """
 
 import sys
@@ -263,6 +267,8 @@ def register_parser(parent_subparsers):
     parser.add_argument('--reference', nargs='+', help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ')
     parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
     parser.add_argument('--output-node-data', type=str, help='name of JSON file to save clade assignments to')
+    parser.add_argument('--membership-name', type=str, default="clade_membership", help='Key to store clade membership under; use "None" to not export this')
+    parser.add_argument('--label-name', type=str, default="clade", help='Key to store clade labels under; use "None" to not export this')
     return parser
 
 
@@ -289,16 +295,17 @@ def run(args):
 
     membership, labels = assign_clades(clade_designations, all_muts, tree, ref)
 
-    membership_key="clade_membership"
-    label_key = "clade"
+    membership_key= args.membership_name if args.membership_name.upper() != "NONE" else None
+    label_key= args.label_name if args.label_name.upper() != "NONE" else None
 
-    node_data_json = {
-        'nodes': {node: {membership_key: clade} for node,clade in membership.items()},
-        'branches': {node: {'labels': {label_key: label}} for node,label in labels.items()}
-    }
+    node_data_json = {}
+    if membership_key:
+        node_data_json['nodes'] = {node: {membership_key: clade} for node,clade in membership.items()}
+        print(f"Clade membership stored on nodes → <node_name> → {membership_key}", file=sys.stdout)
+    if label_key:
+        node_data_json['branches'] = {node: {'labels': {label_key: label}} for node,label in labels.items()}
+        print(f"Clade labels stored on branches → <node_name> → labels → {label_key}", file=sys.stdout)
 
     out_name = get_json_name(args)
     write_json(node_data_json, out_name)
     print(f"Clades written to {out_name}", file=sys.stdout)
-    print(f"\tClade membership was stored on nodes → <node_name> → {membership_key}", file=sys.stdout)
-    print(f"\tClade labels were stored on branches → <node_name> → labels → {label_key}", file=sys.stdout)
diff --git a/tests/functional/clades.t b/tests/functional/clades.t
index 619c35d56..c3081c29e 100644
--- a/tests/functional/clades.t
+++ b/tests/functional/clades.t
@@ -13,4 +13,32 @@ Test augur clades with simple Zika input files and hierarchical clades.
 
   $ python3 "$TESTDIR/../../scripts/diff_jsons.py"  clades/clades.json "$TMP/clades.json" \
   >   --exclude-paths "root['generated_by']"
-  {}
\ No newline at end of file
+  {}
+
+Test custom membership key + label key. The only change should be the key names
+
+  $ ${AUGUR} clades \
+  >   --tree clades/tree.nwk \
+  >   --mutations clades/aa_muts.json clades/nt_muts_small.json \
+  >   --clades clades/clades.tsv \
+  >   --membership-name lineage --label-name origin \
+  >   --output-node-data "$TMP/clades_custom.json" &>/dev/null
+
+  $ cat "$TMP/clades_custom.json" | sed "s/lineage/clade_membership/" | sed "s/origin/clade/" > "$TMP/clades_sed.json"
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/clades.json "$TMP/clades_sed.json" \
+  >   --exclude-paths "root['generated_by']"
+  {}
+
+Test the ability to _not_ export a branch label (same logic as not exporting the membership)
+
+  $ ${AUGUR} clades \
+  >   --tree clades/tree.nwk \
+  >   --mutations clades/aa_muts.json clades/nt_muts_small.json \
+  >   --clades clades/clades.tsv \
+  >   --label-name none \
+  >   --output-node-data "$TMP/clades_no-labels.json" &>/dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/clades.json "$TMP/clades_no-labels.json" \
+  >   --exclude-paths "root['generated_by']"
+  {'dictionary_item_removed': [root['branches']]}

From 007cb474082e5812af77a20270a5285c2abf9b9b Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Mon, 12 Sep 2022 14:51:51 +1200
Subject: [PATCH 04/12] update changes.md and docs

---
 CHANGES.md         | 4 ++++
 docs/faq/clades.md | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 79f52910e..28d3fed2f 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -11,11 +11,15 @@
 * Constrain `bcbio-gff` to >=0.7.0 and allow `Biopython` >=1.81 again. We had to introduce the `Biopython` constraint in v21.0.1 (see [#1152][]) due to `bcbio-gff` <0.7.0 relying on the removed `Biopython` feature `UnknownSeq`. [#1178][] (@corneliusroemer)
 * `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this used the Python parser engine for [`pandas.read_csv()`][]. Updated to use the C engine for faster reading of metadata. [#812][] (@victorlin)
 
+* clades, export v2: Clade labels + coloring keys are now definable via arguments to augur clades allowing pipelines to use multiple invocations of augur clades resulting in multiple sets of colors and branch labels. How labels are stored in the (intermediate) node-data JSON files has changed. This should be fully backwards compatible for pipelines using augur commands, however custom scripts may need updating. PR [#728][] (@jameshadfield)
+
+
 ### Bug fixes
 
 * filter, frequencies, refine, parse: Previously, ambiguous dates in the future had a limit of today's date imposed on the upper value but not the lower value. It is now imposed on the lower value as well. [#1171][] (@victorlin)
 * refine: `--year-bounds` was ignored in versions 9.0.0 through 20.0.0. It now works. [#1136][] (@victorlin)
 
+[#728]: https://github.com/nextstrain/augur/pull/728
 [#812]: https://github.com/nextstrain/augur/pull/812
 [#1136]: https://github.com/nextstrain/augur/issues/1136
 [#1152]: https://github.com/nextstrain/augur/pull/1152
diff --git a/docs/faq/clades.md b/docs/faq/clades.md
index 35a1488b2..17c8f50cc 100644
--- a/docs/faq/clades.md
+++ b/docs/faq/clades.md
@@ -39,7 +39,7 @@ rule clades:
 As input, this command requires the tree, the output of the ancestral reconstruction steps and the translation step (assuming your clades are defined using translations), as well as the file with clade definitions.
 
 The output of this command is a json file in the common augur format that specifies `clade_membership` for each node in the tree.
-Nodes that didn't match any clade definition will be left `unassigned`.
-Internal nodes that form the root of clades will have an additional field `clade_annotation` that auspice uses to label branches in the tree.
+Nodes that didn't match any clade definition will be left unassigned.
+Internal nodes that form the root of each clade will also have a branch label assigned (you can prevent this by adding `--label-name none`).
 
 

From 4316a7dfca66f96879dc68d0cb97bbf8395fb79c Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Tue, 11 Apr 2023 16:48:50 +1200
Subject: [PATCH 05/12] [clades] allow node-data nodes to be a subset of tree
 nodes

Our current implementation of read_node_data requires that every
node in the tree is specified in the (merged) node_data files. For
mutations this is overkill -- many nodes don't have mutations and it's
overkill to require node_data JSONs to specify things like
`"node_name": {"muts": []}`.

This may well be the general behaviour we want, but i didn't want to
modify the read_node_data function which sees extensive use.

A welcome side effect of these changes is that we no longer have to
supply both nuc and aa_muts.
---
 augur/clades.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/augur/clades.py b/augur/clades.py
index 97f004ef7..3a32d7ee9 100644
--- a/augur/clades.py
+++ b/augur/clades.py
@@ -18,6 +18,7 @@
 from collections import defaultdict
 import networkx as nx
 from itertools import islice
+from .errors import AugurError
 from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name
 
 def read_in_clade_definitions(clade_file):
@@ -200,16 +201,16 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
             c.up=node
     tree.root.up = None
     tree.root.sequences = {'nuc':{}}
-    tree.root.sequences.update({gene:{} for gene in all_muts[tree.root.name]['aa_muts']})
+    tree.root.sequences.update({gene:{} for gene in all_muts.get(tree.root.name, {}).get('aa_muts', {})})
 
     # attach sequences to all nodes
     for node in tree.find_clades(order='preorder'):
         if node.up:
             node.sequences = {gene:muts.copy() for gene, muts in node.up.sequences.items()}
-        for mut in all_muts[node.name]['muts']:
+        for mut in all_muts.get(node.name, {}).get('muts', []):
             a, pos, d = mut[0], int(mut[1:-1])-1, mut[-1]
             node.sequences['nuc'][pos] = d
-        if 'aa_muts' in all_muts[node.name]:
+        if 'aa_muts' in all_muts.get(node.name, {}):
             for gene in all_muts[node.name]['aa_muts']:
                 for mut in all_muts[node.name]['aa_muts'][gene]:
                     a, pos, d = mut[0], int(mut[1:-1])-1, mut[-1]
@@ -251,7 +252,7 @@ def get_reference_sequence_from_root_node(all_muts, root_name):
     except:
         print("WARNING in augur.clades: nucleotide mutation json does not contain full sequences for the root node.")
 
-    if "aa_muts" in all_muts[root_name]:
+    if "aa_muts" in all_muts.get(root_name, {}):
         try:
             ref.update({gene:list(seq) for gene, seq in all_muts[root_name]["aa_sequences"].items()})
         except:
@@ -259,6 +260,18 @@ def get_reference_sequence_from_root_node(all_muts, root_name):
 
     return ref
 
+def parse_nodes(tree_file, node_data_files):
+    tree = Phylo.read(tree_file, 'newick')
+    # don't supply tree to read_node_data as we don't want to require that every node is present in the node_data JSONs
+    node_data = read_node_data(node_data_files)
+    # node_data files can be parsed without 'nodes' (if they have 'branches')
+    if "nodes" not in node_data or len(node_data['nodes'].keys())==0:
+        raise AugurError(f"No nodes found in the supplied node data files. Please check {', '.join(node_data_files)}")
+    json_nodes = set(node_data["nodes"].keys())
+    tree_nodes = set([clade.name for clade in tree.find_clades()])
+    if not json_nodes.issubset(tree_nodes):
+        raise AugurError(f"The following nodes in the node_data files ({', '.join(node_data_files)}) are not found in the tree ({tree_file}): {', '.join(json_nodes - tree_nodes)}")
+    return (tree, node_data['nodes'])
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("clades", help=__doc__)
@@ -273,13 +286,7 @@ def register_parser(parent_subparsers):
 
 
 def run(args):
-    ## read tree and data, if reading data fails, return with error code
-    tree = Phylo.read(args.tree, 'newick')
-    node_data = read_node_data(args.mutations, args.tree)
-    if node_data is None:
-        print("ERROR: could not read node data (incl sequences)")
-        return 1
-    all_muts = node_data['nodes']
+    (tree, all_muts) = parse_nodes(args.tree, args.mutations)
 
     if args.reference:
         # PLACE HOLDER FOR vcf WORKFLOW.

From 22e2444b630a683c42f464fa75ef4b4607861e2b Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Tue, 11 Apr 2023 18:01:38 +1200
Subject: [PATCH 06/12] [clades] tests for clades set at the root node

See comments in tests/functional/clades.t

Also adds / updates comments and docstrings which were noticed as I
worked through the code relating to these tests.
---
 augur/clades.py                               | 21 ++++++----
 tests/functional/clades.t                     | 41 +++++++++++++++++++
 tests/functional/clades/toy_clades_1.json     | 30 ++++++++++++++
 tests/functional/clades/toy_clades_2.json     | 35 ++++++++++++++++
 tests/functional/clades/toy_clades_nuc.tsv    |  3 ++
 .../toy_muts_explicit_root_mutation.json      | 10 +++++
 tests/functional/clades/toy_muts_no_ref.json  |  7 ++++
 tests/functional/clades/toy_muts_ref.json     | 10 +++++
 tests/functional/clades/toy_tree.nwk          |  1 +
 9 files changed, 151 insertions(+), 7 deletions(-)
 create mode 100644 tests/functional/clades/toy_clades_1.json
 create mode 100644 tests/functional/clades/toy_clades_2.json
 create mode 100644 tests/functional/clades/toy_clades_nuc.tsv
 create mode 100644 tests/functional/clades/toy_muts_explicit_root_mutation.json
 create mode 100644 tests/functional/clades/toy_muts_no_ref.json
 create mode 100644 tests/functional/clades/toy_muts_ref.json
 create mode 100644 tests/functional/clades/toy_tree.nwk

diff --git a/augur/clades.py b/augur/clades.py
index 3a32d7ee9..797e9ee16 100644
--- a/augur/clades.py
+++ b/augur/clades.py
@@ -124,7 +124,7 @@ def read_in_clade_definitions(clade_file):
     return clades
 
 
-def is_node_in_clade(clade_alleles, node, ref):
+def is_node_in_clade(clade_alleles, node, root_sequence):
     '''
     Determines whether a node matches the clade definition based on sequence
     For any condition, will first look in mutations stored in node.sequences,
@@ -133,11 +133,13 @@ def is_node_in_clade(clade_alleles, node, ref):
     Parameters
     ----------
     clade_alleles : list
-        list of clade defining alleles
+        list of clade defining alleles (typically supplied from the input TSV)
     node : Bio.Phylo.BaseTree.Clade
         node to check, assuming sequences (as mutations) are attached to node
-    ref : str or list
-        positions
+        node.sequences specifies nucleotides/codons which are newly observed on this node
+        i.e. they are the result of a mutation observed on the branch leading to this node
+    root_sequence : dict
+        {geneName: observed root sequence (list)}
 
     Returns
     -------
@@ -149,8 +151,8 @@ def is_node_in_clade(clade_alleles, node, ref):
     for gene, pos, clade_state in clade_alleles:
         if gene in node.sequences and pos in node.sequences[gene]:
             state = node.sequences[gene][pos]
-        elif ref and gene in ref:
-            state = ref[gene][pos]
+        elif root_sequence and gene in root_sequence:
+            state = root_sequence[gene][pos]
         else:
             state = ''
 
@@ -245,13 +247,18 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
 
 
 def get_reference_sequence_from_root_node(all_muts, root_name):
-    # attach sequences to root
+    """
+    Extracts the (nuc) sequence from the root node, if set, as well as
+    the (aa) sequences. Returns a dictionary of {geneName: rootSequence}
+    where rootSequence is a list and geneName may be 'nuc'.
+    """
     ref = {}
     try:
         ref['nuc'] = list(all_muts[root_name]["sequence"])
     except:
         print("WARNING in augur.clades: nucleotide mutation json does not contain full sequences for the root node.")
 
+    # note that we require "aa_muts" to be encoded on the root node, but we actually use the "aa_sequences" key.
     if "aa_muts" in all_muts.get(root_name, {}):
         try:
             ref.update({gene:list(seq) for gene, seq in all_muts[root_name]["aa_sequences"].items()})
diff --git a/tests/functional/clades.t b/tests/functional/clades.t
index c3081c29e..2b4bd759c 100644
--- a/tests/functional/clades.t
+++ b/tests/functional/clades.t
@@ -42,3 +42,44 @@ Test the ability to _not_ export a branch label (same logic as not exporting the
   $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/clades.json "$TMP/clades_no-labels.json" \
   >   --exclude-paths "root['generated_by']"
   {'dictionary_item_removed': [root['branches']]}
+
+
+A clade which exists at the root is not identified by inferring the root state
+(i.e. we don't infer the root state to be A if we observe a subsequent A10T mutation)
+This is an oversight and ideally would be fixed
+
+  $ ${AUGUR} clades \
+  >   --tree clades/toy_tree.nwk \
+  >   --mutations clades/toy_muts_no_ref.json \
+  >   --clades clades/toy_clades_nuc.tsv \
+  >   --output-node-data "$TMP/toy_clades_1.json" &>/dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_1.json "$TMP/toy_clades_1.json" \
+  >   --exclude-paths "root['generated_by']"
+  {}
+
+A clade which exists at the root is identified (and correctly propogated) if the root sequence
+is explicitly set.
+
+  $ ${AUGUR} clades \
+  >   --tree clades/toy_tree.nwk \
+  >   --mutations clades/toy_muts_ref.json \
+  >   --clades clades/toy_clades_nuc.tsv \
+  >   --output-node-data "$TMP/toy_clades_2a.json" &>/dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_2.json "$TMP/toy_clades_2a.json" \
+  >   --exclude-paths "root['generated_by']"
+  {}
+
+A clade which exists at the root is identified (and correctly propogated) without a root sequence
+if the (branch leading to the) root has the clade-defining mutation.
+
+  $ ${AUGUR} clades \
+  >   --tree clades/toy_tree.nwk \
+  >   --mutations clades/toy_muts_explicit_root_mutation.json \
+  >   --clades clades/toy_clades_nuc.tsv \
+  >   --output-node-data "$TMP/toy_clades_2b.json" &>/dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_2.json "$TMP/toy_clades_2b.json" \
+  >   --exclude-paths "root['generated_by']"
+  {}
\ No newline at end of file
diff --git a/tests/functional/clades/toy_clades_1.json b/tests/functional/clades/toy_clades_1.json
new file mode 100644
index 000000000..aa5a308e0
--- /dev/null
+++ b/tests/functional/clades/toy_clades_1.json
@@ -0,0 +1,30 @@
+{
+  "branches": {
+    "A": {
+      "labels": {
+        "clade": "clade_2"
+      }
+    }
+  },
+  "generated_by": {
+    "program": "augur",
+    "version": "21.1.0"
+  },
+  "nodes": {
+    "A": {
+      "clade_membership": "clade_2"
+    },
+    "AB": {
+      "clade_membership": "unassigned"
+    },
+    "B": {
+      "clade_membership": "unassigned"
+    },
+    "C": {
+      "clade_membership": "unassigned"
+    },
+    "ROOT": {
+      "clade_membership": "unassigned"
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/functional/clades/toy_clades_2.json b/tests/functional/clades/toy_clades_2.json
new file mode 100644
index 000000000..206a0560a
--- /dev/null
+++ b/tests/functional/clades/toy_clades_2.json
@@ -0,0 +1,35 @@
+{
+  "branches": {
+    "A": {
+      "labels": {
+        "clade": "clade_2"
+      }
+    },
+    "ROOT": {
+      "labels": {
+        "clade": "clade_1"
+      }
+    }
+  },
+  "generated_by": {
+    "program": "augur",
+    "version": "21.1.0"
+  },
+  "nodes": {
+    "A": {
+      "clade_membership": "clade_2"
+    },
+    "AB": {
+      "clade_membership": "clade_1"
+    },
+    "B": {
+      "clade_membership": "clade_1"
+    },
+    "C": {
+      "clade_membership": "clade_1"
+    },
+    "ROOT": {
+      "clade_membership": "clade_1"
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/functional/clades/toy_clades_nuc.tsv b/tests/functional/clades/toy_clades_nuc.tsv
new file mode 100644
index 000000000..0e1afa6e0
--- /dev/null
+++ b/tests/functional/clades/toy_clades_nuc.tsv
@@ -0,0 +1,3 @@
+clade	gene	site	alt
+clade_1	nuc	10	A
+clade_2	nuc	10	T
\ No newline at end of file
diff --git a/tests/functional/clades/toy_muts_explicit_root_mutation.json b/tests/functional/clades/toy_muts_explicit_root_mutation.json
new file mode 100644
index 000000000..f9587e89b
--- /dev/null
+++ b/tests/functional/clades/toy_muts_explicit_root_mutation.json
@@ -0,0 +1,10 @@
+{
+  "nodes": {
+    "ROOT": {
+      "muts": ["C10A"]
+    },
+    "A": {
+      "muts": ["A10T"]
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/functional/clades/toy_muts_no_ref.json b/tests/functional/clades/toy_muts_no_ref.json
new file mode 100644
index 000000000..f909383f7
--- /dev/null
+++ b/tests/functional/clades/toy_muts_no_ref.json
@@ -0,0 +1,7 @@
+{
+  "nodes": {
+    "A": {
+      "muts": ["A10T"]
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/functional/clades/toy_muts_ref.json b/tests/functional/clades/toy_muts_ref.json
new file mode 100644
index 000000000..35287cb8a
--- /dev/null
+++ b/tests/functional/clades/toy_muts_ref.json
@@ -0,0 +1,10 @@
+{
+  "nodes": {
+    "ROOT": {
+      "sequence": "AAAAAAAAAA"
+    },
+    "A": {
+      "muts": ["A10T"]
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/functional/clades/toy_tree.nwk b/tests/functional/clades/toy_tree.nwk
new file mode 100644
index 000000000..6bbdf908d
--- /dev/null
+++ b/tests/functional/clades/toy_tree.nwk
@@ -0,0 +1 @@
+((A:0.5,B:0.3)AB:0.8,C:0.3)ROOT:0.2;

From 0cb841d22e949c408e8ac3205a9e975366a4bcce Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Tue, 11 Apr 2023 19:07:35 +1200
Subject: [PATCH 07/12] [clades] supress unused --references arg

Workflows may be using this so I elected to hide it rather than remove
it (and warn people it's a no-op if they do happen to be using it)
---
 augur/clades.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/augur/clades.py b/augur/clades.py
index 797e9ee16..52df1055f 100644
--- a/augur/clades.py
+++ b/augur/clades.py
@@ -19,6 +19,7 @@
 import networkx as nx
 from itertools import islice
 from .errors import AugurError
+from argparse import SUPPRESS
 from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name
 
 def read_in_clade_definitions(clade_file):
@@ -284,7 +285,7 @@ def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("clades", help=__doc__)
     parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided")
     parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
-    parser.add_argument('--reference', nargs='+', help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ')
+    parser.add_argument('--reference', nargs='+', help=SUPPRESS)
     parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
     parser.add_argument('--output-node-data', type=str, help='name of JSON file to save clade assignments to')
     parser.add_argument('--membership-name', type=str, default="clade_membership", help='Key to store clade membership under; use "None" to not export this')
@@ -299,6 +300,7 @@ def run(args):
         # PLACE HOLDER FOR vcf WORKFLOW.
         # Works without a reference for now but can be added if clade defs contain positions
         # that are monomorphic across reference and sequence sample.
+        print(f"WARNING in augur.clades: You have provided a --reference file(s) ({args.reference}) however this is functionality is not yet supported.")
         ref = None
     else:
         # extract reference sequences from the root node entry in the mutation json

From 0aaf6a74661d9f564d4bce546fee4aeb586f463b Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Tue, 11 Apr 2023 19:37:34 +1200
Subject: [PATCH 08/12] [clades] improve reference sequence parsing

This function had a few subtle bugs in it which are fixed here, as well
as improving the warning message to explain how this may affect clade
inference.

Note that the presence of sequences on nodes other than the root is
not considered by augur clades.
---
 augur/clades.py | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/augur/clades.py b/augur/clades.py
index 52df1055f..9bc4898a3 100644
--- a/augur/clades.py
+++ b/augur/clades.py
@@ -254,17 +254,33 @@ def get_reference_sequence_from_root_node(all_muts, root_name):
     where rootSequence is a list and geneName may be 'nuc'.
     """
     ref = {}
-    try:
-        ref['nuc'] = list(all_muts[root_name]["sequence"])
-    except:
-        print("WARNING in augur.clades: nucleotide mutation json does not contain full sequences for the root node.")
 
-    # note that we require "aa_muts" to be encoded on the root node, but we actually use the "aa_sequences" key.
-    if "aa_muts" in all_muts.get(root_name, {}):
+    # the presence of a single mutation will imply that the corresponding reference
+    # sequence should be present, and we will warn if it is not
+    nt_present = False
+    genes_present = set([])
+    missing = []
+    for d in all_muts.values():
+        if "muts" in d:
+            nt_present = True
+        genes_present.update(d.get('aa_muts', {}).keys())
+
+    if nt_present:
         try:
-            ref.update({gene:list(seq) for gene, seq in all_muts[root_name]["aa_sequences"].items()})
-        except:
-            print("WARNING in augur.clades: amino acid mutation json does not contain full sequences for the root node.")
+            ref['nuc'] = list(all_muts.get(root_name, {})["sequence"])
+        except KeyError:
+            missing.append("nuc")
+
+    for gene in genes_present:
+        try:
+            ref[gene] = list(all_muts.get(root_name, {}).get("aa_sequences", {})[gene])
+        except KeyError:
+            missing.append(gene)
+
+    if missing:            
+        print(f"WARNING in augur.clades: sequences at the root node have not been specified for {{{', '.join(missing)}}}, \
+even though mutations were observed. Clades which are annotated using bases/codons present at the root \
+of the tree may not be correctly inferred.")
 
     return ref
 

From a356a9edacf7b6c143276937d2f7ad10a7f01996 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Tue, 11 Apr 2023 19:55:40 +1200
Subject: [PATCH 09/12] [clades] catch error where pos is beyond ref length

We could check all of these up-front instead of exiting upon the first
error, and such a check should be part of validation within augur
clades, but this commit is a simple solution to fix a reported bug.

Closes #965
---
 augur/clades.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/augur/clades.py b/augur/clades.py
index 9bc4898a3..78102c1c6 100644
--- a/augur/clades.py
+++ b/augur/clades.py
@@ -153,7 +153,11 @@ def is_node_in_clade(clade_alleles, node, root_sequence):
         if gene in node.sequences and pos in node.sequences[gene]:
             state = node.sequences[gene][pos]
         elif root_sequence and gene in root_sequence:
-            state = root_sequence[gene][pos]
+            try:
+                state = root_sequence[gene][pos]
+            except IndexError:
+                raise AugurError(f"A clade definition specifies {{{gene},{pos+1},{clade_state}}} which \
+is beyond the bounds of the supplied root sequence for {gene} (length {len(root_sequence[gene])})")
         else:
             state = ''
 

From 2c6b66248eb398c1f61655c1e9efca00ef70d16e Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Tue, 11 Apr 2023 20:12:18 +1200
Subject: [PATCH 10/12] [clades] require required arguments

Closes #1153
---
 augur/clades.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/augur/clades.py b/augur/clades.py
index 78102c1c6..90e719958 100644
--- a/augur/clades.py
+++ b/augur/clades.py
@@ -303,11 +303,11 @@ def parse_nodes(tree_file, node_data_files):
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("clades", help=__doc__)
-    parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided")
-    parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
+    parser.add_argument('--tree', required=True, help="prebuilt Newick -- no tree will be built if provided")
+    parser.add_argument('--mutations', required=True, metavar="NODE_DATA_JSON", nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
     parser.add_argument('--reference', nargs='+', help=SUPPRESS)
-    parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
-    parser.add_argument('--output-node-data', type=str, help='name of JSON file to save clade assignments to')
+    parser.add_argument('--clades', required=True, metavar="TSV", type=str, help='TSV file containing clade definitions by amino-acid')
+    parser.add_argument('--output-node-data', type=str,  metavar="NODE_DATA_JSON", help='name of JSON file to save clade assignments to')
     parser.add_argument('--membership-name', type=str, default="clade_membership", help='Key to store clade membership under; use "None" to not export this')
     parser.add_argument('--label-name', type=str, default="clade", help='Key to store clade labels under; use "None" to not export this')
     return parser

From 40e549d84f75f5a498d128ab913e122c9fa6a492 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Tue, 11 Apr 2023 20:25:57 +1200
Subject: [PATCH 11/12] [clades] warnings for unfound clades

A fatal error is raised if no clades are defined, but if a clade is not
found on the tree it's only a warning.
Suggested in #735
---
 augur/clades.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/augur/clades.py b/augur/clades.py
index 90e719958..30c43e8b7 100644
--- a/augur/clades.py
+++ b/augur/clades.py
@@ -22,6 +22,8 @@
 from argparse import SUPPRESS
 from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name
 
+UNASSIGNED = 'unassigned'
+
 def read_in_clade_definitions(clade_file):
     '''
     Reads in tab-seperated file that defines clades by amino acid or nucleotide mutations
@@ -122,6 +124,9 @@ def read_in_clade_definitions(clade_file):
         if clade != root
     }
 
+    if not len(clades.keys()):
+        raise AugurError(f"No clades were defined in {clade_file}")
+
     return clades
 
 
@@ -197,7 +202,7 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
 
     # first pass to set all nodes to unassigned as precaution to ensure attribute is set
     for node in tree.find_clades(order = 'preorder'):
-        clade_membership[node.name] = 'unassigned'
+        clade_membership[node.name] = UNASSIGNED
 
     # count leaves
     for node in tree.find_clades(order = 'postorder'):
@@ -250,6 +255,16 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
 
     return (clade_membership, clade_labels)
 
+def warn_if_clades_not_found(membership, clade_designations):
+    clades = set(clade_designations.keys())
+    found = set([clade for clade in membership.values() if clade!=UNASSIGNED])
+    if not(len(found)):
+        print(f"WARNING in augur.clades: no clades found in tree!")
+        return
+    for clade in clades-found:
+        # warn loudly - one line per unfound clade
+        print(f"WARNING in augur.clades: clade '{clade}' not found in tree!")
+
 
 def get_reference_sequence_from_root_node(all_muts, root_name):
     """
@@ -330,6 +345,7 @@ def run(args):
     clade_designations = read_in_clade_definitions(args.clades)
 
     membership, labels = assign_clades(clade_designations, all_muts, tree, ref)
+    warn_if_clades_not_found(membership, clade_designations)
 
     membership_key= args.membership_name if args.membership_name.upper() != "NONE" else None
     label_key= args.label_name if args.label_name.upper() != "NONE" else None

From e5cfc3a4a0e71d7decd8859e908a2c80f6f9a909 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Tue, 11 Apr 2023 20:56:18 +1200
Subject: [PATCH 12/12] [clades] check for multiple mutations at same pos

Multiple mutations at the same position on a single branch are now a
fatal error. Previous behaviour was to overwrite such mutations when
parsing. Suggested by #735.
---
 augur/clades.py                                | 15 +++++++++++++++
 tests/functional/clades.t                      | 11 ++++++++++-
 tests/functional/clades/toy_muts_multiple.json | 15 +++++++++++++++
 3 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 tests/functional/clades/toy_muts_multiple.json

diff --git a/augur/clades.py b/augur/clades.py
index 30c43e8b7..ab8614d09 100644
--- a/augur/clades.py
+++ b/augur/clades.py
@@ -170,6 +170,20 @@ def is_node_in_clade(clade_alleles, node, root_sequence):
 
     return all(conditions)
 
+def ensure_no_multiple_mutations(all_muts):
+    multiples = []
+
+    for name,node in all_muts.items():
+        nt_positions = [int(mut[1:-1])-1  for mut in node.get('muts', [])]
+        if len(set(nt_positions))!=len(nt_positions):
+            multiples.append(f"Node {name} (nuc)")
+        for gene in node.get('aa_muts', {}):
+            aa_positions = [int(mut[1:-1])-1 for mut in node['aa_muts'][gene]]
+            if len(set(aa_positions))!=len(aa_positions):
+                multiples.append(f"Node {name} ({gene})")
+    
+    if multiples:
+        raise AugurError(f"Multiple mutations at the same position on a single branch were found: {', '.join(multiples)}")
 
 def assign_clades(clade_designations, all_muts, tree, ref=None):
     '''
@@ -314,6 +328,7 @@ def parse_nodes(tree_file, node_data_files):
     tree_nodes = set([clade.name for clade in tree.find_clades()])
     if not json_nodes.issubset(tree_nodes):
         raise AugurError(f"The following nodes in the node_data files ({', '.join(node_data_files)}) are not found in the tree ({tree_file}): {', '.join(json_nodes - tree_nodes)}")
+    ensure_no_multiple_mutations(node_data['nodes'])
     return (tree, node_data['nodes'])
 
 def register_parser(parent_subparsers):
diff --git a/tests/functional/clades.t b/tests/functional/clades.t
index 2b4bd759c..0dca3dc13 100644
--- a/tests/functional/clades.t
+++ b/tests/functional/clades.t
@@ -82,4 +82,13 @@ if the (branch leading to the) root has the clade-defining mutation.
 
   $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_2.json "$TMP/toy_clades_2b.json" \
   >   --exclude-paths "root['generated_by']"
-  {}
\ No newline at end of file
+  {}
+
+Multiple mutations at the same position on a single branch are a fatal error
+
+  $ ${AUGUR} clades \
+  >   --tree clades/toy_tree.nwk \
+  >   --mutations clades/toy_muts_multiple.json \
+  >   --clades clades/toy_clades_nuc.tsv
+  ERROR: Multiple mutations at the same position on a single branch were found: Node A (nuc), Node AB (geneName)
+  [2]
\ No newline at end of file
diff --git a/tests/functional/clades/toy_muts_multiple.json b/tests/functional/clades/toy_muts_multiple.json
new file mode 100644
index 000000000..0688cee89
--- /dev/null
+++ b/tests/functional/clades/toy_muts_multiple.json
@@ -0,0 +1,15 @@
+{
+    "nodes": {
+      "A": {
+        "muts": ["A10T", "T10C"]
+      },
+      "AB": {
+        "aa_muts": {
+          "geneName": ["S42L", "R42H", "Y50W"]
+        }
+      },
+      "B": {
+        "muts": ["A10T", "T11C"]
+      }
+    }
+  }
\ No newline at end of file