From 128aad84b5a409e6f155abe8b7f9a9088d3121c1 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Tue, 23 Apr 2024 22:24:12 +1200 Subject: [PATCH 1/3] [export] enforce tree naming standards Checking for duplicated node names and missing node names is in line with the schema. Previously some calls to `export v2` would be ok with missing node names (e.g. see the updated tests in `minify-output.t`) but any usage with metadata would result in an uncaught error. --- augur/export_v2.py | 28 +++++++++++++++---- .../export_v2/cram/duplicate-node-names.t | 10 +++++++ .../functional/export_v2/cram/minify-output.t | 25 +++++++++-------- .../export_v2/cram/missing-internal-nodes.t | 18 ++++++++++++ 4 files changed, 65 insertions(+), 16 deletions(-) create mode 100644 tests/functional/export_v2/cram/duplicate-node-names.t create mode 100644 tests/functional/export_v2/cram/missing-internal-nodes.t diff --git a/augur/export_v2.py b/augur/export_v2.py index af563bf50..f5aa95890 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -117,6 +117,24 @@ def order_nodes(node): return od +def read_tree(fname): + tree = Phylo.read(fname, 'newick') + # augur export requires unique node names (both terminal and external) as these + # are used to associate metadata/node-data with nodes. Any duplication is fatal. + # The exception to this is unlabelled node names, which auspice will handle but + # won't be associated with any metadata within export. + node_names = [clade.name for clade in tree.root.find_clades()] + if None in node_names: + raise AugurError(f"Tree contains unnamed nodes. If these are internal nodes you may wish to run "+ + "`augur refine --tree --output-tree ` to name them.") + if len(set(node_names))!=len(node_names): + from collections import Counter + dups = [name for name, count in Counter(node_names).items() if count>1] + raise AugurError(f"{len(dups)} node names occur multiple times in the tree: " + + ", ".join([f"'{v}'" for v in dups[0:5]]) + ("..." if len(dups)>5 else "")) + return (tree, node_names) + + def node_div(T, node_attrs): """ Scans the provided tree & metadata to see if divergence is defined, and if so returns @@ -1059,12 +1077,12 @@ def create_branch_labels(branch_attrs, node_data, branch_data): continue branch_attrs[node_name]["labels"][label_key] = label_value -def parse_node_data_and_metadata(T, node_data, metadata): +def parse_node_data_and_metadata(node_names, node_data, metadata): node_data_names = set() metadata_names = set() # assign everything to node_attrs, exclusions considered later - node_attrs = {clade.name: {} for clade in T.root.find_clades()} + node_attrs = {name: {} for name in node_names} # first pass: metadata for metadata_id, node in metadata.items(): @@ -1088,7 +1106,7 @@ def parse_node_data_and_metadata(T, node_data, metadata): # third pass: create `branch_attrs`. The data comes from # (a) some keys within `node_data['nodes']` (for legacy reasons) # (b) the `node_data['branches']` dictionary, which currently only defines labels - branch_attrs = {clade.name: defaultdict(dict) for clade in T.root.find_clades()} + branch_attrs = {name: defaultdict(dict) for name in node_names} create_branch_mutations(branch_attrs, node_data) create_branch_labels(branch_attrs, node_data['nodes'], node_data.get('branches', {})) @@ -1173,9 +1191,9 @@ def run(args): metadata_file = {} # parse input files - T = Phylo.read(args.tree, 'newick') + (T, node_names) = read_tree(args.tree) node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \ - parse_node_data_and_metadata(T, node_data_file, metadata_file) + parse_node_data_and_metadata(node_names, node_data_file, metadata_file) config = get_config(args) additional_metadata_columns = get_additional_metadata_columns(config, args.metadata_columns, metadata_names) diff --git a/tests/functional/export_v2/cram/duplicate-node-names.t b/tests/functional/export_v2/cram/duplicate-node-names.t new file mode 100644 index 000000000..2a8c8ca77 --- /dev/null +++ b/tests/functional/export_v2/cram/duplicate-node-names.t @@ -0,0 +1,10 @@ + $ source "$TESTDIR"/_setup.sh + + $ echo "(tipA:1,(tipA:1,tipA:1)internalBC:2,(tipD:3,tipE:4,tipF:1)internalDEF:5)ROOT:0;" > tree1.nwk + + $ ${AUGUR} export v2 \ + > --tree tree1.nwk \ + > --output minimal.json + ERROR: 1 node names occur multiple times in the tree: 'tipA' + [2] + diff --git a/tests/functional/export_v2/cram/minify-output.t b/tests/functional/export_v2/cram/minify-output.t index 68a573369..a9035a9a0 100644 --- a/tests/functional/export_v2/cram/minify-output.t +++ b/tests/functional/export_v2/cram/minify-output.t @@ -16,13 +16,16 @@ names to create larger file sizes with less recursion. > } A small tree is not automatically minified. -The unminified output is 13KB which is considered acceptable. +The unminified output is 15KB which is considered acceptable. + + $ echo "$(generate_newick 10);" > small_tree_raw.nwk + +Use augur refine to add internal node names + $ ${AUGUR} refine --tree small_tree_raw.nwk --output-tree small_tree.nwk &>/dev/null - $ echo "$(generate_newick 10);" > small_tree.nwk $ ${AUGUR} export v2 \ > --tree small_tree.nwk \ - > --skip-validation \ > --output output.json &>/dev/null $ head -c 20 output.json @@ -30,13 +33,12 @@ The unminified output is 13KB which is considered acceptable. "version": "v2", (no-eol) $ ls -l output.json | awk '{print $5}' - 13813 + 15039 It can be forcefully minified with an argument. $ ${AUGUR} export v2 \ > --tree small_tree.nwk \ - > --skip-validation \ > --minify-json \ > --output output.json &>/dev/null @@ -48,21 +50,22 @@ even if it may seem "falsey". $ AUGUR_MINIFY_JSON=0 ${AUGUR} export v2 \ > --tree small_tree.nwk \ - > --skip-validation \ > --output output.json &>/dev/null $ head -c 20 output.json {"version": "v2", "m (no-eol) -A large tree, when forcefully not minified, has an output size of 6MB which is +A large tree, when forcefully not minified, has an output size of 8MB which is considered large. - $ echo "$(generate_newick 500);" > big_tree.nwk + $ echo "$(generate_newick 500);" > big_tree_raw.nwk + +Use augur refine to add internal node names + $ ${AUGUR} refine --tree big_tree_raw.nwk --output-tree big_tree.nwk &>/dev/null $ ${AUGUR} export v2 \ > --tree big_tree.nwk \ - > --skip-validation \ > --no-minify-json \ > --output output.json &>/dev/null @@ -71,7 +74,7 @@ considered large. "version": "v2", (no-eol) $ ls -l output.json | awk '{print $5}' - 6568454 + 8591420 This means it is automatically minified. @@ -84,4 +87,4 @@ This means it is automatically minified. {"version": "v2", "m (no-eol) $ ls -l output.json | awk '{print $5}' - 561436 + 576414 diff --git a/tests/functional/export_v2/cram/missing-internal-nodes.t b/tests/functional/export_v2/cram/missing-internal-nodes.t new file mode 100644 index 000000000..3732a9c72 --- /dev/null +++ b/tests/functional/export_v2/cram/missing-internal-nodes.t @@ -0,0 +1,18 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Test a tree with unlabelled internal nodes. +In Augur v24.3.0 and earlier these would work _sometimes_ depending +on what code paths were hit, but in most real-life cases would raise +an uncaught error. We now check for these when we parse the tree. + + $ echo "(tipA:1,(tipB:1,tipC:1):2,(tipD:3,tipE:4,tipF:1):5):0;" > tree1.nwk + + $ ${AUGUR} export v2 \ + > --tree tree1.nwk \ + > --metadata "$TESTDIR/../data/dataset1_metadata_with_name.tsv" \ + > --node-data "$TESTDIR/../data/div_node-data.json" \ + > --output test1.json + ERROR: Tree contains unnamed nodes.+ (re) + [2] From 8baf3f8c45e59215401185cb4b9c62a93910cd41 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Wed, 24 Apr 2024 10:29:53 +1200 Subject: [PATCH 2/3] [export] allow multiple trees MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multiple trees ("subtrees") have been available in Auspice since late 2021¹ and part of the associated schema since early 2022². Despite this there was no way to produce such datasets within Augur itself, and despite the schema changes the associated `augur validate` command was never updated to allow them. This commit adds multi-tree inputs to `augur export v2` as well as allowing them to validate with our associated validation commands. ¹ ² --- augur/export_v2.py | 28 +++++++---- augur/validate_export.py | 36 ++++++++++--- tests/functional/export_v2/cram/multi-tree.t | 53 ++++++++++++++++++++ 3 files changed, 100 insertions(+), 17 deletions(-) create mode 100644 tests/functional/export_v2/cram/multi-tree.t diff --git a/augur/export_v2.py b/augur/export_v2.py index f5aa95890..cfb31a53a 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -117,13 +117,13 @@ def order_nodes(node): return od -def read_tree(fname): - tree = Phylo.read(fname, 'newick') +def read_trees(filenames): + trees = [Phylo.read(fname, 'newick') for fname in filenames] # augur export requires unique node names (both terminal and external) as these # are used to associate metadata/node-data with nodes. Any duplication is fatal. # The exception to this is unlabelled node names, which auspice will handle but # won't be associated with any metadata within export. - node_names = [clade.name for clade in tree.root.find_clades()] + node_names = [clade.name for tree in trees for clade in tree.root.find_clades()] if None in node_names: raise AugurError(f"Tree contains unnamed nodes. If these are internal nodes you may wish to run "+ "`augur refine --tree --output-tree ` to name them.") @@ -132,7 +132,7 @@ def read_tree(fname): dups = [name for name, count in Counter(node_names).items() if count>1] raise AugurError(f"{len(dups)} node names occur multiple times in the tree: " + ", ".join([f"'{v}'" for v in dups[0:5]]) + ("..." if len(dups)>5 else "")) - return (tree, node_names) + return (trees, node_names) def node_div(T, node_attrs): @@ -750,7 +750,12 @@ def _recursively_set_data(node): node['branch_attrs'] = branch_attrs[node['name']] for child in node.get("children", []): _recursively_set_data(child) - _recursively_set_data(data_json["tree"]) + + if isinstance(data_json["tree"], list): + for subtree in data_json['tree']: + _recursively_set_data(subtree) + else: + _recursively_set_data(data_json["tree"]) def set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns): @@ -839,7 +844,11 @@ def _recursively_set_data(node): for child in node.get("children", []): _recursively_set_data(child) - _recursively_set_data(data_json["tree"]) + if isinstance(data_json["tree"], list): + for subtree in data_json['tree']: + _recursively_set_data(subtree) + else: + _recursively_set_data(data_json["tree"]) def node_data_prop_is_normal_trait(name): # those traits / keys / attrs which are not "special" and can be exported @@ -893,7 +902,7 @@ def register_parser(parent_subparsers): required = parser.add_argument_group( title="REQUIRED" ) - required.add_argument('--tree','-t', metavar="newick", required=True, help="Phylogenetic tree, usually output from `augur refine`") + required.add_argument('--tree','-t', metavar="newick", nargs='+', required=True, help="Phylogenetic tree(s), usually output from `augur refine`") required.add_argument('--output', metavar="JSON", required=True, help="Output file (typically for visualisation in auspice)") config = parser.add_argument_group( @@ -1191,7 +1200,7 @@ def run(args): metadata_file = {} # parse input files - (T, node_names) = read_tree(args.tree) + (trees, node_names) = read_trees(args.tree) node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \ parse_node_data_and_metadata(node_names, node_data_file, metadata_file) config = get_config(args) @@ -1223,7 +1232,8 @@ def run(args): set_filters(data_json, config) # set tree structure - data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs)) + trees_json = [convert_tree_to_json_structure(tree.root, node_attrs, node_div(tree, node_attrs)) for tree in trees] + data_json["tree"] = trees_json[0] if len(trees_json)==1 else trees_json set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns) set_branch_attrs_on_tree(data_json, branch_attrs) diff --git a/augur/validate_export.py b/augur/validate_export.py index 5cd3f4c4c..80da43bfe 100644 --- a/augur/validate_export.py +++ b/augur/validate_export.py @@ -7,7 +7,7 @@ import sys from collections import defaultdict -def ensure_no_duplicate_names(root, ValidateError): +def ensure_no_duplicate_names(tree, ValidateError): """ Check that all node names are identical, which is required for auspice (v2) JSONs. """ @@ -18,10 +18,14 @@ def recurse(node): names.add(node["name"]) if "children" in node: [recurse(child) for child in node["children"]] - recurse(root) + if isinstance(tree, list): + for subtree in tree: + recurse(subtree) + else: + recurse(tree) -def collectTreeAttrsV2(root, warn): +def collectTreeAttrsV2(tree, warn): """ Collect all keys specified on `node["node_attrs"]` throughout the tree and the values associated with them. Note that this will only look at @@ -47,7 +51,12 @@ def recurse(node): [recurse(child) for child in node["children"]] else: num_terminal += 1 - recurse(root) + + if isinstance(tree, list): + for subtree in tree: + recurse(subtree) + else: + recurse(tree) for data in seen.values(): if data["count"] == num_nodes: @@ -56,7 +65,7 @@ def recurse(node): return(seen, num_terminal) -def collectMutationGenes(root): +def collectMutationGenes(tree): """ Returns a set of all genes specified in the tree in the "aa_muts" objects """ @@ -67,17 +76,28 @@ def recurse(node): genes.update(mutations.keys()) if "children" in node: [recurse(child) for child in node["children"]] - recurse(root) + + if isinstance(tree, list): + for subtree in tree: + recurse(subtree) + else: + recurse(tree) + genes -= {"nuc"} return genes -def collectBranchLabels(root): +def collectBranchLabels(tree): labels = set() def recurse(node): labels.update(node.get("branch_attrs", {}).get("labels", {}).keys()) if "children" in node: [recurse(child) for child in node["children"]] - recurse(root) + + if isinstance(tree, list): + for subtree in tree: + recurse(subtree) + else: + recurse(tree) return labels def verifyMainJSONIsInternallyConsistent(data, ValidateError): diff --git a/tests/functional/export_v2/cram/multi-tree.t b/tests/functional/export_v2/cram/multi-tree.t new file mode 100644 index 000000000..a7a45b948 --- /dev/null +++ b/tests/functional/export_v2/cram/multi-tree.t @@ -0,0 +1,53 @@ + +Setup + + $ source "$TESTDIR"/_setup.sh + +Create a small second tree (which has different names/labels than 'tree.nwk') + $ cat > tree2.nwk <<~~ + > (tipG:1,(tipH:1,tipI:1)internalHI:2)SECOND_ROOT:0; + > ~~ + +Minimal export -- no node data, no metadata etc etc + $ ${AUGUR} export v2 \ + > --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \ + > --output minimal.json &> /dev/null + +More realistic export - with node_data for all nodes and metadata for some of them + + $ cat > metadata.tsv <<~~ + > strain something + > tipA foo + > tipB foo + > tipC foo + > tipG bar + > tipH bar + > ~~ + + + $ cat > node-data.json <<~~ + > { + > "nodes": { + > "ROOT": {"mutation_length": 0}, + > "tipA": {"mutation_length": 1}, + > "internalBC": {"mutation_length": 2}, + > "tipB": {"mutation_length": 1}, + > "tipC": {"mutation_length": 1}, + > "internalDEF": {"mutation_length": 5}, + > "tipD": {"mutation_length": 3}, + > "tipE": {"mutation_length": 4}, + > "tipF": {"mutation_length": 1}, + > "SECOND_ROOT": {"mutation_length": 0}, + > "tipG": {"mutation_length": 1}, + > "internalHI": {"mutation_length": 2}, + > "tipH": {"mutation_length": 1}, + > "tipI": {"mutation_length": 1} + > } + > } + > ~~ + + $ ${AUGUR} export v2 \ + > --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \ + > --metadata metadata.tsv --color-by-metadata something \ + > --node-data node-data.json \ + > --output output.json &> /dev/null From 4b6944e3288e89519563ac0bc93ed5c1b41a8aee Mon Sep 17 00:00:00 2001 From: james hadfield Date: Wed, 24 Apr 2024 14:34:03 +1200 Subject: [PATCH 3/3] WIP [refine] unique internal node names Needed for pipelines which will produce multiple trees via `augur refine` and then supply these trees to `augur export v2` --- augur/refine.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/augur/refine.py b/augur/refine.py index 95717861f..348898377 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -143,6 +143,7 @@ def register_parser(parent_subparsers): default='mutations-per-site', help='Units in which sequence divergences is exported.') parser.add_argument('--seed', type=int, help='seed for random number generation') parser.add_argument('--verbosity', type=int, default=1, help='treetime verbosity, between 0 and 6 (higher values more output)') + parser.add_argument('--content-aware-internal-node-names', action="store_true") parser.set_defaults(covariance=True) return parser @@ -342,6 +343,21 @@ def are_sequence_states_different(nuc1, nuc2): print("ERROR: divergence unit",args.divergence_units,"not supported!", file=sys.stderr) return 1 + if args.content_aware_internal_node_names: + terminals = [n.name for n in T.find_clades() if n.is_terminal()] + terminals.sort() + internals = {n.name for n in T.find_clades() if not n.is_terminal()} + assert all(n.startswith('NODE_') for n in internals) + # compute hash similar to git short commit hash + import hashlib + id = hashlib.sha256("".join(terminals).encode('utf-8')).hexdigest()[0:7] + def rename(name): + if name not in internals: return name + return f"NODE_{id}_{name.split('_')[1]}" + for n in T.find_clades(): + n.name = rename(n.name) + node_data['nodes'] = {rename(name):data for name,data in node_data['nodes'].items()} + # Export refined tree and node data tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f', branch_length_only=True) print("updated tree written to",tree_fname, file=sys.stdout)