Skip to content

Commit 8a0c06e

Browse files
committed
augur clades allows attribute name to be specified
Previously the `augur clades` command produced a node-data JSON which stored clade membership as the node-attr "clade_membership" and defined the basal nodes of each clade with the node-attr "clade_annotation". `augur export v2` interpreted the latter as a special-case and turned it into a branch label of the same name. The previous commit allowed `augur export` to be supplied node-data JSONs with a `branch_labels` structure. Here we update `augur clades` to export data in this structure, which allows the user to specify the keys to use via the `--attribute-name` arg. This commit breaks backwards compatibility for pipelines as the default attribute name is "clade". This will result in dataset (auspice) JSONs with the same branch labelling as before, but with a different node-attr (was "clade_membership", now "clade"). As `augur export v2` will make colorings for all node-attrs in node-data JSONs, this will be exported as a "clade" coloring with no changes needed, however auspice config JSONs may now refer to a non-existent "clade_membership" key. `augur export v2` has been updated to no longer special-case `clade_membership` or `clade_annotation` node attrs. We print a warning if an auspice config JSON refers to `clade_membership` to help users update their configs. Functional tests for `augur clades` have been added. Closes #720
1 parent 997095f commit 8a0c06e

8 files changed

+183
-46
lines changed

augur/clades.py

+22-15
Original file line numberDiff line numberDiff line change
@@ -101,13 +101,12 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
101101
mapping of node to clades
102102
'''
103103

104-
clade_membership = {}
104+
# We add the names of nodes to these dictionaries to store which clade they belong to.
105+
# all nodes in a clade should appear in `clade_membership` while only one node should
106+
# appear in `basal_clade_nodes`
107+
(clade_membership, basal_clade_nodes) = ({}, {})
105108
parents = get_parent_name_by_child_name_for_tree(tree)
106109

107-
# first pass to set all nodes to unassigned as precaution to ensure attribute is set
108-
for node in tree.find_clades(order = 'preorder'):
109-
clade_membership[node.name] = {'clade_membership': 'unassigned'}
110-
111110
# count leaves
112111
for node in tree.find_clades(order = 'postorder'):
113112
node.leaf_count = 1 if node.is_terminal() else np.sum([c.leaf_count for c in node])
@@ -136,7 +135,7 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
136135
node.sequences[gene][pos] = d
137136

138137

139-
# second pass to assign 'clade_annotation' to basal nodes within each clade
138+
# store names of basal nodes of each clade in `basal_clade_nodes` and `clade_membership` dicts.
140139
# if multiple nodes match, assign annotation to largest
141140
# otherwise occasional unwanted cousin nodes get assigned the annotation
142141
for clade_name, clade_alleles in clade_designations.items():
@@ -147,16 +146,17 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
147146
sorted_nodes = sorted(node_counts, key=lambda x: x.leaf_count, reverse=True)
148147
if len(sorted_nodes) > 0:
149148
target_node = sorted_nodes[0]
150-
clade_membership[target_node.name] = {'clade_annotation': clade_name, 'clade_membership': clade_name}
149+
basal_clade_nodes[target_node.name] = clade_name
150+
clade_membership[target_node.name] = clade_name # basal nodes are members of the clade
151151

152-
# third pass to propagate 'clade_membership'
152+
# propagate 'clade_membership' to children nodes
153153
# don't propagate if encountering 'clade_annotation'
154154
for node in tree.find_clades(order = 'preorder'):
155155
for child in node:
156-
if 'clade_annotation' not in clade_membership[child.name]:
157-
clade_membership[child.name]['clade_membership'] = clade_membership[node.name]['clade_membership']
158-
159-
return clade_membership
156+
# if the child doesn't define the start of its own clade, but the parent belongs to a clade, then inherit that membership
157+
if child.name not in basal_clade_nodes and node.name in clade_membership:
158+
clade_membership[child.name] = clade_membership[node.name]
159+
return (basal_clade_nodes, clade_membership)
160160

161161

162162
def get_reference_sequence_from_root_node(all_muts, root_name):
@@ -181,6 +181,7 @@ def register_arguments(parser):
181181
parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
182182
parser.add_argument('--reference', nargs='+', help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ')
183183
parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
184+
parser.add_argument('--attribute-name', type=str, default="clade", help="name to use for clade membership & branch labels", required=False)
184185
parser.add_argument('--output-node-data', type=str, help='name of JSON file to save clade assignments to')
185186

186187

@@ -205,8 +206,14 @@ def run(args):
205206

206207
clade_designations = read_in_clade_definitions(args.clades)
207208

208-
clade_membership = assign_clades(clade_designations, all_muts, tree, ref)
209+
(basal_clade_nodes, clade_membership) = assign_clades(clade_designations, all_muts, tree, ref)
210+
211+
# create node_data for export as a JSON
212+
node_data = {
213+
'nodes': {node: {args.attribute_name: clade} for node,clade in clade_membership.items()},
214+
'branch_labels': {node: {args.attribute_name: clade} for node,clade in basal_clade_nodes.items()}
215+
}
209216

210217
out_name = get_json_name(args)
211-
write_json({'nodes': clade_membership}, out_name)
212-
print("clades written to", out_name, file=sys.stdout)
218+
write_json(node_data, out_name)
219+
print(f"clades written to {out_name} using attribute name {args.attribute_name}", file=sys.stdout)

augur/export_v2.py

+15-31
Original file line numberDiff line numberDiff line change
@@ -91,17 +91,9 @@ def are_mutations_defined(node_attrs):
9191
return True
9292
return False
9393

94-
95-
def are_clades_defined(node_attrs):
94+
def is_node_attr_defined(node_attrs, attr_name):
9695
for node, data in node_attrs.items():
97-
if data.get("clade_membership") or data.get("clade_annotation"):
98-
return True
99-
return False
100-
101-
102-
def are_dates_defined(node_attrs):
103-
for node, data in node_attrs.items():
104-
if data.get("num_date"):
96+
if data.get(attr_name):
10597
return True
10698
return False
10799

@@ -162,7 +154,7 @@ def set_colorings(data_json, config, command_line_colorings, metadata_names, nod
162154
def _get_type(key, trait_values):
163155
# for some keys we know what the type must be
164156
known_types = {
165-
"clade_membership": "categorical",
157+
"clade": "categorical",
166158
"gt": "categorical",
167159
"author": "categorical",
168160
"num_date": "continuous"
@@ -199,7 +191,7 @@ def _get_title(key):
199191
return config_title
200192

201193
# hardcoded fallbacks:
202-
if key == "clade_membership":
194+
if key == "clade":
203195
return "Clade"
204196
if key == "gt":
205197
return "Genotype"
@@ -251,6 +243,12 @@ def _is_valid(coloring):
251243
if key == "gt" and not are_mutations_defined(node_attrs):
252244
warn("[colorings] You asked for mutations (\"gt\"), but none are defined on the tree. They cannot be used as a coloring.")
253245
return False
246+
if key == "clade_membership" and not trait_values:
247+
# augur 12 & below defined clades via the key "clade_membership", not "clade".
248+
# If an auspice_config file specifies this, and it is not present in any node-data, we print a warning.
249+
# (Note that if "clade" is present in node-data, we automatically include it as a colouring.)
250+
warn("You asked for a color-by for 'clade_membership' but this is now called 'clade'. You should update your auspice config file.")
251+
return False
254252
if key != "gt" and not trait_values:
255253
warn("You asked for a color-by for trait '{}', but it has no values on the tree. It has been ignored.".format(key))
256254
return False
@@ -289,11 +287,10 @@ def _get_colorings():
289287
# add in genotype as a special case if (a) not already set and (b) the data supports it
290288
if "gt" not in explicitly_defined_colorings and are_mutations_defined(node_attrs):
291289
colorings.insert(0,{'key':'gt'})
292-
if "num_date" not in explicitly_defined_colorings and are_dates_defined(node_attrs):
290+
if "num_date" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "num_date"):
293291
colorings.insert(0,{'key':'num_date'})
294-
if "clade_membership" not in explicitly_defined_colorings and are_clades_defined(node_attrs):
295-
colorings.insert(0,{'key':'clade_membership'})
296-
292+
if "clade" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "clade"):
293+
colorings.insert(0,{'key':'clade'})
297294
return colorings
298295

299296

@@ -650,8 +647,6 @@ def node_data_prop_is_normal_trait(name):
650647
# those traits / keys / attrs which are not "special" and can be exported
651648
# as normal attributes on nodes
652649
excluded = [
653-
"clade_annotation", # Clade annotation is label, not colorby!
654-
"clade_membership", # will be auto-detected if it is available
655650
"authors", # authors are set as a node property, not a trait property
656651
"author", # see above
657652
"vaccine", # vaccine info is stored as a "special" node prop
@@ -850,16 +845,6 @@ def transfer_mutations_to_branches(node_attrs, branch_attrs):
850845
else:
851846
branch_attrs[node_name]["labels"] = { "aa": aa_lab }
852847

853-
def transfer_clade_annotation_to_branches(node_attrs, branch_attrs):
854-
for node_name, raw_data in node_attrs.items():
855-
if "clade_annotation" in raw_data and is_valid(raw_data["clade_annotation"]):
856-
if node_name not in branch_attrs:
857-
branch_attrs[node_name] = {}
858-
if 'labels' in branch_attrs[node_name]:
859-
branch_attrs[node_name]["labels"]['clade'] = raw_data["clade_annotation"]
860-
else:
861-
branch_attrs[node_name]["labels"] = { "clade": raw_data["clade_annotation"] }
862-
863848
def transfer_branch_labels_to_branch_attrs(branch_labels, branch_attrs):
864849
for node_name, branch_labels in branch_labels.items():
865850
if node_name not in branch_attrs:
@@ -899,12 +884,11 @@ def parse_node_data_and_metadata(T, node_data_files, metadata_file):
899884
node_attrs[name][corrected_key] = value
900885
node_data_names.add(corrected_key)
901886

902-
# third pass: create `branch_attrs` which includes certain traits supplied in `node_attrs`
903-
# (e.g. mutations are coverted to branch attrs, and `clade_annotation` is interpreted as a label)
887+
# third pass: create `branch_attrs` which includes a few special-case traits from `node_attrs`
888+
# (e.g. mutations are converted from node attrs to branch attrs)
904889
# as well as any branch labels supplied in node-data files.
905890
branch_attrs = {}
906891
transfer_mutations_to_branches(node_attrs, branch_attrs)
907-
transfer_clade_annotation_to_branches(node_attrs, branch_attrs)
908892
transfer_branch_labels_to_branch_attrs(node_data.get('branch_labels', {}), branch_attrs)
909893

910894
return (node_data, node_attrs, node_data_names, metadata_names, branch_attrs)

tests/functional/clades.t

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
Integration tests for augur clades.
2+
3+
$ pushd "$TESTDIR" > /dev/null
4+
$ export AUGUR="../../bin/augur"
5+
6+
Run augur clades without --attribute-name. We expect the name to be "clade"
7+
8+
$ ${AUGUR} clades \
9+
> --tree clades/tree.nwk \
10+
> --clades clades/clades.tsv \
11+
> --mutations clades/nt_muts.json \
12+
> --output-node-data "$TMP/default.json" > /dev/null
13+
14+
$ python3 "$TESTDIR/../../scripts/diff_jsons.py" "clades/expected-output-default.json" "$TMP/default.json"
15+
{}
16+
17+
Run augur clades with a custom --attribute-name
18+
19+
$ ${AUGUR} clades \
20+
> --tree clades/tree.nwk \
21+
> --clades clades/clades.tsv \
22+
> --mutations clades/nt_muts.json \
23+
> --attribute-name custom \
24+
> --output-node-data "$TMP/custom-attr.json" > /dev/null
25+
26+
$ python3 "$TESTDIR/../../scripts/diff_jsons.py" "clades/expected-output-custom-attr.json" "$TMP/custom-attr.json"
27+
{}
28+
29+
# Ensure the only change between runs of `augur clades` is the attr name used
30+
$ cat "$TMP/default.json" | sed "s/clade/custom/" > "$TMP/default-now-custom.json"
31+
$ diff -u "$TMP/default-now-custom.json" "$TMP/custom-attr.json"
32+
33+
Cleanup
34+
$ rm -f "$TMP/default.json" "$TMP/custom-attr.json" "$TMP/default-now-custom.json"

tests/functional/clades/clades.tsv

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
clade gene site alt
2+
3+
# the 1b mutation appears only once, on the branch leading to tips B and C
4+
# thus we expect the clade label to be on node `internalBC`
5+
cladeCB nuc 1 B
6+
# the 2c mutation appears twice -- on branch `internalBC` and `internalDEF`
7+
# as the latter has 3 descendants, it is chosen over the former
8+
cladeDEF nuc 2 C
9+
# mutation 3e appears only on a terminal node (tipE)
10+
# but we still expect both a branch label and a node_attr
11+
# this means that tipE should be annotated "cladeE" and _not_ "cladeDEF"
12+
cladeE nuc 3 E
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{
2+
"branch_labels": {
3+
"internalBC": {
4+
"custom": "cladeCB"
5+
},
6+
"internalDEF": {
7+
"custom": "cladeDEF"
8+
},
9+
"tipE": {
10+
"custom": "cladeE"
11+
}
12+
},
13+
"generated_by": {
14+
"program": "augur",
15+
"version": "12.0.0"
16+
},
17+
"nodes": {
18+
"internalBC": {
19+
"custom": "cladeCB"
20+
},
21+
"internalDEF": {
22+
"custom": "cladeDEF"
23+
},
24+
"tipB": {
25+
"custom": "cladeCB"
26+
},
27+
"tipC": {
28+
"custom": "cladeCB"
29+
},
30+
"tipD": {
31+
"custom": "cladeDEF"
32+
},
33+
"tipE": {
34+
"custom": "cladeE"
35+
},
36+
"tipF": {
37+
"custom": "cladeDEF"
38+
}
39+
}
40+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{
2+
"branch_labels": {
3+
"internalBC": {
4+
"clade": "cladeCB"
5+
},
6+
"internalDEF": {
7+
"clade": "cladeDEF"
8+
},
9+
"tipE": {
10+
"clade": "cladeE"
11+
}
12+
},
13+
"generated_by": {
14+
"program": "augur",
15+
"version": "12.0.0"
16+
},
17+
"nodes": {
18+
"internalBC": {
19+
"clade": "cladeCB"
20+
},
21+
"internalDEF": {
22+
"clade": "cladeDEF"
23+
},
24+
"tipB": {
25+
"clade": "cladeCB"
26+
},
27+
"tipC": {
28+
"clade": "cladeCB"
29+
},
30+
"tipD": {
31+
"clade": "cladeDEF"
32+
},
33+
"tipE": {
34+
"clade": "cladeE"
35+
},
36+
"tipF": {
37+
"clade": "cladeDEF"
38+
}
39+
}
40+
}

tests/functional/clades/nt_muts.json

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"nodes": {
3+
"tipA": {"muts": [], "aa_muts": {}},
4+
"tipB": {"muts": [], "aa_muts": {}},
5+
"tipC": {"muts": [], "aa_muts": {}},
6+
"tipD": {"muts": [], "aa_muts": {}},
7+
"tipE": {"muts": ["A3E"], "aa_muts": {}},
8+
"tipF": {"muts": [], "aa_muts": {}},
9+
"internalBC": {
10+
"muts": ["A1B", "A2C"],
11+
"aa_muts": {}
12+
},
13+
"internalDEF": {
14+
"muts": ["A2C"],
15+
"aa_muts": {}
16+
},
17+
"ROOT":{"muts": [], "aa_muts": {}}
18+
}
19+
}

tests/functional/clades/tree.nwk

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
(tipA:1,(tipB:1,tipC:1)internalBC:2,(tipD:3,tipE:4,tipF:1)internalDEF:5)ROOT:1;

0 commit comments

Comments
 (0)