Skip to content

Commit f7cc5ab

Browse files
committed
augur clades allows attribute name to be specified
Previously the `augur clades` command produced a node-data JSON which stored clade membership as the node-attr "clade_membership" and defined the basal nodes of each clade with the node-attr "clade_annotation". `augur export v2` interpreted the latter as a special case and turned it into a branch label of the same name. The previous commit allowed `augur export` to be supplied node-data JSONs with a `branches` dictionary. Here we update `augur clades` to export data in this structure, which allows the user to specify the keys to use via the `--attribute-name` arg. This commit breaks backwards compatibility for pipelines as the default attribute name is "clade". This will result in dataset (auspice) JSONs with the same branch labelling as before, but with a different node-attr (was "clade_membership", now "clade"). As `augur export v2` will make colorings for all node-attrs in node-data JSONs, this will be exported as a "clade" coloring with no changes needed; however, auspice config JSONs may now refer to a non-existent "clade_membership" key. `augur export v2` has been updated to no longer special-case `clade_membership` or `clade_annotation` node attrs. We print a warning if an auspice config JSON refers to `clade_membership` to help users update their configs. Functional tests for `augur clades` have been added. Closes #720
1 parent b80260c commit f7cc5ab

8 files changed

+195
-46
lines changed

augur/clades.py

+22-15
Original file line numberDiff line numberDiff line change
@@ -101,13 +101,12 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
101101
mapping of node to clades
102102
'''
103103

104-
clade_membership = {}
104+
# We use the following dictionaries to store which clade each node belongs to.
105+
# All nodes in a clade should appear in `clade_membership` while only one node should
106+
# appear in `basal_clade_nodes`
107+
(clade_membership, basal_clade_nodes) = ({}, {})
105108
parents = get_parent_name_by_child_name_for_tree(tree)
106109

107-
# first pass to set all nodes to unassigned as precaution to ensure attribute is set
108-
for node in tree.find_clades(order = 'preorder'):
109-
clade_membership[node.name] = {'clade_membership': 'unassigned'}
110-
111110
# count leaves
112111
for node in tree.find_clades(order = 'postorder'):
113112
node.leaf_count = 1 if node.is_terminal() else np.sum([c.leaf_count for c in node])
@@ -136,7 +135,7 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
136135
node.sequences[gene][pos] = d
137136

138137

139-
# second pass to assign 'clade_annotation' to basal nodes within each clade
138+
# store names of basal nodes of each clade in `basal_clade_nodes` and `clade_membership` dicts.
140139
# if multiple nodes match, assign annotation to largest
141140
# otherwise occasional unwanted cousin nodes get assigned the annotation
142141
for clade_name, clade_alleles in clade_designations.items():
@@ -147,16 +146,17 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
147146
sorted_nodes = sorted(node_counts, key=lambda x: x.leaf_count, reverse=True)
148147
if len(sorted_nodes) > 0:
149148
target_node = sorted_nodes[0]
150-
clade_membership[target_node.name] = {'clade_annotation': clade_name, 'clade_membership': clade_name}
149+
basal_clade_nodes[target_node.name] = clade_name
150+
clade_membership[target_node.name] = clade_name # basal nodes are members of the clade
151151

152-
# third pass to propagate 'clade_membership'
152+
# propagate 'clade_membership' to children nodes
153153
# don't propagate to a child that is itself the basal node of a clade (i.e. is in `basal_clade_nodes`)
154154
for node in tree.find_clades(order = 'preorder'):
155155
for child in node:
156-
if 'clade_annotation' not in clade_membership[child.name]:
157-
clade_membership[child.name]['clade_membership'] = clade_membership[node.name]['clade_membership']
158-
159-
return clade_membership
156+
# if the child doesn't define the start of its own clade, but the parent belongs to a clade, then inherit that membership
157+
if child.name not in basal_clade_nodes and node.name in clade_membership:
158+
clade_membership[child.name] = clade_membership[node.name]
159+
return (basal_clade_nodes, clade_membership)
160160

161161

162162
def get_reference_sequence_from_root_node(all_muts, root_name):
@@ -181,6 +181,7 @@ def register_arguments(parser):
181181
parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
182182
parser.add_argument('--reference', nargs='+', help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ')
183183
parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
184+
parser.add_argument('--attribute-name', type=str, default="clade", help="name to use for clade membership & branch labels", required=False)
184185
parser.add_argument('--output-node-data', type=str, help='name of JSON file to save clade assignments to')
185186

186187

@@ -205,8 +206,14 @@ def run(args):
205206

206207
clade_designations = read_in_clade_definitions(args.clades)
207208

208-
clade_membership = assign_clades(clade_designations, all_muts, tree, ref)
209+
(basal_clade_nodes, clade_membership) = assign_clades(clade_designations, all_muts, tree, ref)
210+
211+
# create node_data for export as a JSON
212+
node_data = {
213+
'nodes': {node: {args.attribute_name: clade} for node,clade in clade_membership.items()},
214+
'branches': {node: {'labels': {args.attribute_name: clade}} for node,clade in basal_clade_nodes.items()}
215+
}
209216

210217
out_name = get_json_name(args)
211-
write_json({'nodes': clade_membership}, out_name)
212-
print("clades written to", out_name, file=sys.stdout)
218+
write_json(node_data, out_name)
219+
print(f"clades written to {out_name} using attribute name {args.attribute_name}", file=sys.stdout)

augur/export_v2.py

+15-31
Original file line numberDiff line numberDiff line change
@@ -92,17 +92,9 @@ def are_mutations_defined(node_attrs):
9292
return True
9393
return False
9494

95-
96-
def are_clades_defined(node_attrs):
95+
def is_node_attr_defined(node_attrs, attr_name):
9796
for node, data in node_attrs.items():
98-
if data.get("clade_membership") or data.get("clade_annotation"):
99-
return True
100-
return False
101-
102-
103-
def are_dates_defined(node_attrs):
104-
for node, data in node_attrs.items():
105-
if data.get("num_date"):
97+
if data.get(attr_name):
10698
return True
10799
return False
108100

@@ -163,7 +155,7 @@ def set_colorings(data_json, config, command_line_colorings, metadata_names, nod
163155
def _get_type(key, trait_values):
164156
# for some keys we know what the type must be
165157
known_types = {
166-
"clade_membership": "categorical",
158+
"clade": "categorical",
167159
"gt": "categorical",
168160
"author": "categorical",
169161
"num_date": "continuous"
@@ -200,7 +192,7 @@ def _get_title(key):
200192
return config_title
201193

202194
# hardcoded fallbacks:
203-
if key == "clade_membership":
195+
if key == "clade":
204196
return "Clade"
205197
if key == "gt":
206198
return "Genotype"
@@ -310,6 +302,12 @@ def _is_valid(coloring):
310302
if key == "gt" and not are_mutations_defined(node_attrs):
311303
warn("[colorings] You asked for mutations (\"gt\"), but none are defined on the tree. They cannot be used as a coloring.")
312304
return False
305+
if key == "clade_membership" and not trait_values:
306+
# augur 12 & below defined clades via the key "clade_membership", not "clade".
307+
# If an auspice_config file specifies this, and it is not present in any node-data, we print a warning.
308+
# (Note that if "clade" is present in node-data, we automatically include it as a colouring.)
309+
warn("You asked for a color-by for 'clade_membership' but this is now called 'clade'. You should update your auspice config file.")
310+
return False
313311
if key != "gt" and not trait_values:
314312
warn("You asked for a color-by for trait '{}', but it has no values on the tree. It has been ignored.".format(key))
315313
return False
@@ -348,11 +346,10 @@ def _get_colorings():
348346
# add in genotype as a special case if (a) not already set and (b) the data supports it
349347
if "gt" not in explicitly_defined_colorings and are_mutations_defined(node_attrs):
350348
colorings.insert(0,{'key':'gt'})
351-
if "num_date" not in explicitly_defined_colorings and are_dates_defined(node_attrs):
349+
if "num_date" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "num_date"):
352350
colorings.insert(0,{'key':'num_date'})
353-
if "clade_membership" not in explicitly_defined_colorings and are_clades_defined(node_attrs):
354-
colorings.insert(0,{'key':'clade_membership'})
355-
351+
if "clade" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "clade"):
352+
colorings.insert(0,{'key':'clade'})
356353
return colorings
357354

358355

@@ -714,8 +711,6 @@ def node_data_prop_is_normal_trait(name):
714711
# those traits / keys / attrs which are not "special" and can be exported
715712
# as normal attributes on nodes
716713
excluded = [
717-
"clade_annotation", # Clade annotation is label, not colorby!
718-
"clade_membership", # will be auto-detected if it is available
719714
"authors", # authors are set as a node property, not a trait property
720715
"author", # see above
721716
"vaccine", # vaccine info is stored as a "special" node prop
@@ -914,16 +909,6 @@ def transfer_mutations_to_branches(node_attrs, branch_attrs):
914909
else:
915910
branch_attrs[node_name]["labels"] = { "aa": aa_lab }
916911

917-
def transfer_clade_annotation_to_branches(node_attrs, branch_attrs):
918-
for node_name, raw_data in node_attrs.items():
919-
if "clade_annotation" in raw_data and is_valid(raw_data["clade_annotation"]):
920-
if node_name not in branch_attrs:
921-
branch_attrs[node_name] = {}
922-
if 'labels' in branch_attrs[node_name]:
923-
branch_attrs[node_name]["labels"]['clade'] = raw_data["clade_annotation"]
924-
else:
925-
branch_attrs[node_name]["labels"] = { "clade": raw_data["clade_annotation"] }
926-
927912
def transfer_branch_data_to_branch_attrs(branches_node_data, branch_attrs):
928913
"""
929914
Transfers information stored in node-data JSONs under "branches" to the `branch_attrs`.
@@ -968,12 +953,11 @@ def parse_node_data_and_metadata(T, node_data_files, metadata_file):
968953
node_attrs[name][corrected_key] = value
969954
node_data_names.add(corrected_key)
970955

971-
# third pass: create `branch_attrs` which includes certain traits supplied in `node_attrs`
972-
# (e.g. mutations are coverted to branch attrs, and `clade_annotation` is interpreted as a label)
956+
# third pass: create `branch_attrs` which includes a few special-case traits from `node_attrs`
957+
# (e.g. mutations are converted from node attrs to branch attrs)
973958
# as well as any branch labels supplied in node-data files.
974959
branch_attrs = {}
975960
transfer_mutations_to_branches(node_attrs, branch_attrs)
976-
transfer_clade_annotation_to_branches(node_attrs, branch_attrs)
977961
transfer_branch_data_to_branch_attrs(node_data.get('branches', {}), branch_attrs)
978962

979963
return (node_data, node_attrs, node_data_names, metadata_names, branch_attrs)

tests/functional/clades.t

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
Integration tests for augur clades.
2+
3+
$ pushd "$TESTDIR" > /dev/null
4+
$ export AUGUR="../../bin/augur"
5+
6+
Run augur clades without --attribute-name. We expect the name to be "clade"
7+
8+
$ ${AUGUR} clades \
9+
> --tree clades/tree.nwk \
10+
> --clades clades/clades.tsv \
11+
> --mutations clades/nt_muts.json \
12+
> --output-node-data "$TMP/default.json" > /dev/null
13+
14+
$ python3 "$TESTDIR/../../scripts/diff_jsons.py" "clades/expected-output-default.json" "$TMP/default.json"
15+
{}
16+
17+
Run augur clades with a custom --attribute-name
18+
19+
$ ${AUGUR} clades \
20+
> --tree clades/tree.nwk \
21+
> --clades clades/clades.tsv \
22+
> --mutations clades/nt_muts.json \
23+
> --attribute-name custom \
24+
> --output-node-data "$TMP/custom-attr.json" > /dev/null
25+
26+
$ python3 "$TESTDIR/../../scripts/diff_jsons.py" "clades/expected-output-custom-attr.json" "$TMP/custom-attr.json"
27+
{}
28+
29+
Ensure the only change between runs of `augur clades` is the attr name used
30+
$ cat "$TMP/default.json" | sed "s/clade/custom/" > "$TMP/default-now-custom.json"
31+
$ diff -u "$TMP/default-now-custom.json" "$TMP/custom-attr.json"
32+
33+
Cleanup
34+
$ rm -f "$TMP/default.json" "$TMP/custom-attr.json" "$TMP/default-now-custom.json"

tests/functional/clades/clades.tsv

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
clade gene site alt
2+
3+
# the 1b mutation appears only once, on the branch leading to tips B and C
4+
# thus we expect the clade label to be on node `internalBC`
5+
cladeCB nuc 1 B
6+
# the 2c mutation appears twice -- on branch `internalBC` and `internalDEF`
7+
# as the latter has 3 descendants, it is chosen over the former
8+
cladeDEF nuc 2 C
9+
# mutation 3e appears only on a terminal node (tipE)
10+
# but we still expect both a branch label and a node_attr
11+
# this means that tipE should be annotated "cladeE" and _not_ "cladeDEF"
12+
cladeE nuc 3 E
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
{
2+
"branches": {
3+
"internalBC": {
4+
"labels": {
5+
"custom": "cladeCB"
6+
}
7+
},
8+
"internalDEF": {
9+
"labels": {
10+
"custom": "cladeDEF"
11+
}
12+
},
13+
"tipE": {
14+
"labels": {
15+
"custom": "cladeE"
16+
}
17+
}
18+
},
19+
"generated_by": {
20+
"program": "augur",
21+
"version": "12.0.0"
22+
},
23+
"nodes": {
24+
"internalBC": {
25+
"custom": "cladeCB"
26+
},
27+
"internalDEF": {
28+
"custom": "cladeDEF"
29+
},
30+
"tipB": {
31+
"custom": "cladeCB"
32+
},
33+
"tipC": {
34+
"custom": "cladeCB"
35+
},
36+
"tipD": {
37+
"custom": "cladeDEF"
38+
},
39+
"tipE": {
40+
"custom": "cladeE"
41+
},
42+
"tipF": {
43+
"custom": "cladeDEF"
44+
}
45+
}
46+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
{
2+
"branches": {
3+
"internalBC": {
4+
"labels": {
5+
"clade": "cladeCB"
6+
}
7+
},
8+
"internalDEF": {
9+
"labels": {
10+
"clade": "cladeDEF"
11+
}
12+
},
13+
"tipE": {
14+
"labels": {
15+
"clade": "cladeE"
16+
}
17+
}
18+
},
19+
"generated_by": {
20+
"program": "augur",
21+
"version": "12.0.0"
22+
},
23+
"nodes": {
24+
"internalBC": {
25+
"clade": "cladeCB"
26+
},
27+
"internalDEF": {
28+
"clade": "cladeDEF"
29+
},
30+
"tipB": {
31+
"clade": "cladeCB"
32+
},
33+
"tipC": {
34+
"clade": "cladeCB"
35+
},
36+
"tipD": {
37+
"clade": "cladeDEF"
38+
},
39+
"tipE": {
40+
"clade": "cladeE"
41+
},
42+
"tipF": {
43+
"clade": "cladeDEF"
44+
}
45+
}
46+
}

tests/functional/clades/nt_muts.json

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"nodes": {
3+
"tipA": {"muts": [], "aa_muts": {}},
4+
"tipB": {"muts": [], "aa_muts": {}},
5+
"tipC": {"muts": [], "aa_muts": {}},
6+
"tipD": {"muts": [], "aa_muts": {}},
7+
"tipE": {"muts": ["A3E"], "aa_muts": {}},
8+
"tipF": {"muts": [], "aa_muts": {}},
9+
"internalBC": {
10+
"muts": ["A1B", "A2C"],
11+
"aa_muts": {}
12+
},
13+
"internalDEF": {
14+
"muts": ["A2C"],
15+
"aa_muts": {}
16+
},
17+
"ROOT":{"muts": [], "aa_muts": {}}
18+
}
19+
}

tests/functional/clades/tree.nwk

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
(tipA:1,(tipB:1,tipC:1)internalBC:2,(tipD:3,tipE:4,tipF:1)internalDEF:5)ROOT:1;

0 commit comments

Comments
 (0)