diff --git a/CHANGES.md b/CHANGES.md index 79f52910e..28d3fed2f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -11,11 +11,15 @@ * Constrain `bcbio-gff` to >=0.7.0 and allow `Biopython` >=1.81 again. We had to introduce the `Biopython` constraint in v21.0.1 (see [#1152][]) due to `bcbio-gff` <0.7.0 relying on the removed `Biopython` feature `UnknownSeq`. [#1178][] (@corneliusroemer) * `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this used the Python parser engine for [`pandas.read_csv()`][]. Updated to use the C engine for faster reading of metadata. [#812][] (@victorlin) +* clades, export v2: Clade labels + coloring keys are now definable via arguments to augur clades allowing pipelines to use multiple invocations of augur clades resulting in multiple sets of colors and branch labels. How labels are stored in the (intermediate) node-data JSON files has changed. This should be fully backwards compatible for pipelines using augur commands, however custom scripts may need updating. PR [#728][] (@jameshadfield) + + ### Bug fixes * filter, frequencies, refine, parse: Previously, ambiguous dates in the future had a limit of today's date imposed on the upper value but not the lower value. It is now imposed on the lower value as well. [#1171][] (@victorlin) * refine: `--year-bounds` was ignored in versions 9.0.0 through 20.0.0. It now works. [#1136][] (@victorlin) +[#728]: https://github.com/nextstrain/augur/pull/728 [#812]: https://github.com/nextstrain/augur/pull/812 [#1136]: https://github.com/nextstrain/augur/issues/1136 [#1152]: https://github.com/nextstrain/augur/pull/1152 diff --git a/augur/clades.py b/augur/clades.py index acfb4c301..ab8614d09 100644 --- a/augur/clades.py +++ b/augur/clades.py @@ -1,5 +1,14 @@ """ Assign clades to nodes in a tree based on amino-acid or nucleotide signatures. + +Nodes which are members of a clade are stored via + → nodes → → clade_membership +and if this file is used in `augur export v2` these will automatically become a coloring. + +The basal nodes of each clade are also given a branch label which is stored via + → branches → → labels → clade. + +The keys "clade_membership" and "clade" are customisable via command line arguments. """ import sys @@ -9,8 +18,12 @@ from collections import defaultdict import networkx as nx from itertools import islice +from .errors import AugurError +from argparse import SUPPRESS from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name +UNASSIGNED = 'unassigned' + def read_in_clade_definitions(clade_file): ''' Reads in tab-seperated file that defines clades by amino acid or nucleotide mutations @@ -111,10 +124,13 @@ def read_in_clade_definitions(clade_file): if clade != root } + if not len(clades.keys()): + raise AugurError(f"No clades were defined in {clade_file}") + return clades -def is_node_in_clade(clade_alleles, node, ref): +def is_node_in_clade(clade_alleles, node, root_sequence): ''' Determines whether a node matches the clade definition based on sequence For any condition, will first look in mutations stored in node.sequences, @@ -123,11 +139,13 @@ def is_node_in_clade(clade_alleles, node, ref): Parameters ---------- clade_alleles : list - list of clade defining alleles + list of clade defining alleles (typically supplied from the input TSV) node : Bio.Phylo.BaseTree.Clade node to check, assuming sequences (as mutations) are attached to node - ref : str or list - positions + node.sequences specifies nucleotides/codons which are newly observed on this node + i.e. they are the result of a mutation observed on the branch leading to this node + root_sequence : dict + {geneName: observed root sequence (list)} Returns ------- @@ -139,8 +157,12 @@ def is_node_in_clade(clade_alleles, node, ref): for gene, pos, clade_state in clade_alleles: if gene in node.sequences and pos in node.sequences[gene]: state = node.sequences[gene][pos] - elif ref and gene in ref: - state = ref[gene][pos] + elif root_sequence and gene in root_sequence: + try: + state = root_sequence[gene][pos] + except IndexError: + raise AugurError(f"A clade definition specifies {{{gene},{pos+1},{clade_state}}} which \ +is beyond the bounds of the supplied root sequence for {gene} (length {len(root_sequence[gene])})") else: state = '' @@ -148,12 +170,26 @@ def is_node_in_clade(clade_alleles, node, ref): return all(conditions) +def ensure_no_multiple_mutations(all_muts): + multiples = [] + + for name,node in all_muts.items(): + nt_positions = [int(mut[1:-1])-1 for mut in node.get('muts', [])] + if len(set(nt_positions))!=len(nt_positions): + multiples.append(f"Node {name} (nuc)") + for gene in node.get('aa_muts', {}): + aa_positions = [int(mut[1:-1])-1 for mut in node['aa_muts'][gene]] + if len(set(aa_positions))!=len(aa_positions): + multiples.append(f"Node {name} ({gene})") + + if multiples: + raise AugurError(f"Multiple mutations at the same position on a single branch were found: {', '.join(multiples)}") def assign_clades(clade_designations, all_muts, tree, ref=None): ''' Ensures all nodes have an entry (or auspice doesn't display nicely), tests each node - to see if it's the first member of a clade (assigns 'clade_annotation'), and sets - all nodes's clade_membership to the value of their parent. This will change if later found to be + to see if it's the first member of a clade (this is the label), and sets the membership of each + node to the value of their parent. This will change if later found to be the first member of a clade. Parameters @@ -169,16 +205,18 @@ def assign_clades(clade_designations, all_muts, tree, ref=None): Returns ------- - dict - mapping of node to clades + (dict, dict) + [0]: mapping of node to clade membership (where applicable) + [1]: mapping of node to clade label (where applicable) ''' clade_membership = {} + clade_labels = {} parents = get_parent_name_by_child_name_for_tree(tree) # first pass to set all nodes to unassigned as precaution to ensure attribute is set for node in tree.find_clades(order = 'preorder'): - clade_membership[node.name] = {'clade_membership': 'unassigned'} + clade_membership[node.name] = UNASSIGNED # count leaves for node in tree.find_clades(order = 'postorder'): @@ -189,16 +227,16 @@ def assign_clades(clade_designations, all_muts, tree, ref=None): c.up=node tree.root.up = None tree.root.sequences = {'nuc':{}} - tree.root.sequences.update({gene:{} for gene in all_muts[tree.root.name]['aa_muts']}) + tree.root.sequences.update({gene:{} for gene in all_muts.get(tree.root.name, {}).get('aa_muts', {})}) # attach sequences to all nodes for node in tree.find_clades(order='preorder'): if node.up: node.sequences = {gene:muts.copy() for gene, muts in node.up.sequences.items()} - for mut in all_muts[node.name]['muts']: + for mut in all_muts.get(node.name, {}).get('muts', []): a, pos, d = mut[0], int(mut[1:-1])-1, mut[-1] node.sequences['nuc'][pos] = d - if 'aa_muts' in all_muts[node.name]: + if 'aa_muts' in all_muts.get(node.name, {}): for gene in all_muts[node.name]['aa_muts']: for mut in all_muts[node.name]['aa_muts'][gene]: a, pos, d = mut[0], int(mut[1:-1])-1, mut[-1] @@ -208,7 +246,7 @@ def assign_clades(clade_designations, all_muts, tree, ref=None): node.sequences[gene][pos] = d - # second pass to assign 'clade_annotation' to basal nodes within each clade + # second pass to assign basal nodes within each clade to the clade_labels dict # if multiple nodes match, assign annotation to largest # otherwise occasional unwanted cousin nodes get assigned the annotation for clade_name, clade_alleles in clade_designations.items(): @@ -219,58 +257,100 @@ def assign_clades(clade_designations, all_muts, tree, ref=None): sorted_nodes = sorted(node_counts, key=lambda x: x.leaf_count, reverse=True) if len(sorted_nodes) > 0: target_node = sorted_nodes[0] - clade_membership[target_node.name] = {'clade_annotation': clade_name, 'clade_membership': clade_name} + clade_membership[target_node.name] = clade_name + clade_labels[target_node.name] = clade_name - # third pass to propagate 'clade_membership' - # don't propagate if encountering 'clade_annotation' + # third pass to propagate clade_membership to descendant nodes + # (until we encounter a node with its own clade_membership) for node in tree.find_clades(order = 'preorder'): for child in node: - if 'clade_annotation' not in clade_membership[child.name]: - clade_membership[child.name]['clade_membership'] = clade_membership[node.name]['clade_membership'] + if child.name not in clade_labels: + clade_membership[child.name] = clade_membership[node.name] - return clade_membership + return (clade_membership, clade_labels) + +def warn_if_clades_not_found(membership, clade_designations): + clades = set(clade_designations.keys()) + found = set([clade for clade in membership.values() if clade!=UNASSIGNED]) + if not(len(found)): + print(f"WARNING in augur.clades: no clades found in tree!") + return + for clade in clades-found: + # warn loudly - one line per unfound clade + print(f"WARNING in augur.clades: clade '{clade}' not found in tree!") def get_reference_sequence_from_root_node(all_muts, root_name): - # attach sequences to root + """ + Extracts the (nuc) sequence from the root node, if set, as well as + the (aa) sequences. Returns a dictionary of {geneName: rootSequence} + where rootSequence is a list and geneName may be 'nuc'. + """ ref = {} - try: - ref['nuc'] = list(all_muts[root_name]["sequence"]) - except: - print("WARNING in augur.clades: nucleotide mutation json does not contain full sequences for the root node.") - if "aa_muts" in all_muts[root_name]: + # the presence of a single mutation will imply that the corresponding reference + # sequence should be present, and we will warn if it is not + nt_present = False + genes_present = set([]) + missing = [] + for d in all_muts.values(): + if "muts" in d: + nt_present = True + genes_present.update(d.get('aa_muts', {}).keys()) + + if nt_present: + try: + ref['nuc'] = list(all_muts.get(root_name, {})["sequence"]) + except KeyError: + missing.append("nuc") + + for gene in genes_present: try: - ref.update({gene:list(seq) for gene, seq in all_muts[root_name]["aa_sequences"].items()}) - except: - print("WARNING in augur.clades: amino acid mutation json does not contain full sequences for the root node.") + ref[gene] = list(all_muts.get(root_name, {}).get("aa_sequences", {})[gene]) + except KeyError: + missing.append(gene) + + if missing: + print(f"WARNING in augur.clades: sequences at the root node have not been specified for {{{', '.join(missing)}}}, \ +even though mutations were observed. Clades which are annotated using bases/codons present at the root \ +of the tree may not be correctly inferred.") return ref +def parse_nodes(tree_file, node_data_files): + tree = Phylo.read(tree_file, 'newick') + # don't supply tree to read_node_data as we don't want to require that every node is present in the node_data JSONs + node_data = read_node_data(node_data_files) + # node_data files can be parsed without 'nodes' (if they have 'branches') + if "nodes" not in node_data or len(node_data['nodes'].keys())==0: + raise AugurError(f"No nodes found in the supplied node data files. Please check {', '.join(node_data_files)}") + json_nodes = set(node_data["nodes"].keys()) + tree_nodes = set([clade.name for clade in tree.find_clades()]) + if not json_nodes.issubset(tree_nodes): + raise AugurError(f"The following nodes in the node_data files ({', '.join(node_data_files)}) are not found in the tree ({tree_file}): {', '.join(json_nodes - tree_nodes)}") + ensure_no_multiple_mutations(node_data['nodes']) + return (tree, node_data['nodes']) def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("clades", help=__doc__) - parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided") - parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ') - parser.add_argument('--reference', nargs='+', help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ') - parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid') - parser.add_argument('--output-node-data', type=str, help='name of JSON file to save clade assignments to') + parser.add_argument('--tree', required=True, help="prebuilt Newick -- no tree will be built if provided") + parser.add_argument('--mutations', required=True, metavar="NODE_DATA_JSON", nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ') + parser.add_argument('--reference', nargs='+', help=SUPPRESS) + parser.add_argument('--clades', required=True, metavar="TSV", type=str, help='TSV file containing clade definitions by amino-acid') + parser.add_argument('--output-node-data', type=str, metavar="NODE_DATA_JSON", help='name of JSON file to save clade assignments to') + parser.add_argument('--membership-name', type=str, default="clade_membership", help='Key to store clade membership under; use "None" to not export this') + parser.add_argument('--label-name', type=str, default="clade", help='Key to store clade labels under; use "None" to not export this') return parser def run(args): - ## read tree and data, if reading data fails, return with error code - tree = Phylo.read(args.tree, 'newick') - node_data = read_node_data(args.mutations, args.tree) - if node_data is None: - print("ERROR: could not read node data (incl sequences)") - return 1 - all_muts = node_data['nodes'] + (tree, all_muts) = parse_nodes(args.tree, args.mutations) if args.reference: # PLACE HOLDER FOR vcf WORKFLOW. # Works without a reference for now but can be added if clade defs contain positions # that are monomorphic across reference and sequence sample. + print(f"WARNING in augur.clades: You have provided a --reference file(s) ({args.reference}) however this is functionality is not yet supported.") ref = None else: # extract reference sequences from the root node entry in the mutation json @@ -279,8 +359,20 @@ def run(args): clade_designations = read_in_clade_definitions(args.clades) - clade_membership = assign_clades(clade_designations, all_muts, tree, ref) + membership, labels = assign_clades(clade_designations, all_muts, tree, ref) + warn_if_clades_not_found(membership, clade_designations) + + membership_key= args.membership_name if args.membership_name.upper() != "NONE" else None + label_key= args.label_name if args.label_name.upper() != "NONE" else None + + node_data_json = {} + if membership_key: + node_data_json['nodes'] = {node: {membership_key: clade} for node,clade in membership.items()} + print(f"Clade membership stored on nodes → → {membership_key}", file=sys.stdout) + if label_key: + node_data_json['branches'] = {node: {'labels': {label_key: label}} for node,label in labels.items()} + print(f"Clade labels stored on branches → → labels → {label_key}", file=sys.stdout) out_name = get_json_name(args) - write_json({'nodes': clade_membership}, out_name) - print("clades written to", out_name, file=sys.stdout) + write_json(node_data_json, out_name) + print(f"Clades written to {out_name}", file=sys.stdout) diff --git a/augur/export_v2.py b/augur/export_v2.py index 3f606c797..4c1988a11 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -151,23 +151,15 @@ def convert_tree_to_json_structure(node, metadata, div=0): return node_struct -def are_mutations_defined(node_attrs): - for node, data in node_attrs.items(): - if data.get("aa_muts") or data.get("muts"): - return True - return False - - -def are_clades_defined(node_attrs): - for node, data in node_attrs.items(): - if data.get("clade_membership") or data.get("clade_annotation"): +def are_mutations_defined(branch_attrs): + for branch in branch_attrs.values(): + if branch.get("mutations"): return True return False - -def are_dates_defined(node_attrs): +def is_node_attr_defined(node_attrs, attr_name): for node, data in node_attrs.items(): - if data.get("num_date"): + if data.get(attr_name): return True return False @@ -223,7 +215,7 @@ def get_config_colorings_as_dict(config): return config_colorings -def set_colorings(data_json, config, command_line_colorings, metadata_names, node_data_colorings, provided_colors, node_attrs): +def set_colorings(data_json, config, command_line_colorings, metadata_names, node_data_colorings, provided_colors, node_attrs, branch_attrs): def _get_type(key, trait_values): # for some keys we know what the type must be @@ -372,7 +364,7 @@ def _create_coloring(key): def _is_valid(coloring): key = coloring["key"] trait_values = get_values_across_nodes(node_attrs, key) # e.g. list of countries, regions etc - if key == "gt" and not are_mutations_defined(node_attrs): + if key == "gt" and not are_mutations_defined(branch_attrs): warn("[colorings] You asked for mutations (\"gt\"), but none are defined on the tree. They cannot be used as a coloring.") return False if key != "gt" and not trait_values: @@ -411,11 +403,11 @@ def _get_colorings(): explicitly_defined_colorings = [x["key"] for x in colorings] # add in genotype as a special case if (a) not already set and (b) the data supports it - if "gt" not in explicitly_defined_colorings and are_mutations_defined(node_attrs): + if "gt" not in explicitly_defined_colorings and are_mutations_defined(branch_attrs): colorings.insert(0,{'key':'gt'}) - if "num_date" not in explicitly_defined_colorings and are_dates_defined(node_attrs): + if "num_date" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "num_date"): colorings.insert(0,{'key':'num_date'}) - if "clade_membership" not in explicitly_defined_colorings and are_clades_defined(node_attrs): + if "clade_membership" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "clade_membership"): colorings.insert(0,{'key':'clade_membership'}) return colorings @@ -702,10 +694,23 @@ def node_to_author_tuple(data): return node_author_info +def set_branch_attrs_on_tree(data_json, branch_attrs): + """ + Shifts the provided `branch_attrs` onto the (auspice) `data_json`. + Currently all data is transferred, there is no way for (e.g.) the set of exported + labels to be restricted by the user in a config. + """ + def _recursively_set_data(node): + if branch_attrs.get(node['name'], {}): + node['branch_attrs'] = branch_attrs[node['name']] + for child in node.get("children", []): + _recursively_set_data(child) + _recursively_set_data(data_json["tree"]) + def set_node_attrs_on_tree(data_json, node_attrs): ''' - Assign desired colorings, metadata etc to the tree structure + Assign desired colorings, metadata etc to the `node_attrs` of nodes in the tree Parameters ---------- @@ -716,33 +721,10 @@ def set_node_attrs_on_tree(data_json, node_attrs): author_data = create_author_data(node_attrs) - def _transfer_mutations(node, raw_data): - if "aa_muts" in raw_data or "muts" in raw_data: - node["branch_attrs"]["mutations"] = {} - if "muts" in raw_data and len(raw_data["muts"]): - node["branch_attrs"]["mutations"]["nuc"] = raw_data["muts"] - if "aa_muts" in raw_data: - aa = {gene:data for gene, data in raw_data["aa_muts"].items() if len(data)} - node["branch_attrs"]["mutations"].update(aa) - #convert mutations into a label - if aa: - aa_lab = '; '.join("{!s}: {!s}".format(key,', '.join(val)) for (key,val) in aa.items()) - if 'labels' in node["branch_attrs"]: - node["branch_attrs"]["labels"]["aa"] = aa_lab - else: - node["branch_attrs"]["labels"] = { "aa": aa_lab } - def _transfer_vaccine_info(node, raw_data): if raw_data.get("vaccine"): node["node_attrs"]['vaccine'] = raw_data['vaccine'] - def _transfer_labels(node, raw_data): - if "clade_annotation" in raw_data and is_valid(raw_data["clade_annotation"]): - if 'labels' in node["branch_attrs"]: - node["branch_attrs"]["labels"]['clade'] = raw_data["clade_annotation"] - else: - node["branch_attrs"]["labels"] = { "clade": raw_data["clade_annotation"] } - def _transfer_hidden_flag(node, raw_data): hidden = raw_data.get("hidden", None) if hidden: @@ -791,9 +773,7 @@ def _recursively_set_data(node): # get all the available information for this particular node raw_data = node_attrs[node["name"]] # transfer "special cases" - _transfer_mutations(node, raw_data) _transfer_vaccine_info(node, raw_data) - _transfer_labels(node, raw_data) _transfer_hidden_flag(node, raw_data) _transfer_num_date(node, raw_data) _transfer_url_accession(node, raw_data) @@ -810,8 +790,6 @@ def node_data_prop_is_normal_trait(name): # those traits / keys / attrs which are not "special" and can be exported # as normal attributes on nodes excluded = [ - "clade_annotation", # Clade annotation is label, not colorby! - "clade_membership", # will be auto-detected if it is available "authors", # authors are set as a node property, not a trait property "author", # see above "vaccine", # vaccine info is stored as a "special" node prop @@ -983,6 +961,45 @@ def set_description(data_json, cmd_line_description_file): except FileNotFoundError: fatal("Provided desciption file {} does not exist".format(cmd_line_description_file)) +def create_branch_mutations(branch_attrs, node_data): + for node_name, node_info in node_data['nodes'].items(): + if node_name not in branch_attrs: + continue # strain name not in the tree + if "aa_muts" not in node_info and "muts" not in node_info: + continue + branch_attrs[node_name]['mutations'] = {} + if "muts" in node_info and len(node_info["muts"]): + branch_attrs[node_name]["mutations"]["nuc"] = node_info["muts"] + if "aa_muts" in node_info: + aa = {gene:data for gene, data in node_info["aa_muts"].items() if len(data)} + branch_attrs[node_name]["mutations"].update(aa) + +def create_branch_labels(branch_attrs, node_data, branch_data): + ## start by creating the 'aa' branch label, summarising any amino acid mutations. + ## (We have already set mutations on 'branch_attrs' if they exist, just not the label) + ## This is done first so that if the user defines their own 'aa' labels they will + ## overwrite the ones created here + for branch_info in branch_attrs.values(): + genes = [gene for gene in branch_info.get('mutations', {}) if gene!='nuc'] + if len(genes): + branch_info['labels']['aa'] = \ + '; '.join(f"{gene}: {', '.join(branch_info['mutations'][gene])}" for gene in genes) + + ## check for the special key 'clade_annotation' defined via node data. + ## For historical reasons, this is interpreted as a branch label 'clade' + for node_name, node_info in node_data.items(): + if node_name in branch_attrs and "clade_annotation" in node_info and is_valid(node_info["clade_annotation"]): + branch_attrs[node_name]['labels']['clade'] = node_info["clade_annotation"] + + ## finally transfer any labels defined via -> 'branches' -> labels + for node_name, branch_info in branch_data.items(): + if node_name not in branch_attrs: + continue + for label_key, label_value in branch_info.get('labels', {}).items(): + if label_key.upper() == "NONE" or not is_valid(label_value): + continue + branch_attrs[node_name]["labels"][label_key] = label_value + def parse_node_data_and_metadata(T, node_data, metadata): node_data_names = set() metadata_names = set() @@ -999,14 +1016,24 @@ def parse_node_data_and_metadata(T, node_data, metadata): metadata_names.add(corrected_key) # second pass: node data JSONs (overwrites keys of same name found in metadata) + node_attrs_which_are_actually_branch_attrs = ["clade_annotation", "aa_muts", "muts"] for name, info in node_data['nodes'].items(): if name in node_attrs: # i.e. this node name is in the tree for key, value in info.items(): + if key in node_attrs_which_are_actually_branch_attrs: + continue # these will be handled below corrected_key = update_deprecated_names(key) node_attrs[name][corrected_key] = value node_data_names.add(corrected_key) - return (node_data, node_attrs, node_data_names, metadata_names) + # third pass: create `branch_attrs`. The data comes from + # (a) some keys within `node_data['nodes']` (for legacy reasons) + # (b) the `node_data['branches']` dictionary, which currently only defines labels + branch_attrs = {clade.name: defaultdict(dict) for clade in T.root.find_clades()} + create_branch_mutations(branch_attrs, node_data) + create_branch_labels(branch_attrs, node_data['nodes'], node_data.get('branches', {})) + + return (node_data, node_attrs, node_data_names, metadata_names, branch_attrs) def get_config(args): if not args.auspice_config: @@ -1054,7 +1081,8 @@ def run(args): # parse input files T = Phylo.read(args.tree, 'newick') - node_data, node_attrs, node_data_names, metadata_names = parse_node_data_and_metadata(T, node_data_file, metadata_file) + node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \ + parse_node_data_and_metadata(T, node_data_file, metadata_file) config = get_config(args) # set metadata data structures @@ -1074,7 +1102,8 @@ def run(args): metadata_names=metadata_names, node_data_colorings=node_data_names, provided_colors=read_colors(args.colors), - node_attrs=node_attrs + node_attrs=node_attrs, + branch_attrs=branch_attrs ) except FileNotFoundError as e: print(f"ERROR: required file could not be read: {e}") @@ -1084,6 +1113,8 @@ def run(args): # set tree structure data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs) set_node_attrs_on_tree(data_json, node_attrs) + set_branch_attrs_on_tree(data_json, branch_attrs) + set_geo_resolutions(data_json, config, args.geo_resolutions, read_lat_longs(args.lat_longs), node_attrs) set_panels(data_json, config, args.panels) set_data_provenance(data_json, config) diff --git a/augur/util_support/node_data_file.py b/augur/util_support/node_data_file.py index 5d389771e..39479fb09 100644 --- a/augur/util_support/node_data_file.py +++ b/augur/util_support/node_data_file.py @@ -27,7 +27,12 @@ def annotations(self): @property def nodes(self): - return self.attrs.get("nodes") + return self.attrs.get("nodes", {}) + + @property + def branches(self): + # these are optional, so we provide an empty dict as a default + return self.attrs.get("branches", {}) @property def generated_by(self): @@ -73,6 +78,15 @@ def validate(self): f"`nodes` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!" ) + if not isinstance(self.branches, dict): + raise AugurError( + f"`branches` value in {self.fname} is not a dictionary. Please check the formatting of this JSON!" ) + + if not self.nodes and not self.branches: + raise AugurError( + f"{self.fname} did not contain either `nodes` or `branches`. Please check the formatting of this JSON!" + ) + if self.validation_mode is not ValidationMode.SKIP and self.is_generated_by_incompatible_augur: msg = ( f"Augur version incompatibility detected: the JSON {self.fname} was generated by " diff --git a/docs/faq/clades.md b/docs/faq/clades.md index 35a1488b2..17c8f50cc 100644 --- a/docs/faq/clades.md +++ b/docs/faq/clades.md @@ -39,7 +39,7 @@ rule clades: As input, this command requires the tree, the output of the ancestral reconstruction steps and the translation step (assuming your clades are defined using translations), as well as the file with clade definitions. The output of this command is a json file in the common augur format that specifies `clade_membership` for each node in the tree. -Nodes that didn't match any clade definition will be left `unassigned`. -Internal nodes that form the root of clades will have an additional field `clade_annotation` that auspice uses to label branches in the tree. +Nodes that didn't match any clade definition will be left unassigned. +Internal nodes that form the root of each clade will also have a branch label assigned (you can prevent this by adding `--label-name none`). diff --git a/tests/functional/clades.t b/tests/functional/clades.t index 619c35d56..0dca3dc13 100644 --- a/tests/functional/clades.t +++ b/tests/functional/clades.t @@ -13,4 +13,82 @@ Test augur clades with simple Zika input files and hierarchical clades. $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/clades.json "$TMP/clades.json" \ > --exclude-paths "root['generated_by']" - {} \ No newline at end of file + {} + +Test custom membership key + label key. The only change should be the key names + + $ ${AUGUR} clades \ + > --tree clades/tree.nwk \ + > --mutations clades/aa_muts.json clades/nt_muts_small.json \ + > --clades clades/clades.tsv \ + > --membership-name lineage --label-name origin \ + > --output-node-data "$TMP/clades_custom.json" &>/dev/null + + $ cat "$TMP/clades_custom.json" | sed "s/lineage/clade_membership/" | sed "s/origin/clade/" > "$TMP/clades_sed.json" + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/clades.json "$TMP/clades_sed.json" \ + > --exclude-paths "root['generated_by']" + {} + +Test the ability to _not_ export a branch label (same logic as not exporting the membership) + + $ ${AUGUR} clades \ + > --tree clades/tree.nwk \ + > --mutations clades/aa_muts.json clades/nt_muts_small.json \ + > --clades clades/clades.tsv \ + > --label-name none \ + > --output-node-data "$TMP/clades_no-labels.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/clades.json "$TMP/clades_no-labels.json" \ + > --exclude-paths "root['generated_by']" + {'dictionary_item_removed': [root['branches']]} + + +A clade which exists at the root is not identified by inferring the root state +(i.e. we don't infer the root state to be A if we observe a subsequent A10T mutation) +This is an oversight and ideally would be fixed + + $ ${AUGUR} clades \ + > --tree clades/toy_tree.nwk \ + > --mutations clades/toy_muts_no_ref.json \ + > --clades clades/toy_clades_nuc.tsv \ + > --output-node-data "$TMP/toy_clades_1.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_1.json "$TMP/toy_clades_1.json" \ + > --exclude-paths "root['generated_by']" + {} + +A clade which exists at the root is identified (and correctly propogated) if the root sequence +is explicitly set. + + $ ${AUGUR} clades \ + > --tree clades/toy_tree.nwk \ + > --mutations clades/toy_muts_ref.json \ + > --clades clades/toy_clades_nuc.tsv \ + > --output-node-data "$TMP/toy_clades_2a.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_2.json "$TMP/toy_clades_2a.json" \ + > --exclude-paths "root['generated_by']" + {} + +A clade which exists at the root is identified (and correctly propogated) without a root sequence +if the (branch leading to the) root has the clade-defining mutation. + + $ ${AUGUR} clades \ + > --tree clades/toy_tree.nwk \ + > --mutations clades/toy_muts_explicit_root_mutation.json \ + > --clades clades/toy_clades_nuc.tsv \ + > --output-node-data "$TMP/toy_clades_2b.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_2.json "$TMP/toy_clades_2b.json" \ + > --exclude-paths "root['generated_by']" + {} + +Multiple mutations at the same position on a single branch are a fatal error + + $ ${AUGUR} clades \ + > --tree clades/toy_tree.nwk \ + > --mutations clades/toy_muts_multiple.json \ + > --clades clades/toy_clades_nuc.tsv + ERROR: Multiple mutations at the same position on a single branch were found: Node A (nuc), Node AB (geneName) + [2] \ No newline at end of file diff --git a/tests/functional/clades/clades.json b/tests/functional/clades/clades.json index bcb32caa4..3d8eae6ed 100644 --- a/tests/functional/clades/clades.json +++ b/tests/functional/clades/clades.json @@ -23,18 +23,15 @@ "clade_membership": "A.1" }, "NODE_0000001": { - "clade_annotation": "B", "clade_membership": "B" }, "NODE_0000002": { "clade_membership": "A" }, "NODE_0000003": { - "clade_annotation": "A", "clade_membership": "A" }, "NODE_0000004": { - "clade_annotation": "A.1", "clade_membership": "A.1" }, "NODE_0000005": { @@ -47,7 +44,6 @@ "clade_membership": "B" }, "NODE_0000008": { - "clade_annotation": "B.1", "clade_membership": "B.1" }, "PAN/CDC_259359_V1_V3/2015": { @@ -62,5 +58,27 @@ "ZKC2/2016": { "clade_membership": "A" } + }, + "branches": { + "NODE_0000001": { + "labels": { + "clade": "B" + } + }, + "NODE_0000003": { + "labels": { + "clade": "A" + } + }, + "NODE_0000004": { + "labels": { + "clade": "A.1" + } + }, + "NODE_0000008": { + "labels": { + "clade": "B.1" + } + } } } \ No newline at end of file diff --git a/tests/functional/clades/toy_clades_1.json b/tests/functional/clades/toy_clades_1.json new file mode 100644 index 000000000..aa5a308e0 --- /dev/null +++ b/tests/functional/clades/toy_clades_1.json @@ -0,0 +1,30 @@ +{ + "branches": { + "A": { + "labels": { + "clade": "clade_2" + } + } + }, + "generated_by": { + "program": "augur", + "version": "21.1.0" + }, + "nodes": { + "A": { + "clade_membership": "clade_2" + }, + "AB": { + "clade_membership": "unassigned" + }, + "B": { + "clade_membership": "unassigned" + }, + "C": { + "clade_membership": "unassigned" + }, + "ROOT": { + "clade_membership": "unassigned" + } + } +} \ No newline at end of file diff --git a/tests/functional/clades/toy_clades_2.json b/tests/functional/clades/toy_clades_2.json new file mode 100644 index 000000000..206a0560a --- /dev/null +++ b/tests/functional/clades/toy_clades_2.json @@ -0,0 +1,35 @@ +{ + "branches": { + "A": { + "labels": { + "clade": "clade_2" + } + }, + "ROOT": { + "labels": { + "clade": "clade_1" + } + } + }, + "generated_by": { + "program": "augur", + "version": "21.1.0" + }, + "nodes": { + "A": { + "clade_membership": "clade_2" + }, + "AB": { + "clade_membership": "clade_1" + }, + "B": { + "clade_membership": "clade_1" + }, + "C": { + "clade_membership": "clade_1" + }, + "ROOT": { + "clade_membership": "clade_1" + } + } +} \ No newline at end of file diff --git a/tests/functional/clades/toy_clades_nuc.tsv b/tests/functional/clades/toy_clades_nuc.tsv new file mode 100644 index 000000000..0e1afa6e0 --- /dev/null +++ b/tests/functional/clades/toy_clades_nuc.tsv @@ -0,0 +1,3 @@ +clade gene site alt +clade_1 nuc 10 A +clade_2 nuc 10 T \ No newline at end of file diff --git a/tests/functional/clades/toy_muts_explicit_root_mutation.json b/tests/functional/clades/toy_muts_explicit_root_mutation.json new file mode 100644 index 000000000..f9587e89b --- /dev/null +++ b/tests/functional/clades/toy_muts_explicit_root_mutation.json @@ -0,0 +1,10 @@ +{ + "nodes": { + "ROOT": { + "muts": ["C10A"] + }, + "A": { + "muts": ["A10T"] + } + } +} \ No newline at end of file diff --git a/tests/functional/clades/toy_muts_multiple.json b/tests/functional/clades/toy_muts_multiple.json new file mode 100644 index 000000000..0688cee89 --- /dev/null +++ b/tests/functional/clades/toy_muts_multiple.json @@ -0,0 +1,15 @@ +{ + "nodes": { + "A": { + "muts": ["A10T", "T10C"] + }, + "AB": { + "aa_muts": { + "geneName": ["S42L", "R42H", "Y50W"] + } + }, + "B": { + "muts": ["A10T", "T11C"] + } + } + } \ No newline at end of file diff --git a/tests/functional/clades/toy_muts_no_ref.json b/tests/functional/clades/toy_muts_no_ref.json new file mode 100644 index 000000000..f909383f7 --- /dev/null +++ b/tests/functional/clades/toy_muts_no_ref.json @@ -0,0 +1,7 @@ +{ + "nodes": { + "A": { + "muts": ["A10T"] + } + } +} \ No newline at end of file diff --git a/tests/functional/clades/toy_muts_ref.json b/tests/functional/clades/toy_muts_ref.json new file mode 100644 index 000000000..35287cb8a --- /dev/null +++ b/tests/functional/clades/toy_muts_ref.json @@ -0,0 +1,10 @@ +{ + "nodes": { + "ROOT": { + "sequence": "AAAAAAAAAA" + }, + "A": { + "muts": ["A10T"] + } + } +} \ No newline at end of file diff --git a/tests/functional/clades/toy_tree.nwk b/tests/functional/clades/toy_tree.nwk new file mode 100644 index 000000000..6bbdf908d --- /dev/null +++ b/tests/functional/clades/toy_tree.nwk @@ -0,0 +1 @@ +((A:0.5,B:0.3)AB:0.8,C:0.3)ROOT:0.2; diff --git a/tests/functional/export_v2.t b/tests/functional/export_v2.t index b9eb06e94..1d78bb057 100644 --- a/tests/functional/export_v2.t +++ b/tests/functional/export_v2.t @@ -166,5 +166,16 @@ Run export with metadata and external colors TSV that contains zero values. {} $ rm -f "$TMP/dataset6.json" +Test that attributes are correctly exported as branch_attrs. Currently this includes branch labels (node_data→branches), +mutations (node_data→nodes) and a historical node_data→nodes→→clade_annotation branch label. + $ ${AUGUR} export v2 \ + > --tree export_v2/tree.nwk \ + > --node-data export_v2/div_node-data.json export_v2/nt_muts_1.json export_v2/aa_muts_1.json export_v2/branch-labels.json \ + > --maintainers "Nextstrain Team" \ + > --output "$TMP/dataset-with-branch-labels.json" > /dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" export_v2/dataset-with-branch-labels.json "$TMP/dataset-with-branch-labels.json" \ + > --exclude-paths "root['meta']['updated']" + {} $ popd > /dev/null diff --git a/tests/functional/export_v2/aa_muts_1.json b/tests/functional/export_v2/aa_muts_1.json new file mode 100644 index 000000000..fe9dc9fa6 --- /dev/null +++ b/tests/functional/export_v2/aa_muts_1.json @@ -0,0 +1,33 @@ +{ + "annotations": { + "gene1": { + "end": 150, + "start": 50, + "strand": "+" + }, + "gene2": { + "end": 300, + "start": 200, + "strand": "+" + }, + "nuc": { + "end": 500, + "start": 1, + "strand": "+" + } + }, + "nodes": { + "internalBC": { + "aa_muts": { + "gene1": ["S10G", "P20S"], + "gene2": [] + } + }, + "internalDEF": { + "aa_muts": { + "gene1": ["P20S"], + "gene2": [] + } + } + } +} \ No newline at end of file diff --git a/tests/functional/export_v2/branch-labels.json b/tests/functional/export_v2/branch-labels.json new file mode 100644 index 000000000..d0595832e --- /dev/null +++ b/tests/functional/export_v2/branch-labels.json @@ -0,0 +1,37 @@ +{ + "nodes": { + "tipD": { + "clade_membership": "membership D", + "clade_annotation": "set via nodes→clade_annotation" + }, + "tipC": { + "clade_membership": "membership C", + "clade_annotation": "this should be overwritten by custom clade label" + } + }, + "branches": { + "ROOT": { + "labels": { + "fruit": "apple" + } + }, + "tipA": { + "labels": { + "fruit": "orange" + } + }, + "tipC": { + "labels": { + "clade": "clade C", + "none": "this will not be exported" + } + }, + "internalBC": { + "labels": { + "fruit": "pomegranate", + "vegetable": "pumpkin", + "aa": "custom aa label" + } + } + } +} \ No newline at end of file diff --git a/tests/functional/export_v2/dataset-with-branch-labels.json b/tests/functional/export_v2/dataset-with-branch-labels.json new file mode 100644 index 000000000..d09dfb61a --- /dev/null +++ b/tests/functional/export_v2/dataset-with-branch-labels.json @@ -0,0 +1,162 @@ +{ + "version": "v2", + "meta": { + "maintainers": [ + { + "name": "Nextstrain Team" + } + ], + "genome_annotations": { + "nuc": { + "end": 500, + "start": 1, + "strand": "+" + }, + "gene1": { + "end": 150, + "start": 50, + "strand": "+" + }, + "gene2": { + "end": 300, + "start": 200, + "strand": "+" + } + }, + "colorings": [ + { + "key": "gt", + "title": "Genotype", + "type": "categorical" + }, + { + "key": "clade_membership", + "title": "Clade", + "type": "categorical" + } + ], + "filters": [ + "clade_membership" + ], + "panels": [ + "tree", + "entropy" + ], + "updated": "2022-09-09" + }, + "tree": { + "name": "ROOT", + "node_attrs": { + "div": 0 + }, + "branch_attrs": { + "labels": { + "fruit": "apple" + } + }, + "children": [ + { + "name": "tipA", + "branch_attrs": { + "labels": { + "fruit": "orange" + } + }, + "node_attrs": { + "div": 1 + } + }, + { + "name": "internalBC", + "node_attrs": { + "div": 2 + }, + "branch_attrs": { + "mutations": { + "gene1": [ + "S10G", + "P20S" + ] + }, + "labels": { + "aa": "custom aa label", + "fruit": "pomegranate", + "vegetable": "pumpkin" + } + }, + "children": [ + { + "name": "tipB", + "node_attrs": { + "div": 3 + }, + "branch_attrs": {} + }, + { + "name": "tipC", + "node_attrs": { + "div": 3, + "clade_membership": { + "value": "membership C" + } + }, + "branch_attrs": { + "labels": { + "clade": "clade C" + } + } + } + ] + }, + { + "name": "internalDEF", + "node_attrs": { + "div": 5 + }, + "branch_attrs": { + "mutations": { + "nuc": [ + "A400T" + ], + "gene1": [ + "P20S" + ] + }, + "labels": { + "aa": "gene1: P20S" + } + }, + "children": [ + { + "name": "tipD", + "node_attrs": { + "div": 8, + "clade_membership": { + "value": "membership D" + } + }, + "branch_attrs": { + "labels": { + "clade": "set via nodes\u2192clade_annotation" + } + } + }, + { + "name": "tipE", + "node_attrs": { + "div": 9 + }, + "branch_attrs": {} + }, + { + "name": "tipF", + "node_attrs": { + "div": 6 + }, + "branch_attrs": {} + } + ] + } + ] + } +} \ No newline at end of file diff --git a/tests/functional/export_v2/nt_muts_1.json b/tests/functional/export_v2/nt_muts_1.json new file mode 100644 index 000000000..9f0662309 --- /dev/null +++ b/tests/functional/export_v2/nt_muts_1.json @@ -0,0 +1,21 @@ +{ + "annotations": { + "nuc": { + "end": 500, + "start": 1, + "strand": "+" + } + }, + "nodes": { + "internalDEF": { + "muts": [ + "A400T" + ] + }, + "not_in_tree": { + "muts": [ + "A100T" + ] + } + } +} diff --git a/tests/util_support/test_node_data_reader.py b/tests/util_support/test_node_data_reader.py index 889a862b8..837a8d695 100644 --- a/tests/util_support/test_node_data_reader.py +++ b/tests/util_support/test_node_data_reader.py @@ -60,7 +60,7 @@ def test_read_dict_nonuniformity(self, tmpdir, prepare_file): "file1.json", """ { - "nodes": {}, + "nodes": {"node_name": "some_value"}, "a": {} } """, @@ -69,7 +69,7 @@ def test_read_dict_nonuniformity(self, tmpdir, prepare_file): "file2.json", """ { - "nodes": {}, + "nodes": {"node_name": "some_other_value"}, "a": "nah" } """,