Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 44 additions & 30 deletions augur/clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,20 +107,11 @@ def read_in_clade_definitions(clade_file):
if not nx.is_directed_acyclic_graph(G):
raise ValueError(f"Clade definitions contain cycles {list(nx.simple_cycles(G))}")

# Traverse graph top down, so that children can inherit from parents and grandparents
# Topological sort ensures parents are visited before children
# islice is used to skip the root node (which has no parent)
for clade in islice(nx.topological_sort(G),1,None):
# Get name of parent clade
# G.predecessors(clade) returns iterator, thus next() necessary
# despite the fact that there should only be one parent
parent_clade = next(G.predecessors(clade))
# Inheritance from parents happens here
# Allele dict is initialized with alleles from parent
clades[clade] = clades[parent_clade].copy()
for _, row in df[(df.clade == clade) & (df.gene != 'clade')].iterrows():
# Overwrite of parent alleles is possible and happens here
clades[clade][(row.gene, int(row.site)-1)] = row.alt
simple_graph_viz(G)

# Don't use the clade inheritance to copy alleles, just read the defs in the TSV
for _, row in df[(df.gene != 'clade')].iterrows():
clades[row.clade][(row.gene, int(row.site)-1)] = row.alt

# Convert items from dict[str, dict[(str,int),str]] to dict[str, list[(str,int,str)]]
clades = {
Expand All @@ -136,7 +127,7 @@ def read_in_clade_definitions(clade_file):
if not len(clades.keys()):
raise AugurError(f"No clades were defined in {clade_file}")

return clades
return (clades, G)


def is_node_in_clade(clade_alleles, node, root_sequence):
Expand Down Expand Up @@ -194,7 +185,18 @@ def ensure_no_multiple_mutations(all_muts):
if multiples:
raise AugurError(f"Multiple mutations at the same position on a single branch were found: {', '.join(multiples)}")

def assign_clades(clade_designations, all_muts, tree, ref=None):
def simple_graph_viz(G):
print()
print("Clade relationships:")
out = defaultdict(list)
for u, v in G.edges():
out[u].append(v)
for u in G.nodes():
if children:=(", ".join(out[u]) or False):
print(f"\t{u if u!=0 else '[NO PARENT]'} → {children}")
print()

def assign_clades(clade_designations, clade_graph, all_muts, tree, ref=None):
'''
Ensures all nodes have an entry (or auspice doesn't display nicely), tests each node
to see if it's the first member of a clade (this is the label), and sets the membership of each
Expand All @@ -205,6 +207,7 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
----------
clade_designations : dict
clade definitions as :code:`{clade_name:[(gene, site, allele),...]}`
clade_graph: TKTK XXX
all_muts : dict
mutations in each node
tree : Bio.Phylo.BaseTree.Tree
Expand All @@ -219,8 +222,8 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
[1]: mapping of node to clade label (where applicable)
'''

clade_membership = {}
clade_labels = {}
clade_membership = {} # node name -> clade name (for all nodes in clade)
clade_labels = {} # node object -> clade name (the originating node only)
parents = get_parent_name_by_child_name_for_tree(tree)

# first pass to set all nodes to unassigned as precaution to ensure attribute is set
Expand Down Expand Up @@ -254,29 +257,40 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
node.sequences[gene]={}
node.sequences[gene][pos] = d


# second pass to assign basal nodes within each clade to the clade_labels dict
# if multiple nodes match, assign annotation to largest
# otherwise occasional unwanted cousin nodes get assigned the annotation
for clade_name, clade_alleles in clade_designations.items():
# Traverse the clades via a pre-order traversal of the graph constructed from their
# inheritance in the original definitions TSV
for clade_name in islice(nx.topological_sort(clade_graph),1,None):
clade_alleles = clade_designations[clade_name]
node_counts = []
for node in tree.find_clades(order = 'preorder'):
# If the clade inherits from another clade, only consider the subtree downstream from
# where that clade starts
if parent_clade:=next(iter(clade_graph.predecessors(clade_name))): # 0 if no parent
start_node = {v:k for k,v in clade_labels.items()}.get(parent_clade, None)
if start_node is None:
print(f"WARNING: clade {clade_name} not identified as it inherits from clade {parent_clade} which was itself not identified", file=sys.stderr)
continue
else:
start_node = tree

for node in start_node.find_clades(order = 'preorder'):
if is_node_in_clade(clade_alleles, node, ref):
node_counts.append(node)
sorted_nodes = sorted(node_counts, key=lambda x: x.leaf_count, reverse=True)
if len(sorted_nodes) > 0:
target_node = sorted_nodes[0]
clade_membership[target_node.name] = clade_name
clade_labels[target_node.name] = clade_name
clade_labels[target_node] = clade_name
else:
print(f"WARNING: clade {clade_name} not identified in tree", file=sys.stderr)

# third pass to propagate clade_membership to descendant nodes
# (until we encounter a node with its own clade_membership)
for node in tree.find_clades(order = 'preorder'):
for child in node:
if child.name not in clade_labels:
if child not in clade_labels:
clade_membership[child.name] = clade_membership[node.name]

return (clade_membership, clade_labels)
return (clade_membership, {n.name: label for n,label in clade_labels.items()})

def warn_if_clades_not_found(membership, clade_designations):
clades = set(clade_designations.keys())
Expand Down Expand Up @@ -346,7 +360,7 @@ def register_parser(parent_subparsers):
parser.add_argument('--mutations', required=True, metavar="NODE_DATA_JSON", nargs='+', action=ExtendOverwriteDefault, help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
parser.add_argument('--reference', nargs='+', action=ExtendOverwriteDefault, help=SUPPRESS)
parser.add_argument('--clades', required=True, metavar="TSV", type=str, help='TSV file containing clade definitions by amino-acid')
parser.add_argument('--output-node-data', type=str, metavar="NODE_DATA_JSON", help='name of JSON file to save clade assignments to')
parser.add_argument('--output-node-data', type=str, required=True, metavar="NODE_DATA_JSON", help='name of JSON file to save clade assignments to')
parser.add_argument('--membership-name', type=str, default="clade_membership", help='Key to store clade membership under; use "None" to not export this')
parser.add_argument('--label-name', type=str, default="clade", help='Key to store clade labels under; use "None" to not export this')
add_validation_arguments(parser)
Expand All @@ -367,8 +381,8 @@ def run(args):
# if this doesn't exist, it will complain but not error.
ref = get_reference_sequence_from_root_node(all_muts, tree.root.name)

clade_designations = read_in_clade_definitions(args.clades)
membership, labels = assign_clades(clade_designations, all_muts, tree, ref)
clade_designations, clade_graph = read_in_clade_definitions(args.clades)
membership, labels = assign_clades(clade_designations, clade_graph, all_muts, tree, ref)
warn_if_clades_not_found(membership, clade_designations)

membership_key= args.membership_name if args.membership_name.upper() != "NONE" else None
Expand Down
13 changes: 13 additions & 0 deletions tests/functional/clades/cram/inheritance.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Integration tests for augur clades.

$ source "$TESTDIR"/_setup.sh

$ ${AUGUR} clades \
> --tree "$TESTDIR/../data/tree.nwk" \
> --mutations "$TESTDIR/../data/inheritance-muts.json" \
> --clades "$TESTDIR/../data/inheritance-clades.tsv" \
> --output-node-data clades.json &>/dev/null

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/inheritance-clades.json" clades.json \
> --exclude-paths "root['generated_by']"
{}
3 changes: 2 additions & 1 deletion tests/functional/clades/cram/multiple-mutations-error.t
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Multiple mutations at the same position on a single branch are a fatal error
$ ${AUGUR} clades \
> --tree "$TESTDIR/../data/toy_tree.nwk" \
> --mutations "$TESTDIR/../data/toy_muts_multiple.json" \
> --clades "$TESTDIR/../data/toy_clades_nuc.tsv"
> --clades "$TESTDIR/../data/toy_clades_nuc.tsv" \
> --output-node-data "tmp.json"
ERROR: Multiple mutations at the same position on a single branch were found: Node A (nuc), Node AB (geneName)
[2]
74 changes: 74 additions & 0 deletions tests/functional/clades/data/inheritance-clades.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{
"branches": {
"NODE_0000002": {
"labels": {
"clade": "Y"
}
},
"NODE_0000005": {
"labels": {
"clade": "X"
}
}
},
"generated_by": {
"program": "augur",
"version": "31.2.1"
},
"nodes": {
"BRA/2016/FC_6706": {
"clade_membership": "X"
},
"COL/FLR_00008/2015": {
"clade_membership": "unassigned"
},
"Colombia/2016/ZC204Se": {
"clade_membership": "unassigned"
},
"DOM/2016/BB_0183": {
"clade_membership": "Y"
},
"EcEs062_16": {
"clade_membership": "Y"
},
"HND/2016/HU_ME59": {
"clade_membership": "X"
},
"NODE_0000001": {
"clade_membership": "unassigned"
},
"NODE_0000002": {
"clade_membership": "Y"
},
"NODE_0000003": {
"clade_membership": "X"
},
"NODE_0000004": {
"clade_membership": "X"
},
"NODE_0000005": {
"clade_membership": "X"
},
"NODE_0000006": {
"clade_membership": "unassigned"
},
"NODE_0000007": {
"clade_membership": "unassigned"
},
"NODE_0000008": {
"clade_membership": "unassigned"
},
"PAN/CDC_259359_V1_V3/2015": {
"clade_membership": "unassigned"
},
"PRVABC59": {
"clade_membership": "X"
},
"VEN/UF_1/2016": {
"clade_membership": "unassigned"
},
"ZKC2/2016": {
"clade_membership": "X"
}
}
}
9 changes: 9 additions & 0 deletions tests/functional/clades/data/inheritance-clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
clade gene site alt
# There's two options for clade X - each of the root's children
# match this. The upper clade "wins" as it has more descendants
X nuc 1 T
# Clade Y "inherits" from clade X with an additional mutation.
# Intuitively, this means Y must be a descendent of X, however
# this isn't the way `augur clades` is currently implemented
Y clade X
Y nuc 2 T
7 changes: 7 additions & 0 deletions tests/functional/clades/data/inheritance-muts.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"nodes": {
"NODE_0000005": {"muts": ["A1T"]},
"NODE_0000001": {"muts": ["A1T", "A2T"]},
"NODE_0000002": {"muts": ["A2T"]}
}
}
Loading