From 06f13fa3df7751b3f2124d1c420c26bb8c9fa40e Mon Sep 17 00:00:00 2001 From: Vlad Badelita Date: Wed, 16 Apr 2025 16:03:19 +0100 Subject: [PATCH 1/6] Added --alignment-args to augur align The new argument takes a list of arguments to pass to the alignment tool. This gives more flexibility in configuring mafft. --- augur/align.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/augur/align.py b/augur/align.py index 37551903f..14d9a9a0d 100644 --- a/augur/align.py +++ b/augur/align.py @@ -14,6 +14,10 @@ from .utils import nthreads_value from collections import defaultdict +DEFAULT_ARGS = { + "mafft": "--reorder --anysymbol --nomemsave --adjustdirection", +} + class AlignmentError(Exception): # TODO: this exception should potentially be renamed and made augur-wide # thus allowing any module to raise it and have the message printed & augur @@ -31,6 +35,8 @@ def register_arguments(parser): parser.add_argument('--nthreads', type=nthreads_value, default=1, help="number of threads to use; specifying the value 'auto' will cause the number of available CPU cores on your system, if determinable, to be used") parser.add_argument('--method', default='mafft', choices=["mafft"], help="alignment program to use") + parser.add_argument('--alignment-args', help="arguments to pass to the alignment program (except for number of threads), overriding defaults. " + + f"mafft defaults: '{DEFAULT_ARGS['mafft']}'") parser.add_argument('--reference-name', metavar="NAME", type=str, help="strip insertions relative to reference sequence; use if the reference is already in the input sequences") parser.add_argument('--reference-sequence', metavar="PATH", type=str, help="Add this reference sequence to the dataset & strip insertions relative to this. Use if the reference is NOT already in the input sequences") parser.add_argument('--remove-reference', action="store_true", default=False, help="remove reference sequence from the alignment") @@ -132,7 +138,7 @@ def run(args): # generate alignment command & run log = args.output + ".log" - cmd = generate_alignment_cmd(args.method, args.nthreads, existing_aln_fname, seqs_to_align_fname, args.output, log) + cmd = generate_alignment_cmd(args.method, args.alignment_args, args.nthreads, existing_aln_fname, seqs_to_align_fname, args.output, log) success = run_shell_command(cmd) if not success: raise AlignmentError(f"Error during alignment: please see the log file {log!r} for more details") @@ -248,17 +254,26 @@ def read_reference(ref_fname): "\n\tmake sure the file %s contains one sequence in genbank or fasta format"%ref_fname) return ref_seq -def generate_alignment_cmd(method, nthreads, existing_aln_fname, seqs_to_align_fname, aln_fname, log_fname): +def generate_alignment_cmd(method, alignment_args, nthreads, existing_aln_fname, seqs_to_align_fname, aln_fname, log_fname): + if method not in DEFAULT_ARGS: + raise AlignmentError('ERROR: alignment method %s not implemented'%method) + + if alignment_args is None: + args = DEFAULT_ARGS[method] + else: + args = alignment_args + if method=='mafft': if existing_aln_fname: - cmd = "mafft --add %s --keeplength --reorder --anysymbol --nomemsave --adjustdirection --thread %d %s 1> %s 2> %s"%(shquote(seqs_to_align_fname), nthreads, shquote(existing_aln_fname), shquote(aln_fname), shquote(log_fname)) + cmd = "mafft --add %s --keeplength %s %d %s 1> %s 2> %s"%(shquote(seqs_to_align_fname), args, nthreads, shquote(existing_aln_fname), shquote(aln_fname), shquote(log_fname)) else: - cmd = "mafft --reorder --anysymbol --nomemsave --adjustdirection --thread %d %s 1> %s 2> %s"%(nthreads, shquote(seqs_to_align_fname), shquote(aln_fname), shquote(log_fname)) + cmd = "mafft %s --thread %d %s 1> %s 2> %s"%(args, nthreads, shquote(seqs_to_align_fname), shquote(aln_fname), shquote(log_fname)) print("\nusing mafft to align via:\n\t" + cmd + " \n\n\tKatoh et al, Nucleic Acid Research, vol 30, issue 14" "\n\thttps://doi.org/10.1093%2Fnar%2Fgkf436\n") else: raise AlignmentError('ERROR: alignment method %s not implemented'%method) + return cmd From c1b04fb59681330ef1ab32c42ac3921264094376 Mon Sep 17 00:00:00 2001 From: Vlad Badelita Date: Wed, 16 Apr 2025 16:13:58 +0100 Subject: [PATCH 2/6] Updated changelog --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 533b2a7ea..319d6b097 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,7 @@ ## __NEXT__ +* align: Added `--alignment-args` options for passing arguments to the alignment program. [#1789] (@vbadelita) ## 30.0.0 (15 April 2025) From 951c9bea0d28d8142877544ad49087e9b5b05700 Mon Sep 17 00:00:00 2001 From: Vlad Badelita Date: Wed, 16 Apr 2025 16:31:28 +0100 Subject: [PATCH 3/6] Fixed existing unit tests - Found a bug, where I accidentally removed "threads" from the argument string --- augur/align.py | 6 +++--- tests/test_align.py | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/augur/align.py b/augur/align.py index 14d9a9a0d..a9cdece7d 100644 --- a/augur/align.py +++ b/augur/align.py @@ -138,7 +138,7 @@ def run(args): # generate alignment command & run log = args.output + ".log" - cmd = generate_alignment_cmd(args.method, args.alignment_args, args.nthreads, existing_aln_fname, seqs_to_align_fname, args.output, log) + cmd = generate_alignment_cmd(args.method, args.nthreads, existing_aln_fname, seqs_to_align_fname, args.output, log, alignment_args=args.alignment_args) success = run_shell_command(cmd) if not success: raise AlignmentError(f"Error during alignment: please see the log file {log!r} for more details") @@ -254,7 +254,7 @@ def read_reference(ref_fname): "\n\tmake sure the file %s contains one sequence in genbank or fasta format"%ref_fname) return ref_seq -def generate_alignment_cmd(method, alignment_args, nthreads, existing_aln_fname, seqs_to_align_fname, aln_fname, log_fname): +def generate_alignment_cmd(method, nthreads, existing_aln_fname, seqs_to_align_fname, aln_fname, log_fname, alignment_args): if method not in DEFAULT_ARGS: raise AlignmentError('ERROR: alignment method %s not implemented'%method) @@ -265,7 +265,7 @@ def generate_alignment_cmd(method, alignment_args, nthreads, existing_aln_fname, if method=='mafft': if existing_aln_fname: - cmd = "mafft --add %s --keeplength %s %d %s 1> %s 2> %s"%(shquote(seqs_to_align_fname), args, nthreads, shquote(existing_aln_fname), shquote(aln_fname), shquote(log_fname)) + cmd = "mafft --add %s --keeplength %s --thread %d %s 1> %s 2> %s"%(shquote(seqs_to_align_fname), args, nthreads, shquote(existing_aln_fname), shquote(aln_fname), shquote(log_fname)) else: cmd = "mafft %s --thread %d %s 1> %s 2> %s"%(args, nthreads, shquote(seqs_to_align_fname), shquote(aln_fname), shquote(log_fname)) print("\nusing mafft to align via:\n\t" + cmd + diff --git a/tests/test_align.py b/tests/test_align.py index 1f6c2eb83..c09e3460b 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -187,7 +187,7 @@ def test_prettify_alignment(self): def test_generate_alignment_cmd_non_mafft(self): with pytest.raises(align.AlignmentError): - assert align.generate_alignment_cmd('no-mafft', 1, None, None, None, None) + assert align.generate_alignment_cmd('no-mafft', 1, None, None, None, None, alignment_args=None) def test_generate_alignment_cmd_mafft_existing_aln_fname(self): existing_aln_fname = "existing_aln" @@ -199,7 +199,8 @@ def test_generate_alignment_cmd_mafft_existing_aln_fname(self): existing_aln_fname, seqs_to_align_fname, aln_fname, - log_fname) + log_fname, + alignment_args=None) expected = "mafft --add %s --keeplength --reorder --anysymbol --nomemsave --adjustdirection --thread %d %s 1> %s 2> %s" % (quote(seqs_to_align_fname), 1, quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) @@ -214,7 +215,8 @@ def test_generate_alignment_cmd_mafft_no_existing_aln_fname(self): None, seqs_to_align_fname, aln_fname, - log_fname) + log_fname, + alignment_args=None) expected = "mafft --reorder --anysymbol --nomemsave --adjustdirection --thread %d %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(aln_fname), quote(log_fname)) From 94cdea55f84732937edda19a3a78221906d2d98d Mon Sep 17 00:00:00 2001 From: Vlad Badelita Date: Wed, 16 Apr 2025 16:46:54 +0100 Subject: [PATCH 4/6] Added tests for new arg - tests just show the command line mafft call is what we expect - Clarified the docs that --keeplength is automatically added when an existing alignment is passed --- augur/align.py | 2 +- tests/test_align.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/augur/align.py b/augur/align.py index a9cdece7d..aa4097b9b 100644 --- a/augur/align.py +++ b/augur/align.py @@ -35,7 +35,7 @@ def register_arguments(parser): parser.add_argument('--nthreads', type=nthreads_value, default=1, help="number of threads to use; specifying the value 'auto' will cause the number of available CPU cores on your system, if determinable, to be used") parser.add_argument('--method', default='mafft', choices=["mafft"], help="alignment program to use") - parser.add_argument('--alignment-args', help="arguments to pass to the alignment program (except for number of threads), overriding defaults. " + + parser.add_argument('--alignment-args', help="arguments to pass to the alignment program (except for threads, keeplength if --existing-alignment is passed), overriding defaults. " + f"mafft defaults: '{DEFAULT_ARGS['mafft']}'") parser.add_argument('--reference-name', metavar="NAME", type=str, help="strip insertions relative to reference sequence; use if the reference is already in the input sequences") parser.add_argument('--reference-sequence', metavar="PATH", type=str, help="Add this reference sequence to the dataset & strip insertions relative to this. Use if the reference is NOT already in the input sequences") diff --git a/tests/test_align.py b/tests/test_align.py index c09e3460b..4a3db852f 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -221,6 +221,39 @@ def test_generate_alignment_cmd_mafft_no_existing_aln_fname(self): expected = "mafft --reorder --anysymbol --nomemsave --adjustdirection --thread %d %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(aln_fname), quote(log_fname)) assert result == expected + + def test_generate_alignment_cmd_mafft_custom_args_existing_aln_fname(self): + existing_aln_fname = "existing_aln" + seqs_to_align_fname = "seqs_to_align" + aln_fname = "aln_fname" + log_fname = "log_fname" + + result = align.generate_alignment_cmd("mafft", 1, + existing_aln_fname, + seqs_to_align_fname, + aln_fname, + log_fname, + alignment_args="--auto") + + expected = "mafft --add %s --keeplength --auto --thread %d %s 1> %s 2> %s" % (quote(seqs_to_align_fname), 1, quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) + + assert result == expected + + def test_generate_alignment_cmd_mafft_custom_args_no_existing_aln_fname(self): + seqs_to_align_fname = "seqs_to_align" + aln_fname = "aln_fname" + log_fname = "log_fname" + + result = align.generate_alignment_cmd("mafft", 1, + None, + seqs_to_align_fname, + aln_fname, + log_fname, + alignment_args="--auto --anysymbol") + + expected = "mafft --auto --anysymbol --thread %d %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(aln_fname), quote(log_fname)) + + assert result == expected def test_read_alignment(self): data_file = pathlib.Path('tests/data/align/test_aligned_sequences.fasta') From b251d0e2798d608a5cb967cc5685b710f8c28148 Mon Sep 17 00:00:00 2001 From: Vlad Badelita Date: Wed, 16 Apr 2025 20:15:40 +0100 Subject: [PATCH 5/6] Unified mafft command - I modified the mafft command string to use newer python format strings - Made both paths (with and witout existing file) use the same command, with slightly different arguments - Updated the tests slightly, as it was difficult to unify the commands without a bit of reordering --- augur/align.py | 14 ++++++++------ tests/test_align.py | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/augur/align.py b/augur/align.py index aa4097b9b..e6479472d 100644 --- a/augur/align.py +++ b/augur/align.py @@ -259,15 +259,17 @@ def generate_alignment_cmd(method, nthreads, existing_aln_fname, seqs_to_align_f raise AlignmentError('ERROR: alignment method %s not implemented'%method) if alignment_args is None: - args = DEFAULT_ARGS[method] - else: - args = alignment_args + alignment_args = DEFAULT_ARGS[method] if method=='mafft': + files_to_align = shquote(seqs_to_align_fname) if existing_aln_fname: - cmd = "mafft --add %s --keeplength %s --thread %d %s 1> %s 2> %s"%(shquote(seqs_to_align_fname), args, nthreads, shquote(existing_aln_fname), shquote(aln_fname), shquote(log_fname)) - else: - cmd = "mafft %s --thread %d %s 1> %s 2> %s"%(args, nthreads, shquote(seqs_to_align_fname), shquote(aln_fname), shquote(log_fname)) + # If there is an existing alignment, then seqs_to_align_fname becomes a parameter of --add + # and existing_aln_fname becomes the anonymous parameter + files_to_align = f"--add {shquote(seqs_to_align_fname)} {shquote(existing_aln_fname)}" + alignment_args = " ".join(["--keeplength", alignment_args]) + + cmd = f"mafft {alignment_args} --thread {nthreads} {files_to_align} 1> {shquote(aln_fname)} 2> {shquote(log_fname)}" print("\nusing mafft to align via:\n\t" + cmd + " \n\n\tKatoh et al, Nucleic Acid Research, vol 30, issue 14" "\n\thttps://doi.org/10.1093%2Fnar%2Fgkf436\n") diff --git a/tests/test_align.py b/tests/test_align.py index 4a3db852f..e59bcd5d7 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -202,7 +202,7 @@ def test_generate_alignment_cmd_mafft_existing_aln_fname(self): log_fname, alignment_args=None) - expected = "mafft --add %s --keeplength --reorder --anysymbol --nomemsave --adjustdirection --thread %d %s 1> %s 2> %s" % (quote(seqs_to_align_fname), 1, quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) + expected = "mafft --keeplength --reorder --anysymbol --nomemsave --adjustdirection --thread %d --add %s %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) assert result == expected @@ -235,7 +235,7 @@ def test_generate_alignment_cmd_mafft_custom_args_existing_aln_fname(self): log_fname, alignment_args="--auto") - expected = "mafft --add %s --keeplength --auto --thread %d %s 1> %s 2> %s" % (quote(seqs_to_align_fname), 1, quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) + expected = "mafft --keeplength --auto --thread %d --add %s %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) assert result == expected From 30044ea4cefcb64089c4b9d45b049a36352ac86f Mon Sep 17 00:00:00 2001 From: Vlad Badelita Date: Tue, 22 Apr 2025 11:54:56 +0100 Subject: [PATCH 6/6] Added --override-defaults-args parameter to augur align - Changed the behaviour to not override default arguments (and expand instead) unless the --override-defaults-args parameter is set to true. - Updated tests to reflect this change. --- augur/align.py | 15 ++++++++++----- tests/test_align.py | 28 ++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/augur/align.py b/augur/align.py index e6479472d..bf7a45543 100644 --- a/augur/align.py +++ b/augur/align.py @@ -35,8 +35,9 @@ def register_arguments(parser): parser.add_argument('--nthreads', type=nthreads_value, default=1, help="number of threads to use; specifying the value 'auto' will cause the number of available CPU cores on your system, if determinable, to be used") parser.add_argument('--method', default='mafft', choices=["mafft"], help="alignment program to use") - parser.add_argument('--alignment-args', help="arguments to pass to the alignment program (except for threads, keeplength if --existing-alignment is passed), overriding defaults. " + - f"mafft defaults: '{DEFAULT_ARGS['mafft']}'") + parser.add_argument('--alignment-args', help="arguments to pass to the alignment program (except for threads, keeplength if `--existing-alignment` is passed), overriding defaults. " + + f"mafft defaults: `{DEFAULT_ARGS['mafft']}`") + parser.add_argument('--override-default-args', action="store_true", help="override default alignment program arguments with the values provided by the user in `--alignment-args` instead of augmenting the existing defaults.") parser.add_argument('--reference-name', metavar="NAME", type=str, help="strip insertions relative to reference sequence; use if the reference is already in the input sequences") parser.add_argument('--reference-sequence', metavar="PATH", type=str, help="Add this reference sequence to the dataset & strip insertions relative to this. Use if the reference is NOT already in the input sequences") parser.add_argument('--remove-reference', action="store_true", default=False, help="remove reference sequence from the alignment") @@ -138,7 +139,7 @@ def run(args): # generate alignment command & run log = args.output + ".log" - cmd = generate_alignment_cmd(args.method, args.nthreads, existing_aln_fname, seqs_to_align_fname, args.output, log, alignment_args=args.alignment_args) + cmd = generate_alignment_cmd(args.method, args.nthreads, existing_aln_fname, seqs_to_align_fname, args.output, log, alignment_args=args.alignment_args, override_default_args=args.override_default_args) success = run_shell_command(cmd) if not success: raise AlignmentError(f"Error during alignment: please see the log file {log!r} for more details") @@ -254,13 +255,17 @@ def read_reference(ref_fname): "\n\tmake sure the file %s contains one sequence in genbank or fasta format"%ref_fname) return ref_seq -def generate_alignment_cmd(method, nthreads, existing_aln_fname, seqs_to_align_fname, aln_fname, log_fname, alignment_args): +def generate_alignment_cmd(method, nthreads, existing_aln_fname, seqs_to_align_fname, aln_fname, log_fname, alignment_args=None, override_default_args=False): if method not in DEFAULT_ARGS: raise AlignmentError('ERROR: alignment method %s not implemented'%method) if alignment_args is None: alignment_args = DEFAULT_ARGS[method] - + elif override_default_args: + alignment_args = alignment_args + else: + alignment_args = f"{DEFAULT_ARGS[method]} {alignment_args}" + if method=='mafft': files_to_align = shquote(seqs_to_align_fname) if existing_aln_fname: diff --git a/tests/test_align.py b/tests/test_align.py index e59bcd5d7..0d0957c58 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -233,9 +233,10 @@ def test_generate_alignment_cmd_mafft_custom_args_existing_aln_fname(self): seqs_to_align_fname, aln_fname, log_fname, - alignment_args="--auto") + alignment_args="--auto", + override_default_args=False) - expected = "mafft --keeplength --auto --thread %d --add %s %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) + expected = "mafft --keeplength --reorder --anysymbol --nomemsave --adjustdirection --auto --thread %d --add %s %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) assert result == expected @@ -249,12 +250,31 @@ def test_generate_alignment_cmd_mafft_custom_args_no_existing_aln_fname(self): seqs_to_align_fname, aln_fname, log_fname, - alignment_args="--auto --anysymbol") + alignment_args="--auto", + override_default_args=False) - expected = "mafft --auto --anysymbol --thread %d %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(aln_fname), quote(log_fname)) + expected = "mafft --reorder --anysymbol --nomemsave --adjustdirection --auto --thread %d %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(aln_fname), quote(log_fname)) assert result == expected + + def test_generate_alignment_cmd_mafft_custom_args_override_args(self): + existing_aln_fname = "existing_aln" + seqs_to_align_fname = "seqs_to_align" + aln_fname = "aln_fname" + log_fname = "log_fname" + + result = align.generate_alignment_cmd("mafft", 1, + existing_aln_fname, + seqs_to_align_fname, + aln_fname, + log_fname, + alignment_args="--auto", + override_default_args=True) + expected = "mafft --keeplength --auto --thread %d --add %s %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) + + assert result == expected + def test_read_alignment(self): data_file = pathlib.Path('tests/data/align/test_aligned_sequences.fasta') result = align.read_alignment(str(data_file.resolve()))