diff --git a/CHANGES.md b/CHANGES.md index 4cfce1219..5600e76f2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,10 @@ ## __NEXT__ +### Features + +* A helper function – `augur.subsample.get_parallelism` – has been added to optimize usage of `augur subsample` in Snakemake workflows. This is experimental and not yet part of the public API. [#1963][] (@victorlin) + ### Bug fixes * filter, merge: Fixed formatting of the error message shown when there are duplicate sequence ids. [#1954][] @victorlin @@ -9,6 +13,7 @@ [#1954]: https://github.com/nextstrain/augur/pull/1954 [#1956]: https://github.com/nextstrain/augur/pull/1956 +[#1963]: https://github.com/nextstrain/augur/pull/1963 ## 33.0.0 (26 January 2026) diff --git a/augur/subsample.py b/augur/subsample.py index e36ee1431..63a0d1841 100644 --- a/augur/subsample.py +++ b/augur/subsample.py @@ -233,6 +233,42 @@ def run(args: argparse.Namespace) -> None: sample.remove_output_strains() +def get_parallelism( + config_file: str, + config_section: list[str] | None = None, + limit: int | None = None +) -> int: + """Compute the degree of parallelism (i.e., optimal value for ``--nthreads``). + + Inspects the subsample config file and counts the entries under ``samples``; + that count, capped at ``limit`` when given and never below 1, is the value to + use for ``--nthreads``. Higher values will underutilize resources, while lower + values will not fully use available parallelism. + + Parameters + ---------- + config_file + Path to the subsample config file. + + config_section + Optional list of keys to navigate to a specific section of the config file. + + limit + Optional upper bound for return value. A limit below 1 still returns 1. + + Returns + ------- + int + Degree of parallelism (always at least 1). 
+ """ + schema_validator = load_json_schema("schema-subsample-config.json") + config = _parse_config(config_file, config_section, schema_validator) + if limit is None: + return max(1, len(config["samples"])) + else: + return max(1, min(limit, len(config["samples"]))) + + def get_referenced_files( config_file: str, config_section: Optional[List[str]] = None,