diff --git a/Cargo.lock b/Cargo.lock index 2dad7585b..2f6590377 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2154,6 +2154,7 @@ dependencies = [ "serde_repr", "serde_stacker", "serde_yaml", + "smart-default", "strsim", "strum 0.27.2", "strum_macros 0.27.2", @@ -3268,6 +3269,17 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "smart-default" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eb01866308440fc64d6c44d9e86c5cc17adfe33c4d6eed55da9145044d0ffc1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.112", +] + [[package]] name = "socket2" version = "0.5.10" diff --git a/Cargo.toml b/Cargo.toml index d73c802ee..bd790b297 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,6 +81,7 @@ serde_json = { version = "=1.0.148", features = ["preserve_order", "indexmap", " serde_repr = "=0.1.20" serde_stacker = { version = "=0.1.14" } serde_yaml = "=0.9.34" +smart-default = "=0.7.1" strsim = "=0.11.1" strum = "=0.27.2" strum_macros = "=0.27.2" diff --git a/docs/user/input-files/05-pathogen-config.md b/docs/user/input-files/05-pathogen-config.md index 7f6a89817..81e2e7546 100644 --- a/docs/user/input-files/05-pathogen-config.md +++ b/docs/user/input-files/05-pathogen-config.md @@ -220,6 +220,63 @@ In addition, a "default" value can be specified for amino acid mutations that ar If the score is only relevant for specific clades, you can specify which clades are to be ignored. +#### Nucleotide mutation pattern detection (`mutationPatterns`) + +Nextclade can detect named groups of private nucleotide substitutions. This is useful for reporting mutation patterns such as RNA editing signatures separately from the generic SNP cluster QC rule. + +Pattern detection is configured with `mutationPatterns.patterns`. Each pattern has an `id`, a display `name`, optional `description`, one or more `events`, and optional clustering parameters. The only supported event type is currently `nucSubstitution`. + +```json + "mutationPatterns": { + "patterns": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C", + "events": [ + { + "type": "nucSubstitution", + "ref": ["A"], + "qry": ["G"] + }, + { + "type": "nucSubstitution", + "ref": ["T"], + "qry": ["C"] + } + ], + "cluster": { + "windowSize": 50, + "cutoff": 3 + } + }, + { + "id": "apobec", + "name": "APOBEC-like cytosine deamination", + "description": "APOBEC-like cytosine deamination observed as G>A in a reference motif", + "events": [ + { + "type": "nucSubstitution", + "ref": ["G"], + "qry": ["A"], + "motifs": ["[CT]G[ACT]"] + } + ], + "cluster": { + "windowSize": 50, + "cutoff": 3 + } + } + ] + } +``` + +The `ref` and `qry` arrays use Nextclade nucleotide symbols, including IUPAC ambiguity codes such as `N`, `R`, and `Y`. A substitution matches when both the reference and query nucleotide match one of the configured symbols. + +The `motifs` array contains regular expressions matched against the reference sequence. A motif qualifies a substitution when the regex match interval contains the substituted reference position. Motifs are regular expressions over the reference letters, so use regex character classes such as `[CT]` instead of IUPAC ambiguity symbols when matching multiple reference letters inside a motif. + +The optional `cluster` object reports clusters within mutations matched by that pattern. It does not replace `qc.snpClusters`: `qc.snpClusters` remains the generic global SNP cluster QC rule over all private nucleotide substitutions. + #### Amino acid motif detection (`aaMotifs`) Nextclade can detect and report specific motifs in translated amino acid sequences. This feature is currently being used to highlight changes in glycosylation or cleavage sites, but the feature itself is generic. diff --git a/docs/user/output-files/04-results-tsv.md b/docs/user/output-files/04-results-tsv.md index ca4b1b8d6..802625c16 100644 --- a/docs/user/output-files/04-results-tsv.md +++ b/docs/user/output-files/04-results-tsv.md @@ -100,6 +100,15 @@ Every row in tabular output corresponds to 1 input sequence. The meaning of colu | qc.stopCodons.totalStopCodons | Total number of detected stop codons in "Stop codons" QC rule | non-negative integer | 2 | | qc.stopCodons.score | Score for "Stop codons" QC rule | float | 0.5 | | qc.stopCodons.status | Status for "Stop codons" QC rule | string: `good | mediocre |bad` | bad | +| mutationPatterns.id | Mutation pattern identifier, or multiple identifiers separated by `\|` | string | adar | +| mutationPatterns.name | Mutation pattern display name, or multiple names separated by `\|` | string | ADAR-like RNA editing | +| mutationPatterns.description | Mutation pattern description, or multiple descriptions separated by `\|` | string | ADAR-mediated A-to-I editing | +| mutationPatterns.counts.matches | Total number of events matched by mutation patterns | non-negative integer | 14 | +| mutationPatterns.counts.clustered | Total number of matched events that occur in mutation pattern clusters | non-negative integer | 9 | +| mutationPatterns.counts.clusters | Total number of mutation pattern clusters | non-negative integer | 2 | +| mutationPatterns.eventTypeCounts | Matched event type counts for mutation patterns | comma separated list of strings | nucSubstitution:A>G:5 | +| mutationPatterns.clusters | Matched mutation pattern cluster ranges and event counts | comma separated list of strings | 3003-3011:5 | +| mutationPatterns.clusterEvents | Events in matched mutation pattern clusters | comma separated list of strings | 3003-3011:nucSubstitution:A3003G | | isReverseComplement | Whether query sequences were transformed using reverse complement operation before alignment | boolean | false | | errors | List of errors during processing | comma separated list of strings | | | warnings | List of warnings during processing | comma separated list of strings | | @@ -123,4 +132,3 @@ The table can contain additional columns for every clade-like attribute defined > > > See descriptions of individual outputs and [Errors and warnings](./errors-and-warnings.md) section for more details. - diff --git a/packages/nextclade-cli/src/cli/nextclade_loop.rs b/packages/nextclade-cli/src/cli/nextclade_loop.rs index 2c28d1cae..e37b72c9c 100644 --- a/packages/nextclade-cli/src/cli/nextclade_loop.rs +++ b/packages/nextclade-cli/src/cli/nextclade_loop.rs @@ -138,6 +138,7 @@ pub fn nextclade_run(mut run_args: NextcladeRunArgs) -> Result<(), Report> { clade_node_attr_key_descs, phenotype_attr_descs, aa_motif_keys, + mutation_pattern_keys, ref_nodes, .. } = nextclade.get_initial_data(); @@ -148,6 +149,7 @@ pub fn nextclade_run(mut run_args: NextcladeRunArgs) -> Result<(), Report> { &phenotype_attr_descs, &ref_nodes, &aa_motif_keys, + &mutation_pattern_keys, &csv_column_config, &run_args.outputs, &nextclade.params, diff --git a/packages/nextclade-cli/src/cli/nextclade_ordered_writer.rs b/packages/nextclade-cli/src/cli/nextclade_ordered_writer.rs index 2d57270fe..1204a59b4 100644 --- a/packages/nextclade-cli/src/cli/nextclade_ordered_writer.rs +++ b/packages/nextclade-cli/src/cli/nextclade_ordered_writer.rs @@ -44,6 +44,7 @@ impl NextcladeOrderedWriter { phenotype_attr_key_desc: &[PhenotypeAttrDesc], ref_nodes: &AuspiceRefNodesDesc, aa_motifs_keys: &[String], + mutation_pattern_keys: &[String], csv_column_config: &CsvColumnConfig, output_params: &NextcladeRunOutputArgs, params: &NextcladeInputParams, @@ -73,6 +74,7 @@ impl NextcladeOrderedWriter { &phenotype_attr_keys, ref_nodes, aa_motifs_keys, + mutation_pattern_keys, csv_column_config, ) })?; @@ -85,6 +87,7 @@ impl NextcladeOrderedWriter { &phenotype_attr_keys, ref_nodes, aa_motifs_keys, + mutation_pattern_keys, csv_column_config, ) })?; diff --git a/packages/nextclade-schemas/input-pathogen-json.schema.json b/packages/nextclade-schemas/input-pathogen-json.schema.json index 0d3382078..81b4a85b0 100644 --- a/packages/nextclade-schemas/input-pathogen-json.schema.json +++ b/packages/nextclade-schemas/input-pathogen-json.schema.json @@ -53,9 +53,9 @@ }, "snpClusters": { "enabled": true, + "scoreWeight": 50.0, "windowSize": 100, - "clusterCutOff": 5, - "scoreWeight": 50.0 + "clusterCutOff": 5 }, "frameShifts": { "enabled": true, @@ -85,6 +85,62 @@ "scoreWeight": 75.0 } }, + "mutationPatterns": { + "patterns": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C", + "events": [ + { + "type": "nucSubstitution", + "ref": [ + "A", + "T" + ], + "qry": [ + "G", + "C" + ], + "motifs": [ + "A[ACGT]G", + "T[ACGT]C" + ] + } + ], + "cluster": { + "windowSize": 100, + "cutoff": 5 + } + }, + { + "id": "apobec", + "name": "APOBEC-like cytosine deamination", + "description": "APOBEC-like cytosine deamination observed as C>T and complementary G>A", + "events": [ + { + "type": "nucSubstitution", + "ref": [ + "C", + "G" + ], + "qry": [ + "T", + "A" + ], + "motifs": [ + "TC[AT]", + "[AT]GA" + ] + } + ], + "cluster": { + "windowSize": 50, + "cutoff": 4 + } + } + ] + }, "phenotypeData": [ { "name": "receptor_binding", @@ -219,6 +275,17 @@ } ] }, + "mutationPatterns": { + "description": "Mutation pattern analysis configuration. When present, detects private mutation patterns such as enzyme-associated clustered substitution signatures.", + "anyOf": [ + { + "$ref": "#/definitions/MutationPatternsConfig" + }, + { + "type": "null" + } + ] + }, "generalParams": { "description": "General analysis parameters (e.g. includeReference, inOrder, replaceUnknown).", "anyOf": [ @@ -507,9 +574,9 @@ }, "snpClusters": { "enabled": true, + "scoreWeight": 50.0, "windowSize": 100, - "clusterCutOff": 5, - "scoreWeight": 50.0 + "clusterCutOff": 5 }, "frameShifts": { "enabled": true, @@ -590,9 +657,9 @@ "description": "Configuration for the \"SNP clusters\" (C) rule", "default": { "enabled": false, + "scoreWeight": 0.0, "windowSize": 0, - "clusterCutOff": 0, - "scoreWeight": 0.0 + "clusterCutOff": 0 }, "allOf": [ { @@ -754,9 +821,9 @@ "examples": [ { "enabled": true, + "scoreWeight": 50.0, "windowSize": 100, - "clusterCutOff": 5, - "scoreWeight": 50.0 + "clusterCutOff": 5 } ], "type": "object", @@ -765,25 +832,25 @@ "default": false, "type": "boolean" }, + "scoreWeight": { + "description": "QC score added per detected cluster", + "default": 0.0, + "type": "number", + "format": "double" + }, "windowSize": { - "description": "Size of the sliding window (in nucleotides) used to count private mutations", + "description": "Size of the sliding nucleotide window for global SNP cluster detection", "default": 0, "type": "integer", "format": "uint", "minimum": 0.0 }, "clusterCutOff": { - "description": "Number of private mutations within a window above which the window is flagged as a cluster", + "description": "Number of substitutions within a window required to count as a global SNP cluster", "default": 0, "type": "integer", "format": "uint", "minimum": 0.0 - }, - "scoreWeight": { - "description": "QC score added per detected cluster", - "default": 0.0, - "type": "number", - "format": "double" } } }, @@ -942,6 +1009,272 @@ } } }, + "MutationPatternsConfig": { + "description": "Configuration for mutation pattern analysis. Detects private mutations matching biologically meaningful mutation type and reference-context rules. Pattern-specific clustering is independent from global `qc.snpClusters`.", + "examples": [ + { + "patterns": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C", + "events": [ + { + "type": "nucSubstitution", + "ref": [ + "A", + "T" + ], + "qry": [ + "G", + "C" + ], + "motifs": [ + "A[ACGT]G", + "T[ACGT]C" + ] + } + ], + "cluster": { + "windowSize": 100, + "cutoff": 5 + } + }, + { + "id": "apobec", + "name": "APOBEC-like cytosine deamination", + "description": "APOBEC-like cytosine deamination observed as C>T and complementary G>A", + "events": [ + { + "type": "nucSubstitution", + "ref": [ + "C", + "G" + ], + "qry": [ + "T", + "A" + ], + "motifs": [ + "TC[AT]", + "[AT]GA" + ] + } + ], + "cluster": { + "windowSize": 50, + "cutoff": 4 + } + } + ] + } + ], + "type": "object", + "properties": { + "patterns": { + "description": "Mutation patterns evaluated independently for every analyzed sequence.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternConfig" + } + } + } + }, + "MutationPatternConfig": { + "description": "Named mutation pattern: event filters, optional clustering, and display metadata.\n\nDataset authors can define multiple patterns to separate biologically different mutation processes, such as ADAR-like A-to-I editing and APOBEC-like cytosine deamination.", + "examples": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C", + "events": [ + { + "type": "nucSubstitution", + "ref": [ + "A", + "T" + ], + "qry": [ + "G", + "C" + ], + "motifs": [ + "A[ACGT]G", + "T[ACGT]C" + ] + } + ], + "cluster": { + "windowSize": 100, + "cutoff": 5 + } + } + ], + "type": "object", + "required": [ + "id", + "name" + ], + "properties": { + "id": { + "description": "Stable machine-readable identifier. This value appears in JSON and TSV output.", + "type": "string" + }, + "name": { + "description": "Human-readable name shown in Nextclade Web tooltips and reports.", + "type": "string" + }, + "description": { + "description": "Optional explanatory text shown together with the pattern result.", + "type": [ + "string", + "null" + ] + }, + "events": { + "description": "Event filters included in this pattern. If empty, the pattern matches all private nucleotide substitutions.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternEvent" + } + }, + "cluster": { + "description": "Optional pattern-local clustering rule. If omitted, Nextclade reports matches and type counts, but no clusters for this pattern.", + "anyOf": [ + { + "$ref": "#/definitions/MutationPatternClusterConfig" + }, + { + "type": "null" + } + ] + } + } + }, + "MutationPatternEvent": { + "description": "Mutation event filter in mutation pattern analysis.\n\nEach event selects one class of private mutations to include in a pattern. More event variants can be added without changing the surrounding `mutationPatterns.patterns[]` structure.", + "examples": [ + { + "type": "nucSubstitution", + "ref": [ + "A", + "T" + ], + "qry": [ + "G", + "C" + ], + "motifs": [ + "A[ACGT]G", + "T[ACGT]C" + ] + } + ], + "oneOf": [ + { + "description": "Nucleotide substitution event, selected by reference nucleotide, query nucleotide, and optional reference motifs.", + "examples": [ + { + "ref": [ + "A", + "T" + ], + "qry": [ + "G", + "C" + ], + "motifs": [ + "A[ACGT]G", + "T[ACGT]C" + ] + } + ], + "type": "object", + "required": [ + "qry", + "ref", + "type" + ], + "properties": { + "type": { + "type": "string", + "enum": [ + "nucSubstitution" + ] + }, + "ref": { + "description": "Reference nucleotides to match at the mutated position.", + "type": "array", + "items": { + "$ref": "#/definitions/Nuc" + } + }, + "qry": { + "description": "Query nucleotides to match at the mutated position.", + "type": "array", + "items": { + "$ref": "#/definitions/Nuc" + } + }, + "motifs": { + "description": "Regular expressions matched against the reference sequence. A mutation matches a motif when the regex match spans the mutated position. Use IUPAC nucleotide letters directly in the regex, for example `TC[AT]` for a TCW motif.", + "type": "array", + "items": { + "type": "string" + } + } + } + } + ] + }, + "Nuc": { + "description": "A nucleotide", + "type": "string", + "enum": [ + "T", + "A", + "W", + "C", + "Y", + "M", + "H", + "G", + "K", + "R", + "D", + "S", + "B", + "V", + "N", + "-" + ] + }, + "MutationPatternClusterConfig": { + "description": "Clustering rule applied to events matching one mutation pattern.\n\nClustering is pattern-local. It decides which matched events are reported as dense clusters in this pattern. It does not change the global `qc.snpClusters` rule unless the QC rule itself is configured separately.", + "examples": [ + { + "windowSize": 100, + "cutoff": 5 + } + ], + "type": "object", + "properties": { + "windowSize": { + "description": "Sliding nucleotide window size. A cluster is detected when `cutoff` matched events fall within this many reference nucleotides.", + "default": 100, + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "cutoff": { + "description": "Minimum number of matched events in one sliding window required to report a cluster.", + "default": 5, + "type": "integer", + "format": "uint", + "minimum": 0.0 + } + } + }, "NextcladeGeneralParamsOptional": { "type": "object", "properties": { diff --git a/packages/nextclade-schemas/input-pathogen-json.schema.yaml b/packages/nextclade-schemas/input-pathogen-json.schema.yaml index 1c94e542a..29edc8b19 100644 --- a/packages/nextclade-schemas/input-pathogen-json.schema.yaml +++ b/packages/nextclade-schemas/input-pathogen-json.schema.yaml @@ -41,9 +41,9 @@ examples: cutoff: 15.0 snpClusters: enabled: true + scoreWeight: 50.0 windowSize: 100 clusterCutOff: 5 - scoreWeight: 50.0 frameShifts: enabled: true ignoredFrameShifts: @@ -60,6 +60,42 @@ examples: - cdsName: ORF3a codon: 238 scoreWeight: 75.0 + mutationPatterns: + patterns: + - id: adar + name: ADAR-like RNA editing + description: ADAR-mediated A-to-I editing observed as A>G and complementary T>C + events: + - type: nucSubstitution + ref: + - A + - T + qry: + - G + - C + motifs: + - A[ACGT]G + - T[ACGT]C + cluster: + windowSize: 100 + cutoff: 5 + - id: apobec + name: APOBEC-like cytosine deamination + description: APOBEC-like cytosine deamination observed as C>T and complementary G>A + events: + - type: nucSubstitution + ref: + - C + - G + qry: + - T + - A + motifs: + - TC[AT] + - '[AT]GA' + cluster: + windowSize: 50 + cutoff: 4 phenotypeData: - name: receptor_binding nameFriendly: Receptor Binding @@ -139,6 +175,11 @@ properties: anyOf: - $ref: '#/definitions/QcConfig' - type: 'null' + mutationPatterns: + description: Mutation pattern analysis configuration. When present, detects private mutation patterns such as enzyme-associated clustered substitution signatures. + anyOf: + - $ref: '#/definitions/MutationPatternsConfig' + - type: 'null' generalParams: description: General analysis parameters (e.g. includeReference, inOrder, replaceUnknown). anyOf: @@ -324,9 +365,9 @@ definitions: cutoff: 15.0 snpClusters: enabled: true + scoreWeight: 50.0 windowSize: 100 clusterCutOff: 5 - scoreWeight: 50.0 frameShifts: enabled: true ignoredFrameShifts: @@ -378,9 +419,9 @@ definitions: description: Configuration for the "SNP clusters" (C) rule default: enabled: false + scoreWeight: 0.0 windowSize: 0 clusterCutOff: 0 - scoreWeight: 0.0 allOf: - $ref: '#/definitions/QcRulesConfigSnpClusters' frameShifts: @@ -495,31 +536,31 @@ definitions: description: Configuration for QC rule "SNP clusters" examples: - enabled: true + scoreWeight: 50.0 windowSize: 100 clusterCutOff: 5 - scoreWeight: 50.0 type: object properties: enabled: default: false type: boolean + scoreWeight: + description: QC score added per detected cluster + default: 0.0 + type: number + format: double windowSize: - description: Size of the sliding window (in nucleotides) used to count private mutations + description: Size of the sliding nucleotide window for global SNP cluster detection default: 0 type: integer format: uint minimum: 0.0 clusterCutOff: - description: Number of private mutations within a window above which the window is flagged as a cluster + description: Number of substitutions within a window required to count as a global SNP cluster default: 0 type: integer format: uint minimum: 0.0 - scoreWeight: - description: QC score added per detected cluster - default: 0.0 - type: number - format: double QcRulesConfigFrameShifts: description: Configuration for QC rule "frame shifts" examples: @@ -625,6 +666,195 @@ definitions: type: integer format: uint minimum: 0.0 + MutationPatternsConfig: + description: Configuration for mutation pattern analysis. Detects private mutations matching biologically meaningful mutation type and reference-context rules. Pattern-specific clustering is independent from global `qc.snpClusters`. + examples: + - patterns: + - id: adar + name: ADAR-like RNA editing + description: ADAR-mediated A-to-I editing observed as A>G and complementary T>C + events: + - type: nucSubstitution + ref: + - A + - T + qry: + - G + - C + motifs: + - A[ACGT]G + - T[ACGT]C + cluster: + windowSize: 100 + cutoff: 5 + - id: apobec + name: APOBEC-like cytosine deamination + description: APOBEC-like cytosine deamination observed as C>T and complementary G>A + events: + - type: nucSubstitution + ref: + - C + - G + qry: + - T + - A + motifs: + - TC[AT] + - '[AT]GA' + cluster: + windowSize: 50 + cutoff: 4 + type: object + properties: + patterns: + description: Mutation patterns evaluated independently for every analyzed sequence. + type: array + items: + $ref: '#/definitions/MutationPatternConfig' + MutationPatternConfig: + description: |- + Named mutation pattern: event filters, optional clustering, and display metadata. + + Dataset authors can define multiple patterns to separate biologically different mutation processes, such as ADAR-like A-to-I editing and APOBEC-like cytosine deamination. + examples: + - id: adar + name: ADAR-like RNA editing + description: ADAR-mediated A-to-I editing observed as A>G and complementary T>C + events: + - type: nucSubstitution + ref: + - A + - T + qry: + - G + - C + motifs: + - A[ACGT]G + - T[ACGT]C + cluster: + windowSize: 100 + cutoff: 5 + type: object + required: + - id + - name + properties: + id: + description: Stable machine-readable identifier. This value appears in JSON and TSV output. + type: string + name: + description: Human-readable name shown in Nextclade Web tooltips and reports. + type: string + description: + description: Optional explanatory text shown together with the pattern result. + type: + - string + - 'null' + events: + description: Event filters included in this pattern. If empty, the pattern matches all private nucleotide substitutions. + type: array + items: + $ref: '#/definitions/MutationPatternEvent' + cluster: + description: Optional pattern-local clustering rule. If omitted, Nextclade reports matches and type counts, but no clusters for this pattern. + anyOf: + - $ref: '#/definitions/MutationPatternClusterConfig' + - type: 'null' + MutationPatternEvent: + description: |- + Mutation event filter in mutation pattern analysis. + + Each event selects one class of private mutations to include in a pattern. More event variants can be added without changing the surrounding `mutationPatterns.patterns[]` structure. + examples: + - type: nucSubstitution + ref: + - A + - T + qry: + - G + - C + motifs: + - A[ACGT]G + - T[ACGT]C + oneOf: + - description: Nucleotide substitution event, selected by reference nucleotide, query nucleotide, and optional reference motifs. + examples: + - ref: + - A + - T + qry: + - G + - C + motifs: + - A[ACGT]G + - T[ACGT]C + type: object + required: + - qry + - ref + - type + properties: + type: + type: string + enum: + - nucSubstitution + ref: + description: Reference nucleotides to match at the mutated position. + type: array + items: + $ref: '#/definitions/Nuc' + qry: + description: Query nucleotides to match at the mutated position. + type: array + items: + $ref: '#/definitions/Nuc' + motifs: + description: Regular expressions matched against the reference sequence. A mutation matches a motif when the regex match spans the mutated position. Use IUPAC nucleotide letters directly in the regex, for example `TC[AT]` for a TCW motif. + type: array + items: + type: string + Nuc: + description: A nucleotide + type: string + enum: + - T + - A + - W + - C + - Y + - M + - H + - G + - K + - R + - D + - S + - B + - V + - N + - '-' + MutationPatternClusterConfig: + description: |- + Clustering rule applied to events matching one mutation pattern. + + Clustering is pattern-local. It decides which matched events are reported as dense clusters in this pattern. It does not change the global `qc.snpClusters` rule unless the QC rule itself is configured separately. + examples: + - windowSize: 100 + cutoff: 5 + type: object + properties: + windowSize: + description: Sliding nucleotide window size. A cluster is detected when `cutoff` matched events fall within this many reference nucleotides. + default: 100 + type: integer + format: uint + minimum: 0.0 + cutoff: + description: Minimum number of matched events in one sliding window required to report a cluster. + default: 5 + type: integer + format: uint + minimum: 0.0 NextcladeGeneralParamsOptional: type: object properties: diff --git a/packages/nextclade-schemas/nextclade-auspice-extensions.schema.json b/packages/nextclade-schemas/nextclade-auspice-extensions.schema.json index 139417c43..f0b216a27 100644 --- a/packages/nextclade-schemas/nextclade-auspice-extensions.schema.json +++ b/packages/nextclade-schemas/nextclade-auspice-extensions.schema.json @@ -387,9 +387,9 @@ }, "snpClusters": { "enabled": true, + "scoreWeight": 50.0, "windowSize": 100, - "clusterCutOff": 5, - "scoreWeight": 50.0 + "clusterCutOff": 5 }, "frameShifts": { "enabled": true, @@ -419,6 +419,62 @@ "scoreWeight": 75.0 } }, + "mutationPatterns": { + "patterns": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C", + "events": [ + { + "type": "nucSubstitution", + "ref": [ + "A", + "T" + ], + "qry": [ + "G", + "C" + ], + "motifs": [ + "A[ACGT]G", + "T[ACGT]C" + ] + } + ], + "cluster": { + "windowSize": 100, + "cutoff": 5 + } + }, + { + "id": "apobec", + "name": "APOBEC-like cytosine deamination", + "description": "APOBEC-like cytosine deamination observed as C>T and complementary G>A", + "events": [ + { + "type": "nucSubstitution", + "ref": [ + "C", + "G" + ], + "qry": [ + "T", + "A" + ], + "motifs": [ + "TC[AT]", + "[AT]GA" + ] + } + ], + "cluster": { + "windowSize": 50, + "cutoff": 4 + } + } + ] + }, "phenotypeData": [ { "name": "receptor_binding", @@ -553,6 +609,17 @@ } ] }, + "mutationPatterns": { + "description": "Mutation pattern analysis configuration. When present, detects private mutation patterns such as enzyme-associated clustered substitution signatures.", + "anyOf": [ + { + "$ref": "#/definitions/MutationPatternsConfig" + }, + { + "type": "null" + } + ] + }, "generalParams": { "description": "General analysis parameters (e.g. includeReference, inOrder, replaceUnknown).", "anyOf": [ @@ -841,9 +908,9 @@ }, "snpClusters": { "enabled": true, + "scoreWeight": 50.0, "windowSize": 100, - "clusterCutOff": 5, - "scoreWeight": 50.0 + "clusterCutOff": 5 }, "frameShifts": { "enabled": true, @@ -924,9 +991,9 @@ "description": "Configuration for the \"SNP clusters\" (C) rule", "default": { "enabled": false, + "scoreWeight": 0.0, "windowSize": 0, - "clusterCutOff": 0, - "scoreWeight": 0.0 + "clusterCutOff": 0 }, "allOf": [ { @@ -1088,9 +1155,9 @@ "examples": [ { "enabled": true, + "scoreWeight": 50.0, "windowSize": 100, - "clusterCutOff": 5, - "scoreWeight": 50.0 + "clusterCutOff": 5 } ], "type": "object", @@ -1099,25 +1166,25 @@ "default": false, "type": "boolean" }, + "scoreWeight": { + "description": "QC score added per detected cluster", + "default": 0.0, + "type": "number", + "format": "double" + }, "windowSize": { - "description": "Size of the sliding window (in nucleotides) used to count private mutations", + "description": "Size of the sliding nucleotide window for global SNP cluster detection", "default": 0, "type": "integer", "format": "uint", "minimum": 0.0 }, "clusterCutOff": { - "description": "Number of private mutations within a window above which the window is flagged as a cluster", + "description": "Number of substitutions within a window required to count as a global SNP cluster", "default": 0, "type": "integer", "format": "uint", "minimum": 0.0 - }, - "scoreWeight": { - "description": "QC score added per detected cluster", - "default": 0.0, - "type": "number", - "format": "double" } } }, @@ -1256,6 +1323,272 @@ } } }, + "MutationPatternsConfig": { + "description": "Configuration for mutation pattern analysis. Detects private mutations matching biologically meaningful mutation type and reference-context rules. Pattern-specific clustering is independent from global `qc.snpClusters`.", + "examples": [ + { + "patterns": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C", + "events": [ + { + "type": "nucSubstitution", + "ref": [ + "A", + "T" + ], + "qry": [ + "G", + "C" + ], + "motifs": [ + "A[ACGT]G", + "T[ACGT]C" + ] + } + ], + "cluster": { + "windowSize": 100, + "cutoff": 5 + } + }, + { + "id": "apobec", + "name": "APOBEC-like cytosine deamination", + "description": "APOBEC-like cytosine deamination observed as C>T and complementary G>A", + "events": [ + { + "type": "nucSubstitution", + "ref": [ + "C", + "G" + ], + "qry": [ + "T", + "A" + ], + "motifs": [ + "TC[AT]", + "[AT]GA" + ] + } + ], + "cluster": { + "windowSize": 50, + "cutoff": 4 + } + } + ] + } + ], + "type": "object", + "properties": { + "patterns": { + "description": "Mutation patterns evaluated independently for every analyzed sequence.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternConfig" + } + } + } + }, + "MutationPatternConfig": { + "description": "Named mutation pattern: event filters, optional clustering, and display metadata.\n\nDataset authors can define multiple patterns to separate biologically different mutation processes, such as ADAR-like A-to-I editing and APOBEC-like cytosine deamination.", + "examples": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C", + "events": [ + { + "type": "nucSubstitution", + "ref": [ + "A", + "T" + ], + "qry": [ + "G", + "C" + ], + "motifs": [ + "A[ACGT]G", + "T[ACGT]C" + ] + } + ], + "cluster": { + "windowSize": 100, + "cutoff": 5 + } + } + ], + "type": "object", + "required": [ + "id", + "name" + ], + "properties": { + "id": { + "description": "Stable machine-readable identifier. This value appears in JSON and TSV output.", + "type": "string" + }, + "name": { + "description": "Human-readable name shown in Nextclade Web tooltips and reports.", + "type": "string" + }, + "description": { + "description": "Optional explanatory text shown together with the pattern result.", + "type": [ + "string", + "null" + ] + }, + "events": { + "description": "Event filters included in this pattern. If empty, the pattern matches all private nucleotide substitutions.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternEvent" + } + }, + "cluster": { + "description": "Optional pattern-local clustering rule. If omitted, Nextclade reports matches and type counts, but no clusters for this pattern.", + "anyOf": [ + { + "$ref": "#/definitions/MutationPatternClusterConfig" + }, + { + "type": "null" + } + ] + } + } + }, + "MutationPatternEvent": { + "description": "Mutation event filter in mutation pattern analysis.\n\nEach event selects one class of private mutations to include in a pattern. More event variants can be added without changing the surrounding `mutationPatterns.patterns[]` structure.", + "examples": [ + { + "type": "nucSubstitution", + "ref": [ + "A", + "T" + ], + "qry": [ + "G", + "C" + ], + "motifs": [ + "A[ACGT]G", + "T[ACGT]C" + ] + } + ], + "oneOf": [ + { + "description": "Nucleotide substitution event, selected by reference nucleotide, query nucleotide, and optional reference motifs.", + "examples": [ + { + "ref": [ + "A", + "T" + ], + "qry": [ + "G", + "C" + ], + "motifs": [ + "A[ACGT]G", + "T[ACGT]C" + ] + } + ], + "type": "object", + "required": [ + "qry", + "ref", + "type" + ], + "properties": { + "type": { + "type": "string", + "enum": [ + "nucSubstitution" + ] + }, + "ref": { + "description": "Reference nucleotides to match at the mutated position.", + "type": "array", + "items": { + "$ref": "#/definitions/Nuc" + } + }, + "qry": { + "description": "Query nucleotides to match at the mutated position.", + "type": "array", + "items": { + "$ref": "#/definitions/Nuc" + } + }, + "motifs": { + "description": "Regular expressions matched against the reference sequence. A mutation matches a motif when the regex match spans the mutated position. Use IUPAC nucleotide letters directly in the regex, for example `TC[AT]` for a TCW motif.", + "type": "array", + "items": { + "type": "string" + } + } + } + } + ] + }, + "Nuc": { + "description": "A nucleotide", + "type": "string", + "enum": [ + "T", + "A", + "W", + "C", + "Y", + "M", + "H", + "G", + "K", + "R", + "D", + "S", + "B", + "V", + "N", + "-" + ] + }, + "MutationPatternClusterConfig": { + "description": "Clustering rule applied to events matching one mutation pattern.\n\nClustering is pattern-local. It decides which matched events are reported as dense clusters in this pattern. It does not change the global `qc.snpClusters` rule unless the QC rule itself is configured separately.", + "examples": [ + { + "windowSize": 100, + "cutoff": 5 + } + ], + "type": "object", + "properties": { + "windowSize": { + "description": "Sliding nucleotide window size. A cluster is detected when `cutoff` matched events fall within this many reference nucleotides.", + "default": 100, + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "cutoff": { + "description": "Minimum number of matched events in one sliding window required to report a cluster.", + "default": 5, + "type": "integer", + "format": "uint", + "minimum": 0.0 + } + } + }, "NextcladeGeneralParamsOptional": { "type": "object", "properties": { diff --git a/packages/nextclade-schemas/nextclade-auspice-extensions.schema.yaml b/packages/nextclade-schemas/nextclade-auspice-extensions.schema.yaml index ffdbb66b2..904d97de8 100644 --- a/packages/nextclade-schemas/nextclade-auspice-extensions.schema.yaml +++ b/packages/nextclade-schemas/nextclade-auspice-extensions.schema.yaml @@ -261,9 +261,9 @@ definitions: cutoff: 15.0 snpClusters: enabled: true + scoreWeight: 50.0 windowSize: 100 clusterCutOff: 5 - scoreWeight: 50.0 frameShifts: enabled: true ignoredFrameShifts: @@ -280,6 +280,42 @@ definitions: - cdsName: ORF3a codon: 238 scoreWeight: 75.0 + mutationPatterns: + patterns: + - id: adar + name: ADAR-like RNA editing + description: ADAR-mediated A-to-I editing observed as A>G and complementary T>C + events: + - type: nucSubstitution + ref: + - A + - T + qry: + - G + - C + motifs: + - A[ACGT]G + - T[ACGT]C + cluster: + windowSize: 100 + cutoff: 5 + - id: apobec + name: APOBEC-like cytosine deamination + description: APOBEC-like cytosine deamination observed as C>T and complementary G>A + events: + - type: nucSubstitution + ref: + - C + - G + qry: + - T + - A + motifs: + - TC[AT] + - '[AT]GA' + cluster: + windowSize: 50 + cutoff: 4 phenotypeData: - name: receptor_binding nameFriendly: Receptor Binding @@ -359,6 +395,11 @@ definitions: anyOf: - $ref: '#/definitions/QcConfig' - type: 'null' + mutationPatterns: + description: Mutation pattern analysis configuration. When present, detects private mutation patterns such as enzyme-associated clustered substitution signatures. + anyOf: + - $ref: '#/definitions/MutationPatternsConfig' + - type: 'null' generalParams: description: General analysis parameters (e.g. includeReference, inOrder, replaceUnknown). anyOf: @@ -543,9 +584,9 @@ definitions: cutoff: 15.0 snpClusters: enabled: true + scoreWeight: 50.0 windowSize: 100 clusterCutOff: 5 - scoreWeight: 50.0 frameShifts: enabled: true ignoredFrameShifts: @@ -597,9 +638,9 @@ definitions: description: Configuration for the "SNP clusters" (C) rule default: enabled: false + scoreWeight: 0.0 windowSize: 0 clusterCutOff: 0 - scoreWeight: 0.0 allOf: - $ref: '#/definitions/QcRulesConfigSnpClusters' frameShifts: @@ -714,31 +755,31 @@ definitions: description: Configuration for QC rule "SNP clusters" examples: - enabled: true + scoreWeight: 50.0 windowSize: 100 clusterCutOff: 5 - scoreWeight: 50.0 type: object properties: enabled: default: false type: boolean + scoreWeight: + description: QC score added per detected cluster + default: 0.0 + type: number + format: double windowSize: - description: Size of the sliding window (in nucleotides) used to count private mutations + description: Size of the sliding nucleotide window for global SNP cluster detection default: 0 type: integer format: uint minimum: 0.0 clusterCutOff: - description: Number of private mutations within a window above which the window is flagged as a cluster + description: Number of substitutions within a window required to count as a global SNP cluster default: 0 type: integer format: uint minimum: 0.0 - scoreWeight: - description: QC score added per detected cluster - default: 0.0 - type: number - format: double QcRulesConfigFrameShifts: description: Configuration for QC rule "frame shifts" examples: @@ -827,6 +868,195 @@ definitions: type: integer format: uint minimum: 0.0 + MutationPatternsConfig: + description: Configuration for mutation pattern analysis. Detects private mutations matching biologically meaningful mutation type and reference-context rules. Pattern-specific clustering is independent from global `qc.snpClusters`. + examples: + - patterns: + - id: adar + name: ADAR-like RNA editing + description: ADAR-mediated A-to-I editing observed as A>G and complementary T>C + events: + - type: nucSubstitution + ref: + - A + - T + qry: + - G + - C + motifs: + - A[ACGT]G + - T[ACGT]C + cluster: + windowSize: 100 + cutoff: 5 + - id: apobec + name: APOBEC-like cytosine deamination + description: APOBEC-like cytosine deamination observed as C>T and complementary G>A + events: + - type: nucSubstitution + ref: + - C + - G + qry: + - T + - A + motifs: + - TC[AT] + - '[AT]GA' + cluster: + windowSize: 50 + cutoff: 4 + type: object + properties: + patterns: + description: Mutation patterns evaluated independently for every analyzed sequence. + type: array + items: + $ref: '#/definitions/MutationPatternConfig' + MutationPatternConfig: + description: |- + Named mutation pattern: event filters, optional clustering, and display metadata. + + Dataset authors can define multiple patterns to separate biologically different mutation processes, such as ADAR-like A-to-I editing and APOBEC-like cytosine deamination. + examples: + - id: adar + name: ADAR-like RNA editing + description: ADAR-mediated A-to-I editing observed as A>G and complementary T>C + events: + - type: nucSubstitution + ref: + - A + - T + qry: + - G + - C + motifs: + - A[ACGT]G + - T[ACGT]C + cluster: + windowSize: 100 + cutoff: 5 + type: object + required: + - id + - name + properties: + id: + description: Stable machine-readable identifier. This value appears in JSON and TSV output. + type: string + name: + description: Human-readable name shown in Nextclade Web tooltips and reports. + type: string + description: + description: Optional explanatory text shown together with the pattern result. + type: + - string + - 'null' + events: + description: Event filters included in this pattern. If empty, the pattern matches all private nucleotide substitutions. + type: array + items: + $ref: '#/definitions/MutationPatternEvent' + cluster: + description: Optional pattern-local clustering rule. If omitted, Nextclade reports matches and type counts, but no clusters for this pattern. + anyOf: + - $ref: '#/definitions/MutationPatternClusterConfig' + - type: 'null' + MutationPatternEvent: + description: |- + Mutation event filter in mutation pattern analysis. + + Each event selects one class of private mutations to include in a pattern. More event variants can be added without changing the surrounding `mutationPatterns.patterns[]` structure. + examples: + - type: nucSubstitution + ref: + - A + - T + qry: + - G + - C + motifs: + - A[ACGT]G + - T[ACGT]C + oneOf: + - description: Nucleotide substitution event, selected by reference nucleotide, query nucleotide, and optional reference motifs. + examples: + - ref: + - A + - T + qry: + - G + - C + motifs: + - A[ACGT]G + - T[ACGT]C + type: object + required: + - qry + - ref + - type + properties: + type: + type: string + enum: + - nucSubstitution + ref: + description: Reference nucleotides to match at the mutated position. + type: array + items: + $ref: '#/definitions/Nuc' + qry: + description: Query nucleotides to match at the mutated position. + type: array + items: + $ref: '#/definitions/Nuc' + motifs: + description: Regular expressions matched against the reference sequence. A mutation matches a motif when the regex match spans the mutated position. Use IUPAC nucleotide letters directly in the regex, for example `TC[AT]` for a TCW motif. + type: array + items: + type: string + Nuc: + description: A nucleotide + type: string + enum: + - T + - A + - W + - C + - Y + - M + - H + - G + - K + - R + - D + - S + - B + - V + - N + - '-' + MutationPatternClusterConfig: + description: |- + Clustering rule applied to events matching one mutation pattern. + + Clustering is pattern-local. It decides which matched events are reported as dense clusters in this pattern. It does not change the global `qc.snpClusters` rule unless the QC rule itself is configured separately. + examples: + - windowSize: 100 + cutoff: 5 + type: object + properties: + windowSize: + description: Sliding nucleotide window size. A cluster is detected when `cutoff` matched events fall within this many reference nucleotides. + default: 100 + type: integer + format: uint + minimum: 0.0 + cutoff: + description: Minimum number of matched events in one sliding window required to report a cluster. + default: 5 + type: integer + format: uint + minimum: 0.0 NextcladeGeneralParamsOptional: type: object properties: diff --git a/packages/nextclade-schemas/output-json.schema.json b/packages/nextclade-schemas/output-json.schema.json index 2c4e8fe7e..507a3a019 100644 --- a/packages/nextclade-schemas/output-json.schema.json +++ b/packages/nextclade-schemas/output-json.schema.json @@ -399,6 +399,7 @@ "lenUnaligned", "missing", "missingCdses", + "mutationPatterns", "nearestNodeId", "nearestNodeName", "nonACGTNs", @@ -700,6 +701,14 @@ "$ref": "#/definitions/PrivateAaMutations" } }, + "mutationPatterns": { + "description": "Per-type mutation statistics, local reference context, and detected mutation clusters", + "allOf": [ + { + "$ref": "#/definitions/MutationPatternsResults" + } + ] + }, "cladeFounderInfo": { "description": "Mutations relative to the clade founder node", "anyOf": [ @@ -1664,6 +1673,633 @@ } } }, + "MutationPatternsResults": { + "description": "Mutation pattern analysis output. Contains per-pattern results.", + "examples": [ + { + "results": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "matches": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + }, + { + "type": "nucSubstitution", + "pos": 5033, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5032, + "end": 5035 + } + ] + } + ], + "eventTypeCounts": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 2 + } + ], + "clusters": [ + { + "start": 5003, + "end": 5033, + "count": 2, + "events": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + }, + { + "type": "nucSubstitution", + "pos": 5033, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5032, + "end": 5035 + } + ] + } + ], + "eventTypeCounts": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 2 + } + ] + } + ], + "counts": { + "matches": 2, + "clustered": 2, + "clusters": 1 + }, + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C" + } + ] + } + ], + "type": "object", + "properties": { + "results": { + "description": "Results for each configured mutation pattern. Empty when mutation pattern analysis is not configured and no global SNP cluster compatibility output is needed.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternResults" + } + } + } + }, + "MutationPatternResults": { + "description": "Results for a single mutation pattern.", + "examples": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "matches": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + }, + { + "type": "nucSubstitution", + "pos": 5033, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5032, + "end": 5035 + } + ] + } + ], + "eventTypeCounts": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 2 + } + ], + "clusters": [ + { + "start": 5003, + "end": 5033, + "count": 2, + "events": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + }, + { + "type": "nucSubstitution", + "pos": 5033, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5032, + "end": 5035 + } + ] + } + ], + "eventTypeCounts": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 2 + } + ] + } + ], + "counts": { + "matches": 2, + "clustered": 2, + "clusters": 1 + }, + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C" + } + ], + "type": "object", + "required": [ + "clusters", + "counts", + "eventTypeCounts", + "id", + "matches", + "name" + ], + "properties": { + "id": { + "description": "Stable machine-readable pattern identifier copied from pathogen.json.", + "type": "string" + }, + "name": { + "description": "Human-readable pattern name copied from pathogen.json.", + "type": "string" + }, + "matches": { + "description": "All private mutation events matching this pattern.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternEventMatch" + } + }, + "eventTypeCounts": { + "description": "Counts of matched event types across all `matches`.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternEventTypeCount" + } + }, + "clusters": { + "description": "Pattern-local clusters detected among `matches`.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternCluster" + } + }, + "counts": { + "description": "Summary counts for matches and clusters.", + "allOf": [ + { + "$ref": "#/definitions/MutationPatternCounts" + } + ] + }, + "description": { + "description": "Optional explanatory text copied from pathogen.json.", + "type": [ + "string", + "null" + ] + } + } + }, + "MutationPatternEventMatch": { + "description": "Matched mutation pattern event.", + "examples": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + } + ], + "oneOf": [ + { + "description": "Matched nucleotide substitution event.", + "examples": [ + { + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + } + ], + "type": "object", + "required": [ + "motifMatches", + "pos", + "qryNuc", + "refContext", + "refNuc", + "type" + ], + "properties": { + "type": { + "type": "string", + "enum": [ + "nucSubstitution" + ] + }, + "motifMatches": { + "description": "Motif matches that spanned the substituted position. Empty when the pattern event had no motif restriction.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternMotifMatch" + } + }, + "refContext": { + "description": "Reference nucleotide context around the substituted position.", + "type": "array", + "items": { + "$ref": "#/definitions/Nuc" + } + }, + "pos": { + "description": "0-based position in the reference sequence", + "allOf": [ + { + "$ref": "#/definitions/Position" + } + ] + }, + "refNuc": { + "description": "Nucleotide in the reference at this position", + "allOf": [ + { + "$ref": "#/definitions/Nuc" + } + ] + }, + "qryNuc": { + "description": "Nucleotide in the query at this position", + "allOf": [ + { + "$ref": "#/definitions/Nuc" + } + ] + } + } + } + ] + }, + "MutationPatternMotifMatch": { + "description": "Reference motif match that overlapped a mutation pattern event.", + "examples": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ], + "type": "object", + "required": [ + "end", + "motif", + "start" + ], + "properties": { + "motif": { + "description": "Regular expression from the pattern configuration that matched the reference sequence.", + "type": "string" + }, + "start": { + "description": "0-based first reference position included in the motif match.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "end": { + "description": "0-based position after the end of the motif match.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + } + } + }, + "MutationPatternEventTypeCount": { + "description": "Count of matched mutation pattern events of one event type.", + "examples": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 8 + } + ], + "oneOf": [ + { + "description": "Count of nucleotide substitutions with the same reference and query nucleotide.", + "examples": [ + { + "refNuc": "A", + "qryNuc": "G", + "count": 8 + } + ], + "type": "object", + "required": [ + "count", + "qryNuc", + "refNuc", + "type" + ], + "properties": { + "type": { + "type": "string", + "enum": [ + "nucSubstitution" + ] + }, + "refNuc": { + "description": "Reference nucleotide at the substituted position.", + "allOf": [ + { + "$ref": "#/definitions/Nuc" + } + ] + }, + "qryNuc": { + "description": "Query nucleotide at the substituted position.", + "allOf": [ + { + "$ref": "#/definitions/Nuc" + } + ] + }, + "count": { + "description": "Number of matching substitutions with this reference and query nucleotide pair.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + } + } + } + ] + }, + "MutationPatternCluster": { + "description": "Cluster of mutation pattern events detected within one sliding nucleotide window.", + "examples": [ + { + "start": 5003, + "end": 5033, + "count": 2, + "events": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + }, + { + "type": "nucSubstitution", + "pos": 5033, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5032, + "end": 5035 + } + ] + } + ], + "eventTypeCounts": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 2 + } + ] + } + ], + "type": "object", + "required": [ + "count", + "end", + "eventTypeCounts", + "events", + "start" + ], + "properties": { + "start": { + "description": "0-based first reference position included in this cluster.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "end": { + "description": "0-based last reference position included in this cluster.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "count": { + "description": "Number of matched events in this cluster.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "events": { + "description": "Matched events belonging to this cluster, sorted by reference position.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternEventMatch" + } + }, + "eventTypeCounts": { + "description": "Counts of event types within this cluster.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternEventTypeCount" + } + } + } + }, + "MutationPatternCounts": { + "description": "Summary counts for one mutation pattern.", + "examples": [ + { + "matches": 14, + "clustered": 14, + "clusters": 2 + } + ], + "type": "object", + "required": [ + "clustered", + "clusters", + "matches" + ], + "properties": { + "matches": { + "description": "Number of private mutation events matching the pattern.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "clustered": { + "description": "Number of matching events that belong to reported clusters.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "clusters": { + "description": "Number of clusters reported for this pattern.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + } + } + }, "CladeNodeAttrFounderInfo": { "description": "Information about a clade-like node attribute founder", "type": "object", diff --git a/packages/nextclade-schemas/output-json.schema.yaml b/packages/nextclade-schemas/output-json.schema.yaml index 9305c399d..a2b4f9b49 100644 --- a/packages/nextclade-schemas/output-json.schema.yaml +++ b/packages/nextclade-schemas/output-json.schema.yaml @@ -277,6 +277,7 @@ definitions: - lenUnaligned - missing - missingCdses + - mutationPatterns - nearestNodeId - nearestNodeName - nonACGTNs @@ -509,6 +510,10 @@ definitions: type: object additionalProperties: $ref: '#/definitions/PrivateAaMutations' + mutationPatterns: + description: Per-type mutation statistics, local reference context, and detected mutation clusters + allOf: + - $ref: '#/definitions/MutationPatternsResults' cladeFounderInfo: description: Mutations relative to the clade founder node anyOf: @@ -1178,6 +1183,423 @@ definitions: type: array items: type: string + MutationPatternsResults: + description: Mutation pattern analysis output. Contains per-pattern results. + examples: + - results: + - id: adar + name: ADAR-like RNA editing + matches: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + - type: nucSubstitution + pos: 5033 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5032 + end: 5035 + eventTypeCounts: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 2 + clusters: + - start: 5003 + end: 5033 + count: 2 + events: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + - type: nucSubstitution + pos: 5033 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5032 + end: 5035 + eventTypeCounts: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 2 + counts: + matches: 2 + clustered: 2 + clusters: 1 + description: ADAR-mediated A-to-I editing observed as A>G and complementary T>C + type: object + properties: + results: + description: Results for each configured mutation pattern. Empty when mutation pattern analysis is not configured and no global SNP cluster compatibility output is needed. + type: array + items: + $ref: '#/definitions/MutationPatternResults' + MutationPatternResults: + description: Results for a single mutation pattern. + examples: + - id: adar + name: ADAR-like RNA editing + matches: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + - type: nucSubstitution + pos: 5033 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5032 + end: 5035 + eventTypeCounts: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 2 + clusters: + - start: 5003 + end: 5033 + count: 2 + events: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + - type: nucSubstitution + pos: 5033 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5032 + end: 5035 + eventTypeCounts: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 2 + counts: + matches: 2 + clustered: 2 + clusters: 1 + description: ADAR-mediated A-to-I editing observed as A>G and complementary T>C + type: object + required: + - clusters + - counts + - eventTypeCounts + - id + - matches + - name + properties: + id: + description: Stable machine-readable pattern identifier copied from pathogen.json. + type: string + name: + description: Human-readable pattern name copied from pathogen.json. + type: string + matches: + description: All private mutation events matching this pattern. + type: array + items: + $ref: '#/definitions/MutationPatternEventMatch' + eventTypeCounts: + description: Counts of matched event types across all `matches`. + type: array + items: + $ref: '#/definitions/MutationPatternEventTypeCount' + clusters: + description: Pattern-local clusters detected among `matches`. + type: array + items: + $ref: '#/definitions/MutationPatternCluster' + counts: + description: Summary counts for matches and clusters. + allOf: + - $ref: '#/definitions/MutationPatternCounts' + description: + description: Optional explanatory text copied from pathogen.json. + type: + - string + - 'null' + MutationPatternEventMatch: + description: Matched mutation pattern event. + examples: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + oneOf: + - description: Matched nucleotide substitution event. + examples: + - pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + type: object + required: + - motifMatches + - pos + - qryNuc + - refContext + - refNuc + - type + properties: + type: + type: string + enum: + - nucSubstitution + motifMatches: + description: Motif matches that spanned the substituted position. Empty when the pattern event had no motif restriction. + type: array + items: + $ref: '#/definitions/MutationPatternMotifMatch' + refContext: + description: Reference nucleotide context around the substituted position. + type: array + items: + $ref: '#/definitions/Nuc' + pos: + description: 0-based position in the reference sequence + allOf: + - $ref: '#/definitions/Position' + refNuc: + description: Nucleotide in the reference at this position + allOf: + - $ref: '#/definitions/Nuc' + qryNuc: + description: Nucleotide in the query at this position + allOf: + - $ref: '#/definitions/Nuc' + MutationPatternMotifMatch: + description: Reference motif match that overlapped a mutation pattern event. + examples: + - motif: A[ACGT]G + start: 5002 + end: 5005 + type: object + required: + - end + - motif + - start + properties: + motif: + description: Regular expression from the pattern configuration that matched the reference sequence. + type: string + start: + description: 0-based first reference position included in the motif match. + type: integer + format: uint + minimum: 0.0 + end: + description: 0-based position after the end of the motif match. + type: integer + format: uint + minimum: 0.0 + MutationPatternEventTypeCount: + description: Count of matched mutation pattern events of one event type. + examples: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 8 + oneOf: + - description: Count of nucleotide substitutions with the same reference and query nucleotide. + examples: + - refNuc: A + qryNuc: G + count: 8 + type: object + required: + - count + - qryNuc + - refNuc + - type + properties: + type: + type: string + enum: + - nucSubstitution + refNuc: + description: Reference nucleotide at the substituted position. + allOf: + - $ref: '#/definitions/Nuc' + qryNuc: + description: Query nucleotide at the substituted position. + allOf: + - $ref: '#/definitions/Nuc' + count: + description: Number of matching substitutions with this reference and query nucleotide pair. + type: integer + format: uint + minimum: 0.0 + MutationPatternCluster: + description: Cluster of mutation pattern events detected within one sliding nucleotide window. + examples: + - start: 5003 + end: 5033 + count: 2 + events: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + - type: nucSubstitution + pos: 5033 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5032 + end: 5035 + eventTypeCounts: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 2 + type: object + required: + - count + - end + - eventTypeCounts + - events + - start + properties: + start: + description: 0-based first reference position included in this cluster. + type: integer + format: uint + minimum: 0.0 + end: + description: 0-based last reference position included in this cluster. + type: integer + format: uint + minimum: 0.0 + count: + description: Number of matched events in this cluster. + type: integer + format: uint + minimum: 0.0 + events: + description: Matched events belonging to this cluster, sorted by reference position. + type: array + items: + $ref: '#/definitions/MutationPatternEventMatch' + eventTypeCounts: + description: Counts of event types within this cluster. + type: array + items: + $ref: '#/definitions/MutationPatternEventTypeCount' + MutationPatternCounts: + description: Summary counts for one mutation pattern. + examples: + - matches: 14 + clustered: 14 + clusters: 2 + type: object + required: + - clustered + - clusters + - matches + properties: + matches: + description: Number of private mutation events matching the pattern. + type: integer + format: uint + minimum: 0.0 + clustered: + description: Number of matching events that belong to reported clusters. + type: integer + format: uint + minimum: 0.0 + clusters: + description: Number of clusters reported for this pattern. + type: integer + format: uint + minimum: 0.0 CladeNodeAttrFounderInfo: description: Information about a clade-like node attribute founder type: object diff --git a/packages/nextclade-schemas/output-ndjson.schema.json b/packages/nextclade-schemas/output-ndjson.schema.json index fbbd7d7ee..d17f29142 100644 --- a/packages/nextclade-schemas/output-ndjson.schema.json +++ b/packages/nextclade-schemas/output-ndjson.schema.json @@ -30,6 +30,7 @@ "lenUnaligned", "missing", "missingCdses", + "mutationPatterns", "nearestNodeId", "nearestNodeName", "nonACGTNs", @@ -331,6 +332,14 @@ "$ref": "#/definitions/PrivateAaMutations" } }, + "mutationPatterns": { + "description": "Per-type mutation statistics, local reference context, and detected mutation clusters", + "allOf": [ + { + "$ref": "#/definitions/MutationPatternsResults" + } + ] + }, "cladeFounderInfo": { "description": "Mutations relative to the clade founder node", "anyOf": [ @@ -1295,6 +1304,633 @@ } } }, + "MutationPatternsResults": { + "description": "Mutation pattern analysis output. Contains per-pattern results.", + "examples": [ + { + "results": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "matches": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + }, + { + "type": "nucSubstitution", + "pos": 5033, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5032, + "end": 5035 + } + ] + } + ], + "eventTypeCounts": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 2 + } + ], + "clusters": [ + { + "start": 5003, + "end": 5033, + "count": 2, + "events": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + }, + { + "type": "nucSubstitution", + "pos": 5033, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5032, + "end": 5035 + } + ] + } + ], + "eventTypeCounts": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 2 + } + ] + } + ], + "counts": { + "matches": 2, + "clustered": 2, + "clusters": 1 + }, + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C" + } + ] + } + ], + "type": "object", + "properties": { + "results": { + "description": "Results for each configured mutation pattern. Empty when mutation pattern analysis is not configured and no global SNP cluster compatibility output is needed.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternResults" + } + } + } + }, + "MutationPatternResults": { + "description": "Results for a single mutation pattern.", + "examples": [ + { + "id": "adar", + "name": "ADAR-like RNA editing", + "matches": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + }, + { + "type": "nucSubstitution", + "pos": 5033, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5032, + "end": 5035 + } + ] + } + ], + "eventTypeCounts": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 2 + } + ], + "clusters": [ + { + "start": 5003, + "end": 5033, + "count": 2, + "events": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + }, + { + "type": "nucSubstitution", + "pos": 5033, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5032, + "end": 5035 + } + ] + } + ], + "eventTypeCounts": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 2 + } + ] + } + ], + "counts": { + "matches": 2, + "clustered": 2, + "clusters": 1 + }, + "description": "ADAR-mediated A-to-I editing observed as A>G and complementary T>C" + } + ], + "type": "object", + "required": [ + "clusters", + "counts", + "eventTypeCounts", + "id", + "matches", + "name" + ], + "properties": { + "id": { + "description": "Stable machine-readable pattern identifier copied from pathogen.json.", + "type": "string" + }, + "name": { + "description": "Human-readable pattern name copied from pathogen.json.", + "type": "string" + }, + "matches": { + "description": "All private mutation events matching this pattern.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternEventMatch" + } + }, + "eventTypeCounts": { + "description": "Counts of matched event types across all `matches`.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternEventTypeCount" + } + }, + "clusters": { + "description": "Pattern-local clusters detected among `matches`.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternCluster" + } + }, + "counts": { + "description": "Summary counts for matches and clusters.", + "allOf": [ + { + "$ref": "#/definitions/MutationPatternCounts" + } + ] + }, + "description": { + "description": "Optional explanatory text copied from pathogen.json.", + "type": [ + "string", + "null" + ] + } + } + }, + "MutationPatternEventMatch": { + "description": "Matched mutation pattern event.", + "examples": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + } + ], + "oneOf": [ + { + "description": "Matched nucleotide substitution event.", + "examples": [ + { + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + } + ], + "type": "object", + "required": [ + "motifMatches", + "pos", + "qryNuc", + "refContext", + "refNuc", + "type" + ], + "properties": { + "type": { + "type": "string", + "enum": [ + "nucSubstitution" + ] + }, + "motifMatches": { + "description": "Motif matches that spanned the substituted position. Empty when the pattern event had no motif restriction.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternMotifMatch" + } + }, + "refContext": { + "description": "Reference nucleotide context around the substituted position.", + "type": "array", + "items": { + "$ref": "#/definitions/Nuc" + } + }, + "pos": { + "description": "0-based position in the reference sequence", + "allOf": [ + { + "$ref": "#/definitions/Position" + } + ] + }, + "refNuc": { + "description": "Nucleotide in the reference at this position", + "allOf": [ + { + "$ref": "#/definitions/Nuc" + } + ] + }, + "qryNuc": { + "description": "Nucleotide in the query at this position", + "allOf": [ + { + "$ref": "#/definitions/Nuc" + } + ] + } + } + } + ] + }, + "MutationPatternMotifMatch": { + "description": "Reference motif match that overlapped a mutation pattern event.", + "examples": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ], + "type": "object", + "required": [ + "end", + "motif", + "start" + ], + "properties": { + "motif": { + "description": "Regular expression from the pattern configuration that matched the reference sequence.", + "type": "string" + }, + "start": { + "description": "0-based first reference position included in the motif match.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "end": { + "description": "0-based position after the end of the motif match.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + } + } + }, + "MutationPatternEventTypeCount": { + "description": "Count of matched mutation pattern events of one event type.", + "examples": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 8 + } + ], + "oneOf": [ + { + "description": "Count of nucleotide substitutions with the same reference and query nucleotide.", + "examples": [ + { + "refNuc": "A", + "qryNuc": "G", + "count": 8 + } + ], + "type": "object", + "required": [ + "count", + "qryNuc", + "refNuc", + "type" + ], + "properties": { + "type": { + "type": "string", + "enum": [ + "nucSubstitution" + ] + }, + "refNuc": { + "description": "Reference nucleotide at the substituted position.", + "allOf": [ + { + "$ref": "#/definitions/Nuc" + } + ] + }, + "qryNuc": { + "description": "Query nucleotide at the substituted position.", + "allOf": [ + { + "$ref": "#/definitions/Nuc" + } + ] + }, + "count": { + "description": "Number of matching substitutions with this reference and query nucleotide pair.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + } + } + } + ] + }, + "MutationPatternCluster": { + "description": "Cluster of mutation pattern events detected within one sliding nucleotide window.", + "examples": [ + { + "start": 5003, + "end": 5033, + "count": 2, + "events": [ + { + "type": "nucSubstitution", + "pos": 5003, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5002, + "end": 5005 + } + ] + }, + { + "type": "nucSubstitution", + "pos": 5033, + "refNuc": "A", + "qryNuc": "G", + "refContext": [ + "A", + "A", + "G" + ], + "motifMatches": [ + { + "motif": "A[ACGT]G", + "start": 5032, + "end": 5035 + } + ] + } + ], + "eventTypeCounts": [ + { + "type": "nucSubstitution", + "refNuc": "A", + "qryNuc": "G", + "count": 2 + } + ] + } + ], + "type": "object", + "required": [ + "count", + "end", + "eventTypeCounts", + "events", + "start" + ], + "properties": { + "start": { + "description": "0-based first reference position included in this cluster.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "end": { + "description": "0-based last reference position included in this cluster.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "count": { + "description": "Number of matched events in this cluster.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "events": { + "description": "Matched events belonging to this cluster, sorted by reference position.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternEventMatch" + } + }, + "eventTypeCounts": { + "description": "Counts of event types within this cluster.", + "type": "array", + "items": { + "$ref": "#/definitions/MutationPatternEventTypeCount" + } + } + } + }, + "MutationPatternCounts": { + "description": "Summary counts for one mutation pattern.", + "examples": [ + { + "matches": 14, + "clustered": 14, + "clusters": 2 + } + ], + "type": "object", + "required": [ + "clustered", + "clusters", + "matches" + ], + "properties": { + "matches": { + "description": "Number of private mutation events matching the pattern.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "clustered": { + "description": "Number of matching events that belong to reported clusters.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + }, + "clusters": { + "description": "Number of clusters reported for this pattern.", + "type": "integer", + "format": "uint", + "minimum": 0.0 + } + } + }, "CladeNodeAttrFounderInfo": { "description": "Information about a clade-like node attribute founder", "type": "object", diff --git a/packages/nextclade-schemas/output-ndjson.schema.yaml b/packages/nextclade-schemas/output-ndjson.schema.yaml index b5d3ba19d..9e874e63b 100644 --- a/packages/nextclade-schemas/output-ndjson.schema.yaml +++ b/packages/nextclade-schemas/output-ndjson.schema.yaml @@ -29,6 +29,7 @@ required: - lenUnaligned - missing - missingCdses +- mutationPatterns - nearestNodeId - nearestNodeName - nonACGTNs @@ -261,6 +262,10 @@ properties: type: object additionalProperties: $ref: '#/definitions/PrivateAaMutations' + mutationPatterns: + description: Per-type mutation statistics, local reference context, and detected mutation clusters + allOf: + - $ref: '#/definitions/MutationPatternsResults' cladeFounderInfo: description: Mutations relative to the clade founder node anyOf: @@ -931,6 +936,423 @@ definitions: type: array items: type: string + MutationPatternsResults: + description: Mutation pattern analysis output. Contains per-pattern results. + examples: + - results: + - id: adar + name: ADAR-like RNA editing + matches: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + - type: nucSubstitution + pos: 5033 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5032 + end: 5035 + eventTypeCounts: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 2 + clusters: + - start: 5003 + end: 5033 + count: 2 + events: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + - type: nucSubstitution + pos: 5033 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5032 + end: 5035 + eventTypeCounts: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 2 + counts: + matches: 2 + clustered: 2 + clusters: 1 + description: ADAR-mediated A-to-I editing observed as A>G and complementary T>C + type: object + properties: + results: + description: Results for each configured mutation pattern. Empty when mutation pattern analysis is not configured and no global SNP cluster compatibility output is needed. + type: array + items: + $ref: '#/definitions/MutationPatternResults' + MutationPatternResults: + description: Results for a single mutation pattern. + examples: + - id: adar + name: ADAR-like RNA editing + matches: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + - type: nucSubstitution + pos: 5033 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5032 + end: 5035 + eventTypeCounts: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 2 + clusters: + - start: 5003 + end: 5033 + count: 2 + events: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + - type: nucSubstitution + pos: 5033 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5032 + end: 5035 + eventTypeCounts: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 2 + counts: + matches: 2 + clustered: 2 + clusters: 1 + description: ADAR-mediated A-to-I editing observed as A>G and complementary T>C + type: object + required: + - clusters + - counts + - eventTypeCounts + - id + - matches + - name + properties: + id: + description: Stable machine-readable pattern identifier copied from pathogen.json. + type: string + name: + description: Human-readable pattern name copied from pathogen.json. + type: string + matches: + description: All private mutation events matching this pattern. + type: array + items: + $ref: '#/definitions/MutationPatternEventMatch' + eventTypeCounts: + description: Counts of matched event types across all `matches`. + type: array + items: + $ref: '#/definitions/MutationPatternEventTypeCount' + clusters: + description: Pattern-local clusters detected among `matches`. + type: array + items: + $ref: '#/definitions/MutationPatternCluster' + counts: + description: Summary counts for matches and clusters. + allOf: + - $ref: '#/definitions/MutationPatternCounts' + description: + description: Optional explanatory text copied from pathogen.json. + type: + - string + - 'null' + MutationPatternEventMatch: + description: Matched mutation pattern event. + examples: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + oneOf: + - description: Matched nucleotide substitution event. + examples: + - pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + type: object + required: + - motifMatches + - pos + - qryNuc + - refContext + - refNuc + - type + properties: + type: + type: string + enum: + - nucSubstitution + motifMatches: + description: Motif matches that spanned the substituted position. Empty when the pattern event had no motif restriction. + type: array + items: + $ref: '#/definitions/MutationPatternMotifMatch' + refContext: + description: Reference nucleotide context around the substituted position. + type: array + items: + $ref: '#/definitions/Nuc' + pos: + description: 0-based position in the reference sequence + allOf: + - $ref: '#/definitions/Position' + refNuc: + description: Nucleotide in the reference at this position + allOf: + - $ref: '#/definitions/Nuc' + qryNuc: + description: Nucleotide in the query at this position + allOf: + - $ref: '#/definitions/Nuc' + MutationPatternMotifMatch: + description: Reference motif match that overlapped a mutation pattern event. + examples: + - motif: A[ACGT]G + start: 5002 + end: 5005 + type: object + required: + - end + - motif + - start + properties: + motif: + description: Regular expression from the pattern configuration that matched the reference sequence. + type: string + start: + description: 0-based first reference position included in the motif match. + type: integer + format: uint + minimum: 0.0 + end: + description: 0-based position after the end of the motif match. + type: integer + format: uint + minimum: 0.0 + MutationPatternEventTypeCount: + description: Count of matched mutation pattern events of one event type. + examples: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 8 + oneOf: + - description: Count of nucleotide substitutions with the same reference and query nucleotide. + examples: + - refNuc: A + qryNuc: G + count: 8 + type: object + required: + - count + - qryNuc + - refNuc + - type + properties: + type: + type: string + enum: + - nucSubstitution + refNuc: + description: Reference nucleotide at the substituted position. + allOf: + - $ref: '#/definitions/Nuc' + qryNuc: + description: Query nucleotide at the substituted position. + allOf: + - $ref: '#/definitions/Nuc' + count: + description: Number of matching substitutions with this reference and query nucleotide pair. + type: integer + format: uint + minimum: 0.0 + MutationPatternCluster: + description: Cluster of mutation pattern events detected within one sliding nucleotide window. + examples: + - start: 5003 + end: 5033 + count: 2 + events: + - type: nucSubstitution + pos: 5003 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5002 + end: 5005 + - type: nucSubstitution + pos: 5033 + refNuc: A + qryNuc: G + refContext: + - A + - A + - G + motifMatches: + - motif: A[ACGT]G + start: 5032 + end: 5035 + eventTypeCounts: + - type: nucSubstitution + refNuc: A + qryNuc: G + count: 2 + type: object + required: + - count + - end + - eventTypeCounts + - events + - start + properties: + start: + description: 0-based first reference position included in this cluster. + type: integer + format: uint + minimum: 0.0 + end: + description: 0-based last reference position included in this cluster. + type: integer + format: uint + minimum: 0.0 + count: + description: Number of matched events in this cluster. + type: integer + format: uint + minimum: 0.0 + events: + description: Matched events belonging to this cluster, sorted by reference position. + type: array + items: + $ref: '#/definitions/MutationPatternEventMatch' + eventTypeCounts: + description: Counts of event types within this cluster. + type: array + items: + $ref: '#/definitions/MutationPatternEventTypeCount' + MutationPatternCounts: + description: Summary counts for one mutation pattern. + examples: + - matches: 14 + clustered: 14 + clusters: 2 + type: object + required: + - clustered + - clusters + - matches + properties: + matches: + description: Number of private mutation events matching the pattern. + type: integer + format: uint + minimum: 0.0 + clustered: + description: Number of matching events that belong to reported clusters. + type: integer + format: uint + minimum: 0.0 + clusters: + description: Number of clusters reported for this pattern. + type: integer + format: uint + minimum: 0.0 CladeNodeAttrFounderInfo: description: Information about a clade-like node attribute founder type: object diff --git a/packages/nextclade-web/src/components/Export/ExportTabColumnConfig.tsx b/packages/nextclade-web/src/components/Export/ExportTabColumnConfig.tsx index 423455908..c3fcfcd3c 100644 --- a/packages/nextclade-web/src/components/Export/ExportTabColumnConfig.tsx +++ b/packages/nextclade-web/src/components/Export/ExportTabColumnConfig.tsx @@ -137,6 +137,36 @@ export function ExportTabColumnConfig({ setActiveTabId }: { setActiveTabId(id: s [onRelMutsColumnsStateChange, relMutsColumnsState, t], ) + const mutPatternsColumnsState = useMemo( + () => csvColumnConfig?.includeMutPatterns ?? false, + [csvColumnConfig?.includeMutPatterns], + ) + + const onMutPatternsColumnsStateChange = useCallback(() => { + setCsvColumnConfig((config) => + config + ? { + ...config, + includeMutPatterns: !config.includeMutPatterns, + } + : undefined, + ) + }, [setCsvColumnConfig]) + + const mutPatterns = useMemo( + () => ( + + + + + {t('Mutation patterns')} + + + + ), + [mutPatternsColumnsState, onMutPatternsColumnsStateChange, t], + ) + const dynamicColumnsState = useMemo(() => csvColumnConfig?.includeDynamic ?? false, [csvColumnConfig?.includeDynamic]) const onDynamicColumnsStateChange = useCallback(() => { @@ -179,6 +209,7 @@ export function ExportTabColumnConfig({ setActiveTabId }: { setActiveTabId(id: s {categories} + diff --git a/packages/nextclade-web/src/components/Results/ColumnMutations.tsx b/packages/nextclade-web/src/components/Results/ColumnMutations.tsx index 561b3df83..ac16154a8 100644 --- a/packages/nextclade-web/src/components/Results/ColumnMutations.tsx +++ b/packages/nextclade-web/src/components/Results/ColumnMutations.tsx @@ -1,16 +1,129 @@ import React, { useCallback, useMemo, useState } from 'react' import { useRecoilValue } from 'recoil' +import styled from 'styled-components' import { REF_NODE_CLADE_FOUNDER, REF_NODE_PARENT, REF_NODE_ROOT } from 'src/constants' import { findCladeNodeAttrFounderInfo, getAaMutations, getNucMutations } from 'src/helpers/relativeMuts' import { viewedDatasetNameAtom } from 'src/state/dataset.state' import { currentRefNodeNameAtom, refNodesAtom } from 'src/state/results.state' import type { ColumnCladeProps } from 'src/components/Results/ColumnClade' +import type { AnalysisResult } from 'src/types' import { getSafeId } from 'src/helpers/getSafeId' -import { TableSlim } from 'src/components/Common/TableSlim' import { Tooltip } from 'src/components/Results/Tooltip' import { ListOfNucMuts } from 'src/components/Results/ListOfNucMuts' import { ListOfAaMuts } from 'src/components/Results/ListOfAaMuts' +import { NucleotideMutationBadge } from 'src/components/Common/MutationBadge' import { useTranslationSafe } from 'src/helpers/useTranslationSafe' +import { TableSlim } from 'src/components/Common/TableSlim' +import type { MutationPatternEventMatch } from 'src/gen/_SchemaRoot' + +const PatternList = styled.div` + border-top: 1px solid #dee2e6; + margin-top: 0.5rem; + padding-top: 0.5rem; +` + +const PatternSection = styled.section` + border-top: 1px solid #343a40; + margin-top: 0.7rem; + padding-top: 0.6rem; + + &:first-child { + border-top: none; + margin-top: 0; + padding-top: 0; + } +` + +const PatternName = styled.div` + font-weight: 700; +` + +const PatternDescription = styled.div` + color: ${(props) => props.theme.gray600}; + font-size: 0.85em; + margin-bottom: 0.35rem; +` + +const ClusterCard = styled.div` + background: rgba(255, 140, 0, 0.06); + border: 1px solid rgba(255, 140, 0, 0.2); + border-radius: 3px; + padding: 0.25rem 0.4rem; + + &:not(:last-child) { + margin-bottom: 0.35rem; + } +` + +const ClusterTitle = styled.div` + font-weight: 700; + font-size: 0.85em; + margin-bottom: 2px; +` + +const ClusterBadgeGrid = styled.div` + display: flex; + flex-wrap: wrap; + gap: 2px; +` + +function MutationPatternsSection({ analysisResult }: { analysisResult: AnalysisResult }) { + const { t } = useTranslationSafe() + const { mutationPatterns, privateNucMutations } = analysisResult + + if (!mutationPatterns?.results?.length || privateNucMutations.privateSubstitutions.length === 0) { + return null + } + + const visiblePatterns = mutationPatterns.results.filter((pattern) => pattern.clusters.length > 0) + + if (visiblePatterns.length === 0) { + return null + } + + return ( + + {visiblePatterns.map((pattern) => ( + + {pattern.name || t('Mutation pattern')} + {pattern.description && {pattern.description}} + + {pattern.clusters.map((cluster) => ( + + + {t('{{start}}-{{end}} ({{count}} events)', { + start: cluster.start + 1, + end: cluster.end + 1, + count: cluster.count, + })} + + + {cluster.events.map((event) => ( + + ))} + + + ))} + + ))} + + ) +} + +function MutationPatternEventBadge({ event }: { event: MutationPatternEventMatch }) { + switch (event.type) { + case 'nucSubstitution': + return + } +} + +function mutationPatternEventKey(event: MutationPatternEventMatch): string { + switch (event.type) { + case 'nucSubstitution': + return `${event.type}:${event.pos}:${event.refNuc}:${event.qryNuc}` + } + throw new Error(`Unknown mutation pattern event type: ${event.type}`) +} export function ColumnMutations({ analysisResult }: ColumnCladeProps) { const { t } = useTranslationSafe() @@ -117,6 +230,8 @@ export function ColumnMutations({ analysisResult }: ColumnCladeProps) { + + ) diff --git a/packages/nextclade-web/src/components/SequenceView/SequenceMarkerCluster.tsx b/packages/nextclade-web/src/components/SequenceView/SequenceMarkerCluster.tsx new file mode 100644 index 000000000..8a46587a3 --- /dev/null +++ b/packages/nextclade-web/src/components/SequenceView/SequenceMarkerCluster.tsx @@ -0,0 +1,130 @@ +import React, { SVGProps, useCallback, useMemo, useState } from 'react' +import { useRecoilValue } from 'recoil' +import styled from 'styled-components' + +import { useTranslationSafe as useTranslation } from 'src/helpers/useTranslationSafe' + +import { Tooltip } from 'src/components/Results/Tooltip' +import { NucleotideMutationBadge } from 'src/components/Common/MutationBadge' +import { getSafeId } from 'src/helpers/getSafeId' +import { + SeqMarkerHeightState, + getSeqMarkerDims, + seqMarkerClusterHeightStateAtom, +} from 'src/state/seqViewSettings.state' +import type { MutationPatternEventMatch } from 'src/gen/_SchemaRoot' + +const CLUSTER_FILL = 'rgba(255, 140, 0, 0.12)' +const CLUSTER_STROKE = '#e06000' +const CLUSTER_MIN_WIDTH_PX = 12 + +const ClusterBadgeGrid = styled.div` + display: flex; + flex-wrap: wrap; + gap: 2px; + margin-top: 4px; +` + +const ClusterDescription = styled.div` + margin-top: 4px; + font-size: 0.85em; + color: #666; +` + +interface ClusterProps { + start: number + end: number + count: number + events: MutationPatternEventMatch[] +} + +export interface SequenceMarkerClusterProps extends SVGProps { + index: number + seqName: string + cluster: ClusterProps + pixelsPerBase: number + description?: string +} + +function SequenceMarkerClusterUnmemoed({ + index, + seqName, + cluster, + pixelsPerBase, + description, + ...rest +}: SequenceMarkerClusterProps) { + const { t } = useTranslation() + const [showTooltip, setShowTooltip] = useState(false) + const onMouseEnter = useCallback(() => setShowTooltip(true), []) + const onMouseLeave = useCallback(() => setShowTooltip(false), []) + + const seqMarkerClusterHeightState = useRecoilValue(seqMarkerClusterHeightStateAtom) + const { y, height } = useMemo(() => getSeqMarkerDims(seqMarkerClusterHeightState), [seqMarkerClusterHeightState]) + + if (seqMarkerClusterHeightState === SeqMarkerHeightState.Off) { + return null + } + + const { start, end, count, events } = cluster + + const id = getSafeId('cluster-marker', { index, seqName, begin: start, end }) + + let width = (end - start + 1) * pixelsPerBase + width = Math.max(width, CLUSTER_MIN_WIDTH_PX) + const halfNuc = Math.max(pixelsPerBase, CLUSTER_MIN_WIDTH_PX) / 2 + const x = start * pixelsPerBase - halfNuc + + return ( + + + + + + {t('Mutation cluster: {{start}}-{{end}} ({{count}} events)', { + start: start + 1, + end: end + 1, + count, + })} + + + {events.length > 0 && ( + + {events.map((event) => ( + + ))} + + )} + {description && {description}} + + + ) +} + +export const SequenceMarkerCluster = React.memo(SequenceMarkerClusterUnmemoed) + +function MutationPatternEventBadge({ event }: { event: MutationPatternEventMatch }) { + switch (event.type) { + case 'nucSubstitution': + return + } +} + +function mutationPatternEventKey(event: MutationPatternEventMatch): string { + switch (event.type) { + case 'nucSubstitution': + return `${event.type}:${event.pos}:${event.refNuc}:${event.qryNuc}` + } + throw new Error(`Unknown mutation pattern event type: ${event.type}`) +} diff --git a/packages/nextclade-web/src/components/SequenceView/SequenceViewAbsolute.tsx b/packages/nextclade-web/src/components/SequenceView/SequenceViewAbsolute.tsx index ea164a01f..b20c21a44 100644 --- a/packages/nextclade-web/src/components/SequenceView/SequenceViewAbsolute.tsx +++ b/packages/nextclade-web/src/components/SequenceView/SequenceViewAbsolute.tsx @@ -11,6 +11,7 @@ import { SequenceMarkerGap } from './SequenceMarkerGap' import { SequenceMarkerMissing } from './SequenceMarkerMissing' import { SequenceMarkerMutation } from './SequenceMarkerMutation' import { SequenceMarkerUnsequencedEnd, SequenceMarkerUnsequencedStart } from './SequenceMarkerUnsequenced' +import { SequenceMarkerCluster } from './SequenceMarkerCluster' import { SequenceMarkerFrameShift } from './SequenceMarkerFrameShift' import { SequenceMarkerInsertion } from './SequenceMarkerInsertion' import { SequenceViewCoverageWrapper, SequenceViewCoverageText, SequenceViewSVG } from './SequenceViewStyles' @@ -32,6 +33,7 @@ export function SequenceViewAbsolute({ sequence, width }: SequenceViewAbsolutePr insertions, nucToAaMuts, nonACGTNs, + mutationPatterns, } = sequence const { t } = useTranslationSafe() @@ -103,6 +105,19 @@ export function SequenceViewAbsolute({ sequence, width }: SequenceViewAbsolutePr ) }) + const clusterViews = (mutationPatterns?.results ?? []).flatMap((pattern, patternIndex) => + (pattern.clusters ?? []).map((cluster) => ( + + )), + ) + const frameShiftMarkers = frameShifts.map((frameShift) => ( na.begin).join('-')}`} @@ -114,7 +129,7 @@ export function SequenceViewAbsolute({ sequence, width }: SequenceViewAbsolutePr )) const totalMarkers = - mutationViews.length + deletionViews.length + missingViews.length + frameShiftMarkers.length + insertionViews.length + mutationViews.length + deletionViews.length + missingViews.length + frameShiftMarkers.length + insertionViews.length + clusterViews.length if (totalMarkers > maxNucMarkers) { return ( @@ -164,6 +179,7 @@ export function SequenceViewAbsolute({ sequence, width }: SequenceViewAbsolutePr pixelsPerBase={pixelsPerBase} /> {frameShiftMarkers} + {clusterViews} ) } diff --git a/packages/nextclade-web/src/components/SequenceView/SequenceViewRelative.tsx b/packages/nextclade-web/src/components/SequenceView/SequenceViewRelative.tsx index de99b7c56..f82001f9b 100644 --- a/packages/nextclade-web/src/components/SequenceView/SequenceViewRelative.tsx +++ b/packages/nextclade-web/src/components/SequenceView/SequenceViewRelative.tsx @@ -11,6 +11,7 @@ import { SequenceMarkerMutation } from './SequenceMarkerMutation' import { SequenceMarkerGap } from './SequenceMarkerGap' import { SequenceMarkerAmbiguous } from './SequenceMarkerAmbiguous' import { SequenceMarkerMissing } from './SequenceMarkerMissing' +import { SequenceMarkerCluster } from './SequenceMarkerCluster' import { SequenceMarkerFrameShift } from './SequenceMarkerFrameShift' import { SequenceMarkerInsertion } from './SequenceMarkerInsertion' import { SequenceMarkerUnsequencedEnd, SequenceMarkerUnsequencedStart } from './SequenceMarkerUnsequenced' @@ -23,7 +24,7 @@ export interface SequenceViewRelativeProps { } export function SequenceViewRelative({ sequence, width, refNodeName }: SequenceViewRelativeProps) { - const { index, seqName, missing, alignmentRange, frameShifts, insertions, nucToAaMuts, nonACGTNs } = sequence + const { index, seqName, missing, alignmentRange, frameShifts, insertions, nucToAaMuts, nonACGTNs, mutationPatterns } = sequence const { t } = useTranslationSafe() const maxNucMarkers = useRecoilValue(maxNucMarkersAtom) @@ -93,6 +94,19 @@ export function SequenceViewRelative({ sequence, width, refNodeName }: SequenceV /> )) + const clusterViews = (mutationPatterns?.results ?? []).flatMap((pattern, patternIndex) => + (pattern.clusters ?? []).map((cluster) => ( + + )), + ) + const frameShiftMarkers = frameShifts.map((frameShift) => ( na.begin).join('-')}`} @@ -104,7 +118,7 @@ export function SequenceViewRelative({ sequence, width, refNodeName }: SequenceV )) const totalMarkers = - mutationViews.length + deletionViews.length + missingViews.length + frameShiftMarkers.length + insertionViews.length + mutationViews.length + deletionViews.length + missingViews.length + frameShiftMarkers.length + insertionViews.length + clusterViews.length if (totalMarkers > maxNucMarkers) { return ( @@ -155,6 +169,7 @@ export function SequenceViewRelative({ sequence, width, refNodeName }: SequenceV pixelsPerBase={pixelsPerBase} /> {frameShiftMarkers} + {clusterViews} ) } diff --git a/packages/nextclade-web/src/components/Settings/SeqViewSettings.tsx b/packages/nextclade-web/src/components/Settings/SeqViewSettings.tsx index 34fc4e6b5..4aa6ac454 100644 --- a/packages/nextclade-web/src/components/Settings/SeqViewSettings.tsx +++ b/packages/nextclade-web/src/components/Settings/SeqViewSettings.tsx @@ -12,6 +12,7 @@ import { SeqMarkerState, maxNucMarkersAtom, seqMarkerAmbiguousHeightStateAtom, + seqMarkerClusterHeightStateAtom, seqMarkerFrameShiftStateAtom, seqMarkerGapHeightStateAtom, seqMarkerHeightStateFromString, @@ -78,6 +79,10 @@ export function SeqViewSettings() { seqMarkerUnsequencedHeightStateAtom, ) + const [seqMarkerClusterHeightState, setSeqMarkerClusterHeightState] = useSeqMarkerHeightState( + seqMarkerClusterHeightStateAtom, + ) + const [seqMarkerInsertionState, setSeqMarkerInsertionState] = useSeqMarkerState(seqMarkerInsertionStateAtom) const [seqMarkerFrameShiftState, setSeqMarkerFrameShiftState] = useSeqMarkerState(seqMarkerFrameShiftStateAtom) @@ -174,6 +179,18 @@ export function SeqViewSettings() { + + + {t('Mutation clusters')} + + + + {t('Insertions')} diff --git a/packages/nextclade-web/src/helpers/formatQCSNPClusters.ts b/packages/nextclade-web/src/helpers/formatQCSNPClusters.ts index 450d7d873..c438cee70 100644 --- a/packages/nextclade-web/src/helpers/formatQCSNPClusters.ts +++ b/packages/nextclade-web/src/helpers/formatQCSNPClusters.ts @@ -20,10 +20,16 @@ export function formatQCSNPClusters( message = t('Too many mutation clusters found') } - return t('{{message}}. Seen {{nClusters}} mutation clusters with total of {{total}} mutations. QC score: {{score}}', { - message, - total: totalSNPs, - nClusters: clusteredSNPs.length, - score: round(score), - }) + const clusterRanges = clusteredSNPs.map((c) => `${c.start + 1}-${c.end + 1}:${c.numberOfSnps}`).join(', ') + + return t( + '{{message}}. {{nClusters}} cluster(s) with {{total}} total mutations. Positions: {{clusterRanges}}. QC score: {{score}}', + { + message, + total: totalSNPs, + nClusters: clusteredSNPs.length, + clusterRanges, + score: round(score), + }, + ) } diff --git a/packages/nextclade-web/src/hooks/useExportResults.ts b/packages/nextclade-web/src/hooks/useExportResults.ts index 9fcf0fac2..5cb7cf954 100644 --- a/packages/nextclade-web/src/hooks/useExportResults.ts +++ b/packages/nextclade-web/src/hooks/useExportResults.ts @@ -25,6 +25,7 @@ import { cdsesAtom, cladeNodeAttrDescsAtom, csvColumnConfigAtom, + mutationPatternKeysAtom, phenotypeAttrDescsAtom, refNodesAtom, treeAtom, @@ -153,6 +154,7 @@ async function prepareCsvData( phenotypeAttrDescs: PhenotypeAttrDesc[] | undefined, refNodes: AuspiceRefNodesDesc | undefined, aaMotifsDescs: AaMotifsDesc[] | undefined, + mutationPatternKeys: string[], csvColumnConfig: CsvColumnConfig, delimiter: string, worker: ExportWorker, @@ -164,6 +166,7 @@ async function prepareCsvData( phenotypeAttrDescs ?? [], refNodes ?? {}, aaMotifsDescs ?? [], + mutationPatternKeys, delimiter, csvColumnConfig, ) @@ -363,6 +366,7 @@ async function prepareAllExportData( phenotypeAttrDescs: PhenotypeAttrDesc[] | undefined, refNodes: AuspiceRefNodesDesc | undefined, aaMotifsDescs: AaMotifsDesc[] | undefined, + mutationPatternKeys: string[], csvColumnConfig: CsvColumnConfig | undefined, worker: ExportWorker, ) { @@ -381,6 +385,7 @@ async function prepareAllExportData( phenotypeAttrDescs ?? [], refNodes, aaMotifsDescs ?? [], + mutationPatternKeys, csvColumnConfig, ';', worker, @@ -392,6 +397,7 @@ async function prepareAllExportData( phenotypeAttrDescs ?? [], refNodes, aaMotifsDescs ?? [], + mutationPatternKeys, csvColumnConfig, '\t', worker, @@ -423,6 +429,7 @@ export function useExportZip({ datasetName }: { datasetName: string }) { const phenotypeAttrDescs = useRecoilValue(phenotypeAttrDescsAtom({ datasetName })) const refNodes = useRecoilValue(refNodesAtom({ datasetName })) const aaMotifsDescs = useRecoilValue(aaMotifsDescsAtom({ datasetName })) + const mutationPatternKeys = useRecoilValue(mutationPatternKeysAtom({ datasetName })) ?? [] const csvColumnConfig = useRecoilValue(csvColumnConfigAtom) const tree = useRecoilValue(treeAtom(datasetName)) const treeNwk = useRecoilValue(treeNwkAtom({ datasetName })) @@ -436,6 +443,7 @@ export function useExportZip({ datasetName }: { datasetName: string }) { phenotypeAttrDescs, refNodes, aaMotifsDescs, + mutationPatternKeys, csvColumnConfig, worker, ) @@ -475,6 +483,7 @@ export function useExportZip({ datasetName }: { datasetName: string }) { phenotypeAttrDescs, refNodes, aaMotifsDescs, + mutationPatternKeys, csvColumnConfig, tree, treeNwk, @@ -507,6 +516,7 @@ function createCsvExportHook(delimiter: string, mimeType: string) { const phenotypeAttrDescs = useRecoilValue(phenotypeAttrDescsAtom({ datasetName })) const refNodes = useRecoilValue(refNodesAtom({ datasetName })) const aaMotifsDescs = useRecoilValue(aaMotifsDescsAtom({ datasetName })) + const mutationPatternKeys = useRecoilValue(mutationPatternKeysAtom({ datasetName })) ?? [] const csvColumnConfig = useRecoilValue(csvColumnConfigAtom) const exportFn = useCallback( @@ -525,13 +535,14 @@ function createCsvExportHook(delimiter: string, mimeType: string) { phenotypeAttrDescs, refNodes, aaMotifsDescs, + mutationPatternKeys, csvColumnConfig, delimiter, worker, ) saveFile(csvStr, filename, mimeType) }, - [analysisResults, datasetName, cladeNodeAttrDescs, phenotypeAttrDescs, refNodes, aaMotifsDescs, csvColumnConfig], + [analysisResults, datasetName, cladeNodeAttrDescs, phenotypeAttrDescs, refNodes, aaMotifsDescs, mutationPatternKeys, csvColumnConfig], ) return useResultsExport(exportFn) diff --git a/packages/nextclade-web/src/hooks/useRunAnalysis.ts b/packages/nextclade-web/src/hooks/useRunAnalysis.ts index d1b9a7ec5..57bbc83d4 100644 --- a/packages/nextclade-web/src/hooks/useRunAnalysis.ts +++ b/packages/nextclade-web/src/hooks/useRunAnalysis.ts @@ -67,6 +67,7 @@ import { allCurrentRefNodeNameAtom, initialDataAtom, allInitialDataAtom, + mutationPatternKeysAtom, } from 'src/state/results.state' import { numThreadsAtom } from 'src/state/settings.state' import { launchAnalysis, LaunchAnalysisCallbacks, DatasetFilesOverrides } from 'src/workers/launchAnalysis' @@ -121,6 +122,7 @@ export function useRunAnalysis() { phenotypeAttrDescs, refNodes, aaMotifsDescs, + mutationPatternKeys, csvColumnConfigDefault, } = initialData @@ -152,6 +154,7 @@ export function useRunAnalysis() { set(currentRefNodeNameAtom({ datasetName }), defaultSearchName) set(aaMotifsDescsAtom({ datasetName }), aaMotifsDescs) + set(mutationPatternKeysAtom({ datasetName }), mutationPatternKeys) set(csvColumnConfigAtom, csvColumnConfigDefault) }, onAnalysisResult(result) { diff --git a/packages/nextclade-web/src/state/results.state.ts b/packages/nextclade-web/src/state/results.state.ts index e7e6592f7..aea53da8a 100644 --- a/packages/nextclade-web/src/state/results.state.ts +++ b/packages/nextclade-web/src/state/results.state.ts @@ -230,6 +230,10 @@ export const [aaMotifsDescsAtom, allAaMotifsDescsAtom] = multiAtom({ + key: 'mutationPatternKeysAtom', +}) + export const [initialDataAtom, allInitialDataAtom] = multiAtom({ key: 'initialDataAtom', }) diff --git a/packages/nextclade-web/src/state/seqViewSettings.state.ts b/packages/nextclade-web/src/state/seqViewSettings.state.ts index 252b76ef9..d5c3f4281 100644 --- a/packages/nextclade-web/src/state/seqViewSettings.state.ts +++ b/packages/nextclade-web/src/state/seqViewSettings.state.ts @@ -77,6 +77,12 @@ export const seqMarkerUnsequencedHeightStateAtom = atom({ effects: [persistAtom], }) +export const seqMarkerClusterHeightStateAtom = atom({ + key: 'seqMarkerClusterHeight', + default: SeqMarkerHeightState.Bottom, + effects: [persistAtom], +}) + export enum SeqMarkerState { Off = 'Off', On = 'On', diff --git a/packages/nextclade-web/src/wasm/main.rs b/packages/nextclade-web/src/wasm/main.rs index 6d683fbcf..34353aa50 100644 --- a/packages/nextclade-web/src/wasm/main.rs +++ b/packages/nextclade-web/src/wasm/main.rs @@ -229,6 +229,7 @@ impl NextcladeWasm { phenotype_attrs_json_str: &str, ref_nodes_json_str: &str, aa_motifs_keys_json_str: &str, + mutation_pattern_keys_json_str: &str, delimiter: char, csv_colum_config_json_str: &str, ) -> Result { @@ -260,6 +261,11 @@ impl NextcladeWasm { .wrap_err("When serializing results into CSV: When parsing AA motifs keys JSON internally"), )?; + let mutation_pattern_keys: Vec = jserr( + json_parse(mutation_pattern_keys_json_str) + .wrap_err("When serializing results into CSV: When parsing mutation pattern keys JSON internally"), + )?; + let phenotype_attr_keys = phenotype_attrs.into_iter().map(|attr| attr.name).collect_vec(); let aa_motifs_keys = aa_motifs_descs.into_iter().map(|desc| desc.name).collect_vec(); @@ -275,6 +281,7 @@ impl NextcladeWasm { &phenotype_attr_keys, &ref_nodes, &aa_motifs_keys, + &mutation_pattern_keys, delimiter as u8, &csv_colum_config, )) diff --git a/packages/nextclade-web/src/workers/ExportThread.ts b/packages/nextclade-web/src/workers/ExportThread.ts index 43c39c0a4..07ccb02d4 100644 --- a/packages/nextclade-web/src/workers/ExportThread.ts +++ b/packages/nextclade-web/src/workers/ExportThread.ts @@ -58,6 +58,7 @@ export class ExportWorker { phenotypeAttrs: PhenotypeAttrDesc[], refNodes: AuspiceRefNodesDesc, aaMotifsDescs: AaMotifsDesc[], + mutationPatternKeys: string[], delimiter: string, csvColumnConfig: CsvColumnConfig, ) { @@ -68,6 +69,7 @@ export class ExportWorker { phenotypeAttrs, refNodes, aaMotifsDescs, + mutationPatternKeys, delimiter, csvColumnConfig, ) diff --git a/packages/nextclade-web/src/workers/nextcladeWasm.worker.ts b/packages/nextclade-web/src/workers/nextcladeWasm.worker.ts index c0a829eff..1bda6b16e 100644 --- a/packages/nextclade-web/src/workers/nextcladeWasm.worker.ts +++ b/packages/nextclade-web/src/workers/nextcladeWasm.worker.ts @@ -156,6 +156,7 @@ export async function serializeResultsCsv( phenotypeAttrsJson: PhenotypeAttrDesc[], refNodesJson: AuspiceRefNodesDesc, aaMotifsDescs: AaMotifsDesc[], + mutationPatternKeys: string[], delimiter: string, csvColumnConfig: CsvColumnConfig, ) { @@ -166,6 +167,7 @@ export async function serializeResultsCsv( JSON.stringify(phenotypeAttrsJson), JSON.stringify(refNodesJson), JSON.stringify(aaMotifsDescs), + JSON.stringify(mutationPatternKeys), delimiter, JSON.stringify(csvColumnConfig), ) diff --git a/packages/nextclade/Cargo.toml b/packages/nextclade/Cargo.toml index a61bc871f..acc7486ba 100644 --- a/packages/nextclade/Cargo.toml +++ b/packages/nextclade/Cargo.toml @@ -69,6 +69,7 @@ serde_json = { workspace = true } serde_repr = { workspace = true } serde_stacker = { workspace = true } serde_yaml = { workspace = true } +smart-default = { workspace = true } strsim = { workspace = true } strum = { workspace = true } strum_macros = { workspace = true } diff --git a/packages/nextclade/src/analyze/mod.rs b/packages/nextclade/src/analyze/mod.rs index 641c3f429..2bb614ae2 100644 --- a/packages/nextclade/src/analyze/mod.rs +++ b/packages/nextclade/src/analyze/mod.rs @@ -20,10 +20,12 @@ pub mod group_adjacent_deletions; pub mod is_sequenced; pub mod letter_composition; pub mod letter_ranges; +pub mod mutation_patterns; pub mod nuc_alignment; pub mod nuc_changes; pub mod nuc_del; pub mod nuc_sub; +pub mod nuc_sub_context; pub mod pcr_primer_changes; pub mod pcr_primers; pub mod phenotype; diff --git a/packages/nextclade/src/analyze/mutation_patterns.rs b/packages/nextclade/src/analyze/mutation_patterns.rs new file mode 100644 index 000000000..18d89895d --- /dev/null +++ b/packages/nextclade/src/analyze/mutation_patterns.rs @@ -0,0 +1,1160 @@ +use crate::alphabet::nuc::{Nuc, from_nuc_seq, is_nuc_match}; +use crate::analyze::find_private_nuc_mutations::PrivateNucMutations; +use crate::analyze::nuc_sub::NucSub; +use crate::analyze::nuc_sub_context::NucSubWithContext; +use crate::analyze::virus_properties::{ + MutationPatternClusterConfig, MutationPatternEvent, MutationPatternNucSubstitution, MutationPatternsConfig, +}; +use crate::coord::position::{NucRefGlobalPosition, PositionLike}; +use crate::qc::qc_config::QcRulesConfigSnpClusters; +use eyre::{Report, WrapErr}; +use itertools::Itertools; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::collections::VecDeque; + +/// Cluster of mutation pattern events detected within one sliding nucleotide window. +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternCluster::example")] +pub struct MutationPatternCluster { + /// 0-based first reference position included in this cluster. + pub start: usize, + + /// 0-based last reference position included in this cluster. + pub end: usize, + + /// Number of matched events in this cluster. + pub count: usize, + + /// Matched events belonging to this cluster, sorted by reference position. + pub events: Vec, + + /// Counts of event types within this cluster. + pub event_type_counts: Vec, +} + +impl MutationPatternCluster { + pub fn example() -> Self { + Self { + start: 5003, + end: 5033, + count: 2, + events: vec![ + MutationPatternEventMatch::example_nuc_substitution(5003, Nuc::A, Nuc::G), + MutationPatternEventMatch::example_nuc_substitution(5033, Nuc::A, Nuc::G), + ], + event_type_counts: vec![MutationPatternEventTypeCount::NucSubstitution( + MutationPatternNucSubstitutionTypeCount { + ref_nuc: Nuc::A, + qry_nuc: Nuc::G, + count: 2, + }, + )], + } + } +} + +/// Count of matched mutation pattern events of one event type. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(tag = "type", rename_all = "camelCase")] +#[schemars(example = "MutationPatternEventTypeCount::example")] +pub enum MutationPatternEventTypeCount { + /// Count of nucleotide substitutions with the same reference and query nucleotide. + NucSubstitution(MutationPatternNucSubstitutionTypeCount), +} + +impl MutationPatternEventTypeCount { + pub const fn example() -> Self { + Self::NucSubstitution(MutationPatternNucSubstitutionTypeCount { + ref_nuc: Nuc::A, + qry_nuc: Nuc::G, + count: 8, + }) + } +} + +/// Count of nucleotide substitution events sharing the same reference and query nucleotide. +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternNucSubstitutionTypeCount::example")] +pub struct MutationPatternNucSubstitutionTypeCount { + /// Reference nucleotide at the substituted position. + pub ref_nuc: Nuc, + + /// Query nucleotide at the substituted position. + pub qry_nuc: Nuc, + + /// Number of matching substitutions with this reference and query nucleotide pair. + pub count: usize, +} + +impl MutationPatternNucSubstitutionTypeCount { + pub const fn example() -> Self { + Self { + ref_nuc: Nuc::A, + qry_nuc: Nuc::G, + count: 8, + } + } +} + +/// Reference motif match that overlapped a mutation pattern event. +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternMotifMatch::example")] +pub struct MutationPatternMotifMatch { + /// Regular expression from the pattern configuration that matched the reference sequence. + pub motif: String, + + /// 0-based first reference position included in the motif match. + pub start: usize, + + /// 0-based position after the end of the motif match. + pub end: usize, +} + +impl MutationPatternMotifMatch { + pub fn example() -> Self { + Self { + motif: "A[ACGT]G".to_owned(), + start: 5002, + end: 5005, + } + } +} + +/// Matched mutation pattern event. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(tag = "type", rename_all = "camelCase")] +#[schemars(example = "MutationPatternEventMatch::example")] +pub enum MutationPatternEventMatch { + /// Matched nucleotide substitution event. + NucSubstitution(MutationPatternNucSubstitutionMatch), +} + +impl MutationPatternEventMatch { + pub fn example() -> Self { + Self::example_nuc_substitution(5003, Nuc::A, Nuc::G) + } + + const fn unmatched_nuc_substitution(substitution: NucSubWithContext) -> Self { + Self::NucSubstitution(MutationPatternNucSubstitutionMatch { + substitution, + motif_matches: vec![], + }) + } + + fn example_nuc_substitution(pos: usize, ref_nuc: Nuc, qry_nuc: Nuc) -> Self { + Self::NucSubstitution(MutationPatternNucSubstitutionMatch { + substitution: NucSubWithContext { + sub: NucSub { + pos: NucRefGlobalPosition::from(pos), + ref_nuc, + qry_nuc, + }, + ref_context: vec![Nuc::A, ref_nuc, Nuc::G], + }, + motif_matches: vec![MutationPatternMotifMatch { + motif: "A[ACGT]G".to_owned(), + start: pos.saturating_sub(1), + end: pos + 2, + }], + }) + } +} + +/// Matched nucleotide substitution and the reference motifs that accepted it. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternNucSubstitutionMatch::example")] +pub struct MutationPatternNucSubstitutionMatch { + /// Nucleotide substitution plus local reference context at the substituted position. + #[serde(flatten)] + pub substitution: NucSubWithContext, + + /// Motif matches that spanned the substituted position. Empty when the pattern event had no motif restriction. + pub motif_matches: Vec, +} + +impl MutationPatternNucSubstitutionMatch { + pub fn example() -> Self { + match MutationPatternEventMatch::example() { + MutationPatternEventMatch::NucSubstitution(event) => event, + } + } +} + +/// Summary counts for one mutation pattern. +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternCounts::example")] +pub struct MutationPatternCounts { + /// Number of private mutation events matching the pattern. + pub matches: usize, + + /// Number of matching events that belong to reported clusters. + pub clustered: usize, + + /// Number of clusters reported for this pattern. + pub clusters: usize, +} + +impl MutationPatternCounts { + pub const fn example() -> Self { + Self { + matches: 14, + clustered: 14, + clusters: 2, + } + } +} + +/// Results for a single mutation pattern. +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternResults::example")] +pub struct MutationPatternResults { + /// Stable machine-readable pattern identifier copied from pathogen.json. + pub id: String, + + /// Human-readable pattern name copied from pathogen.json. + pub name: String, + + /// All private mutation events matching this pattern. + pub matches: Vec, + + /// Counts of matched event types across all `matches`. + pub event_type_counts: Vec, + + /// Pattern-local clusters detected among `matches`. + pub clusters: Vec, + + /// Summary counts for matches and clusters. + pub counts: MutationPatternCounts, + + /// Optional explanatory text copied from pathogen.json. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub description: Option, +} + +impl MutationPatternResults { + pub fn example() -> Self { + let cluster = MutationPatternCluster::example(); + Self { + id: "adar".to_owned(), + name: "ADAR-like RNA editing".to_owned(), + matches: cluster.events.clone(), + event_type_counts: cluster.event_type_counts.clone(), + clusters: vec![cluster], + counts: MutationPatternCounts { + matches: 2, + clustered: 2, + clusters: 1, + }, + description: Some("ADAR-mediated A-to-I editing observed as A>G and complementary T>C".to_owned()), + } + } +} + +/// Mutation pattern analysis output. Contains per-pattern results. +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternsResults::example")] +pub struct MutationPatternsResults { + /// Results for each configured mutation pattern. Empty when mutation pattern analysis is not configured and no global + /// SNP cluster compatibility output is needed. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub results: Vec, +} + +impl MutationPatternsResults { + pub fn example() -> Self { + Self { + results: vec![MutationPatternResults::example()], + } + } +} + +/// Internal result containing both filtered (for output) and unfiltered (for QC) clusters. +pub struct MutationPatternAnalysis { + pub results: MutationPatternsResults, + pub qc_clusters: Vec, +} + +/// Trinucleotide context uses the global reference sequence, not the parent tree node state. +/// This matches mutational signature conventions: the enzyme acts on the physical genome, +/// and context is defined by the reference genome flanking the substituted position. +pub fn analyze_mutation_patterns( + private_nuc_mutations: &PrivateNucMutations, + ref_seq: &[Nuc], + config: Option<&MutationPatternsConfig>, + qc_snp_clusters_config: Option<&QcRulesConfigSnpClusters>, +) -> Result { + let ref_seq_str = from_nuc_seq(ref_seq); + let context_subs: Vec = private_nuc_mutations + .private_substitutions + .iter() + .map(|sub| NucSubWithContext::from_sub(sub, ref_seq)) + .collect_vec(); + + let all_events = context_subs + .iter() + .cloned() + .map(MutationPatternEventMatch::unmatched_nuc_substitution) + .collect_vec(); + let event_type_counts = compute_event_type_counts(&all_events); + + let patterns = config.map_or(&[][..], |c| &c.patterns); + + let (qc_window_size, qc_cluster_cut_off) = resolve_qc_config(qc_snp_clusters_config); + + let qc_clusters = if qc_window_size > 0 && qc_cluster_cut_off > 0 { + find_clusters(&all_events, qc_window_size, qc_cluster_cut_off) + } else { + vec![] + }; + + let results = if patterns.is_empty() { + if qc_clusters.is_empty() { + vec![] + } else { + let total_clusters = qc_clusters.len(); + let total_clustered = qc_clusters.iter().map(|c| c.count).sum(); + vec![MutationPatternResults { + id: "snp_clusters".to_owned(), + name: "SNP clusters".to_owned(), + matches: all_events, + event_type_counts, + clusters: qc_clusters.clone(), + counts: MutationPatternCounts { + matches: context_subs.len(), + clustered: total_clustered, + clusters: total_clusters, + }, + description: None, + }] + } + } else { + patterns + .iter() + .map(|cfg| { + let compiled_events = compile_events(&cfg.events)?; + let matches = if compiled_events.is_empty() { + context_subs + .iter() + .cloned() + .map(MutationPatternEventMatch::unmatched_nuc_substitution) + .collect_vec() + } else { + context_subs + .iter() + .filter_map(|s| match_event(s, &compiled_events, &ref_seq_str)) + .collect_vec() + }; + let clusters = if let Some(cluster) = &cfg.cluster { + find_clusters_with_config(&matches, cluster) + } else { + vec![] + }; + let total_clusters = clusters.len(); + let total_clustered = clusters.iter().map(|c| c.count).sum(); + Ok(MutationPatternResults { + id: cfg.id.clone(), + name: cfg.name.clone(), + event_type_counts: compute_event_type_counts(&matches), + counts: MutationPatternCounts { + matches: matches.len(), + clustered: total_clustered, + clusters: total_clusters, + }, + matches, + clusters, + description: cfg.description.clone(), + }) + }) + .collect::, Report>>()? + }; + + Ok(MutationPatternAnalysis { + results: MutationPatternsResults { results }, + qc_clusters, + }) +} + +fn compute_event_type_counts(events: &[MutationPatternEventMatch]) -> Vec { + let mut counts: BTreeMap<(Nuc, Nuc), usize> = BTreeMap::new(); + for event in events { + let MutationPatternEventMatch::NucSubstitution(event) = event; + *counts + .entry((event.substitution.sub.ref_nuc, event.substitution.sub.qry_nuc)) + .or_default() += 1; + } + counts + .into_iter() + .map(|((ref_nuc, qry_nuc), count)| { + MutationPatternEventTypeCount::NucSubstitution(MutationPatternNucSubstitutionTypeCount { + ref_nuc, + qry_nuc, + count, + }) + }) + .collect_vec() +} + +fn event_match_position(event: &MutationPatternEventMatch) -> usize { + match event { + MutationPatternEventMatch::NucSubstitution(event) => event.substitution.sub.pos.as_usize(), + } +} + +fn compile_events(events: &[MutationPatternEvent]) -> Result, Report> { + events + .iter() + .map(CompiledEvent::new) + .collect::, Report>>() +} + +fn match_event( + sub: &NucSubWithContext, + events: &[CompiledEvent], + ref_seq_str: &str, +) -> Option { + events + .iter() + .find_map(|event| event.match_substitution(sub, ref_seq_str)) +} + +fn find_clusters_with_config( + events: &[MutationPatternEventMatch], + config: &MutationPatternClusterConfig, +) -> Vec { + if config.window_size > 0 && config.cutoff > 0 { + find_clusters(events, config.window_size, config.cutoff) + } else { + vec![] + } +} + +enum CompiledEvent { + NucSubstitution(CompiledNucSubstitution), +} + +impl CompiledEvent { + fn new(event: &MutationPatternEvent) -> Result { + match event { + MutationPatternEvent::NucSubstitution(event) => Ok(Self::NucSubstitution(CompiledNucSubstitution::new(event)?)), + } + } + + fn match_substitution(&self, sub: &NucSubWithContext, ref_seq_str: &str) -> Option { + match self { + Self::NucSubstitution(event) => event.match_substitution(sub, ref_seq_str), + } + } +} + +struct CompiledNucSubstitution { + ref_nucs: Vec, + qry: Vec, + motifs: Vec, +} + +impl CompiledNucSubstitution { + fn new(event: &MutationPatternNucSubstitution) -> Result { + Ok(Self { + ref_nucs: event.ref_nucs.clone(), + qry: event.qry.clone(), + motifs: event + .motifs + .iter() + .map(|motif| CompiledMotif::new(motif)) + .collect::, Report>>()?, + }) + } + + fn match_substitution(&self, sub: &NucSubWithContext, ref_seq_str: &str) -> Option { + if !self.ref_nucs.iter().any(|nuc| is_nuc_match(*nuc, sub.sub.ref_nuc)) { + return None; + } + if !self.qry.iter().any(|nuc| is_nuc_match(*nuc, sub.sub.qry_nuc)) { + return None; + } + + if self.motifs.is_empty() { + return Some(MutationPatternEventMatch::unmatched_nuc_substitution(sub.clone())); + } + + let motif_matches = self + .motifs + .iter() + .flat_map(|motif| motif.find_matches_containing(ref_seq_str, sub.sub.pos.as_usize())) + .collect_vec(); + + if motif_matches.is_empty() { + None + } else { + Some(MutationPatternEventMatch::NucSubstitution( + MutationPatternNucSubstitutionMatch { + substitution: sub.clone(), + motif_matches, + }, + )) + } + } +} + +struct CompiledMotif { + motif: String, + regex: Regex, +} + +impl CompiledMotif { + fn new(motif: &str) -> Result { + if motif.is_empty() { + eyre::bail!("Mutation pattern motif cannot be empty"); + } + Ok(Self { + motif: motif.to_owned(), + regex: Regex::new(motif).wrap_err_with(|| format!("When compiling mutation pattern motif '{motif}'"))?, + }) + } + + fn find_matches_containing(&self, ref_seq_str: &str, pos: usize) -> Vec { + self + .regex + .find_iter(ref_seq_str) + .filter(|m| m.start() <= pos && pos < m.end()) + .map(|m| MutationPatternMotifMatch { + motif: self.motif.clone(), + start: m.start(), + end: m.end(), + }) + .collect_vec() + } +} + +const fn resolve_qc_config(qc_snp_clusters_config: Option<&QcRulesConfigSnpClusters>) -> (usize, usize) { + if let Some(qc_snp_clusters) = qc_snp_clusters_config { + if qc_snp_clusters.enabled { + (qc_snp_clusters.window_size, qc_snp_clusters.cluster_cut_off) + } else { + (0, 0) + } + } else { + (0, 0) + } +} + +fn find_clusters( + events: &[MutationPatternEventMatch], + window_size: usize, + cluster_cut_off: usize, +) -> Vec { + let mut current_window = VecDeque::<&MutationPatternEventMatch>::new(); + let mut all_clusters: Vec> = Vec::new(); + let mut previous_pos: isize = -1; + + for event in events { + let pos = event_match_position(event) as isize; + current_window.push_back(event); + + while (event_match_position(current_window[0]) as isize) < (pos - window_size as isize) { + current_window.pop_front(); + } + + if current_window.len() > cluster_cut_off { + let n_clusters = all_clusters.len(); + + if !all_clusters.is_empty() && current_window.len() > 1 { + let last_cluster = &all_clusters[n_clusters - 1]; + let last_pos = event_match_position(last_cluster[last_cluster.len() - 1]) as isize; + + if last_pos == previous_pos { + all_clusters[n_clusters - 1].push(event); + } else { + all_clusters.push(current_window.iter().copied().collect_vec()); + } + } else { + all_clusters.push(current_window.iter().copied().collect_vec()); + } + } + previous_pos = pos; + } + + for cluster in &mut all_clusters { + cluster.sort_by_key(|event| event_match_position(event)); + cluster.dedup_by_key(|event| event_match_position(event)); + } + + all_clusters + .into_iter() + .map(|cluster_events| { + let start = event_match_position(cluster_events[0]); + let end = event_match_position(cluster_events[cluster_events.len() - 1]); + let events: Vec = cluster_events.into_iter().cloned().collect(); + let event_type_counts = compute_event_type_counts(&events); + let count = events.len(); + MutationPatternCluster { + start, + end, + count, + events, + event_type_counts, + } + }) + .collect_vec() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::alphabet::nuc::to_nuc_seq; + use crate::analyze::find_private_nuc_mutations::PrivateNucMutations; + use crate::analyze::nuc_sub::NucSub; + use crate::analyze::virus_properties::MutationPatternConfig; + use crate::coord::position::NucRefGlobalPosition; + use crate::io::json::{JsonPretty, json_parse, json_stringify}; + use eyre::Report; + use pretty_assertions::assert_eq; + use serde_json::Value; + + fn make_sub(pos: usize, ref_nuc: Nuc, qry_nuc: Nuc) -> NucSub { + NucSub { + pos: NucRefGlobalPosition::from(pos), + ref_nuc, + qry_nuc, + } + } + + fn make_private_muts(subs: Vec) -> PrivateNucMutations { + let total = subs.len(); + PrivateNucMutations { + private_substitutions: subs, + total_private_substitutions: total, + ..PrivateNucMutations::default() + } + } + + fn ref_seq_acgt(len: usize) -> Vec { + let pattern = [Nuc::A, Nuc::C, Nuc::G, Nuc::T]; + (0..len).map(|i| pattern[i % 4]).collect() + } + + fn wrap(patterns: Vec) -> MutationPatternsConfig { + MutationPatternsConfig { patterns } + } + + fn cluster(window_size: usize, cutoff: usize) -> MutationPatternClusterConfig { + MutationPatternClusterConfig { window_size, cutoff } + } + + fn qc_cluster(window_size: usize, cluster_cut_off: usize) -> QcRulesConfigSnpClusters { + QcRulesConfigSnpClusters { + enabled: true, + score_weight: ordered_float::OrderedFloat(50.0), + window_size, + cluster_cut_off, + } + } + + fn event_type_counts_map(counts: &[MutationPatternEventTypeCount]) -> BTreeMap<(Nuc, Nuc), usize> { + counts + .iter() + .map(|c| match c { + MutationPatternEventTypeCount::NucSubstitution(c) => ((c.ref_nuc, c.qry_nuc), c.count), + }) + .collect() + } + + fn event_type_counts_total(counts: &[MutationPatternEventTypeCount]) -> usize { + counts + .iter() + .map(|c| match c { + MutationPatternEventTypeCount::NucSubstitution(c) => c.count, + }) + .sum() + } + + fn assert_cluster_events_are_nuc_substitutions(cluster: &MutationPatternCluster, ref_nuc: Nuc, qry_nuc: Nuc) { + for event in &cluster.events { + let MutationPatternEventMatch::NucSubstitution(event) = event; + assert_eq!(ref_nuc, event.substitution.sub.ref_nuc); + assert_eq!(qry_nuc, event.substitution.sub.qry_nuc); + } + } + + fn pattern_all(window_size: usize, cutoff: usize) -> MutationPatternConfig { + MutationPatternConfig { + id: "all".to_owned(), + name: "All private substitutions".to_owned(), + cluster: Some(cluster(window_size, cutoff)), + ..MutationPatternConfig::default() + } + } + + fn pattern_substitution( + id: &str, + name: &str, + ref_nucs: Vec, + qry: Vec, + motifs: Vec, + window_size: usize, + cutoff: usize, + ) -> MutationPatternConfig { + MutationPatternConfig { + id: id.to_owned(), + name: name.to_owned(), + events: vec![MutationPatternEvent::NucSubstitution(MutationPatternNucSubstitution { + ref_nucs, + qry, + motifs, + })], + cluster: Some(cluster(window_size, cutoff)), + ..MutationPatternConfig::default() + } + } + + #[test] + fn test_mutation_patterns_empty_input() -> Result<(), Report> { + let private_muts = make_private_muts(vec![]); + let ref_seq = ref_seq_acgt(100); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, None, None)?; + assert!(analysis.results.results.is_empty()); + assert!(analysis.qc_clusters.is_empty()); + Ok(()) + } + + #[test] + fn test_mutation_patterns_type_counts() -> Result<(), Report> { + let subs = vec![ + make_sub(10, Nuc::T, Nuc::C), + make_sub(20, Nuc::T, Nuc::C), + make_sub(30, Nuc::A, Nuc::G), + make_sub(40, Nuc::C, Nuc::T), + ]; + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(100); + let config = wrap(vec![pattern_all(100, 5)]); + let qc = qc_cluster(100, 5); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + let result = &analysis.results.results[0]; + + let counts = event_type_counts_map(&result.event_type_counts); + + assert_eq!(Some(&2), counts.get(&(Nuc::T, Nuc::C))); + assert_eq!(Some(&1), counts.get(&(Nuc::A, Nuc::G))); + assert_eq!(Some(&1), counts.get(&(Nuc::C, Nuc::T))); + assert_eq!(4, event_type_counts_total(&result.event_type_counts)); + Ok(()) + } + + #[test] + fn test_mutation_patterns_no_clusters_without_config() -> Result<(), Report> { + let subs: Vec = (0..10).map(|i| make_sub(i * 5, Nuc::T, Nuc::C)).collect(); + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(100); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, None, None)?; + assert!(analysis.results.results.is_empty()); + assert!(analysis.qc_clusters.is_empty()); + Ok(()) + } + + #[test] + fn test_mutation_patterns_clusters_with_config() -> Result<(), Report> { + let ref_seq = ref_seq_acgt(200); + let subs: Vec = (0..10) + .map(|i| { + let pos = i * 5; + make_sub(pos, ref_seq[pos], Nuc::C) + }) + .collect(); + let private_muts = make_private_muts(subs); + let config = wrap(vec![pattern_all(100, 5)]); + let qc = qc_cluster(100, 3); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + let result = &analysis.results.results[0]; + assert_eq!(1, result.counts.clusters); + assert_eq!(10, result.counts.clustered); + let cluster = &result.clusters[0]; + assert_eq!(0, cluster.start); + assert_eq!(45, cluster.end); + assert_eq!(10, cluster.count); + assert_eq!(10, cluster.events.len()); + for event in &cluster.events { + let pos = event_match_position(event); + assert!(pos >= cluster.start); + assert!(pos <= cluster.end); + } + Ok(()) + } + + #[test] + fn test_mutation_patterns_two_separate_clusters() -> Result<(), Report> { + let ref_seq = ref_seq_acgt(2000); + let mut subs = Vec::new(); + for i in 0..8 { + let pos = i * 5; + subs.push(make_sub(pos, ref_seq[pos], Nuc::C)); + } + for i in 0..8 { + let pos = 1000 + i * 5; + subs.push(make_sub(pos, ref_seq[pos], Nuc::G)); + } + let private_muts = make_private_muts(subs); + let config = wrap(vec![pattern_all(100, 5)]); + let qc = qc_cluster(100, 5); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + let result = &analysis.results.results[0]; + assert_eq!(2, result.counts.clusters); + assert!(result.clusters[0].end < result.clusters[1].start); + Ok(()) + } + + #[test] + fn test_mutation_patterns_cutoff_boundary() -> Result<(), Report> { + let ref_seq = ref_seq_acgt(200); + let subs: Vec = (0..5) + .map(|i| { + let pos = i * 5; + make_sub(pos, ref_seq[pos], Nuc::C) + }) + .collect(); + let private_muts = make_private_muts(subs); + let config = wrap(vec![pattern_all(100, 5)]); + let qc = qc_cluster(100, 3); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + assert_eq!(0, analysis.results.results[0].counts.clusters); + Ok(()) + } + + #[test] + fn test_mutation_patterns_cutoff_plus_one() -> Result<(), Report> { + let ref_seq = ref_seq_acgt(200); + let subs: Vec = (0..6) + .map(|i| { + let pos = i * 5; + make_sub(pos, ref_seq[pos], Nuc::C) + }) + .collect(); + let private_muts = make_private_muts(subs); + let config = wrap(vec![pattern_all(100, 5)]); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), None)?; + assert_eq!(1, analysis.results.results[0].counts.clusters); + assert_eq!(6, analysis.results.results[0].counts.clustered); + Ok(()) + } + + #[test] + fn test_mutation_patterns_type_filter() -> Result<(), Report> { + let subs = vec![ + make_sub(5, Nuc::T, Nuc::C), + make_sub(10, Nuc::A, Nuc::G), + make_sub(15, Nuc::T, Nuc::C), + make_sub(20, Nuc::T, Nuc::C), + make_sub(25, Nuc::A, Nuc::G), + make_sub(30, Nuc::T, Nuc::C), + make_sub(35, Nuc::T, Nuc::C), + make_sub(40, Nuc::T, Nuc::C), + ]; + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let config = wrap(vec![pattern_substitution( + "tc", + "T>C", + vec![Nuc::T], + vec![Nuc::C], + vec![], + 100, + 5, + )]); + let qc = qc_cluster(100, 5); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + let result = &analysis.results.results[0]; + assert_eq!(1, result.counts.clusters); + for cluster in &result.clusters { + assert_cluster_events_are_nuc_substitutions(cluster, Nuc::T, Nuc::C); + } + assert_eq!(6, event_type_counts_total(&result.event_type_counts)); + assert_eq!(6, result.counts.matches); + Ok(()) + } + + #[test] + fn test_mutation_patterns_filter_separates_qc_from_filtered() -> Result<(), Report> { + let subs = vec![ + make_sub(5, Nuc::A, Nuc::G), + make_sub(10, Nuc::A, Nuc::G), + make_sub(15, Nuc::A, Nuc::G), + make_sub(20, Nuc::A, Nuc::G), + make_sub(25, Nuc::A, Nuc::G), + make_sub(30, Nuc::A, Nuc::G), + ]; + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let config = wrap(vec![pattern_substitution( + "tc", + "T>C", + vec![Nuc::T], + vec![Nuc::C], + vec![], + 100, + 5, + )]); + let qc = qc_cluster(100, 5); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + assert_eq!(0, analysis.results.results[0].counts.clusters); + assert_eq!(1, analysis.qc_clusters.len()); + assert_eq!(6, analysis.qc_clusters[0].count); + Ok(()) + } + + #[test] + fn test_mutation_patterns_motif_filter_observable() -> Result<(), Report> { + // 0 1 2 3 + // 01234567890123456789012345678901234 + let ref_seq = to_nuc_seq("ACGTATCATAACGTAACGTAACGTAACGTAACGTA")?; + // ^^^ *** ^^^ ^^^ ^^^ ^^^ ^^^ + // M1 X1 M2 M3 M4 M5 M6 + // ^ M1-M6: [ACGT]CG motif matches ACG contexts + // * X1: [ACGT]CG motif rejects TCA context + let subs = vec![ + make_sub(1, Nuc::C, Nuc::T), + make_sub(6, Nuc::C, Nuc::T), + make_sub(11, Nuc::C, Nuc::T), + make_sub(16, Nuc::C, Nuc::T), + make_sub(21, Nuc::C, Nuc::T), + make_sub(26, Nuc::C, Nuc::T), + make_sub(31, Nuc::C, Nuc::T), + ]; + let private_muts = make_private_muts(subs); + let config = json_parse::( + r#"{ + "patterns": [ + { + "id": "apobec", + "name": "APOBEC-like", + "events": [ + { + "type": "nucSubstitution", + "ref": ["C"], + "qry": ["T"], + "motifs": ["[ACGT]CG"] + } + ], + "cluster": { + "windowSize": 100, + "cutoff": 3 + } + } + ] + }"#, + )?; + let qc = qc_cluster(100, 3); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + let result = &analysis.results.results[0]; + assert_eq!(1, result.counts.clusters); + assert_eq!(6, result.matches.len()); + assert_eq!(6, result.clusters[0].count); + assert!(result.matches.iter().any(|m| match m { + MutationPatternEventMatch::NucSubstitution(m) => m.motif_matches.iter().any(|m| m.motif == "[ACGT]CG"), + })); + assert_eq!(1, analysis.qc_clusters.len()); + assert_eq!(7, analysis.qc_clusters[0].count); + Ok(()) + } + + #[test] + fn test_mutation_patterns_output_shape_is_event_oriented() -> Result<(), Report> { + let ref_seq = ref_seq_acgt(20); + let private_muts = make_private_muts(vec![make_sub(3, Nuc::T, Nuc::C), make_sub(7, Nuc::T, Nuc::C)]); + let config = wrap(vec![pattern_substitution( + "tc", + "T>C", + vec![Nuc::T], + vec![Nuc::C], + vec![], + 100, + 1, + )]); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), None)?; + let actual = json_stringify(&analysis.results, JsonPretty(true))?; + let actual = json_parse::(&actual)?; + + let result = &actual["results"][0]; + assert!(result.get("eventTypeCounts").is_some()); + assert!(result.get("counts").is_some()); + assert!(result.get("substitutionTypeCounts").is_none()); + assert!(result.get("totalClusters").is_none()); + assert!(result.get("totalClusteredSnps").is_none()); + + let cluster = &result["clusters"][0]; + assert!(cluster.get("count").is_some()); + assert!(cluster.get("events").is_some()); + assert!(cluster.get("eventTypeCounts").is_some()); + assert!(cluster.get("numberOfSnps").is_none()); + assert!(cluster.get("substitutions").is_none()); + assert!(cluster.get("substitutionTypeCounts").is_none()); + + Ok(()) + } + + #[test] + fn test_mutation_patterns_motif_accepts_non_trinucleotide() -> Result<(), Report> { + let result = json_parse::( + r#"{ + "type": "nucSubstitution", + "ref": ["C"], + "qry": ["T"], + "motifs": ["TC"] + }"#, + )?; + let MutationPatternEvent::NucSubstitution(result) = result; + assert_eq!("TC", result.motifs[0]); + Ok(()) + } + + #[test] + fn test_mutation_patterns_filter_matches_iupac() -> Result<(), Report> { + let filter = MutationPatternEvent::NucSubstitution(MutationPatternNucSubstitution { + ref_nucs: vec![Nuc::R], + qry: vec![Nuc::N], + motifs: vec![], + }); + let compiled = compile_events(&[filter])?; + let ref_seq = to_nuc_seq("AAA")?; + let sub_a = NucSubWithContext::from_sub(&make_sub(1, Nuc::A, Nuc::G), &ref_seq); + let sub_c = NucSubWithContext::from_sub(&make_sub(1, Nuc::C, Nuc::G), &ref_seq); + assert!(match_event(&sub_a, &compiled, "AAA").is_some()); + assert!(match_event(&sub_c, &compiled, "ACA").is_none()); + Ok(()) + } + + #[test] + fn test_mutation_patterns_qc_config_fallback() -> Result<(), Report> { + let subs: Vec = (0..10).map(|i| make_sub(i * 5, Nuc::T, Nuc::C)).collect(); + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let legacy = QcRulesConfigSnpClusters { + enabled: true, + score_weight: ordered_float::OrderedFloat(50.0), + window_size: 100, + cluster_cut_off: 5, + }; + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, None, Some(&legacy))?; + assert_eq!(1, analysis.results.results.len()); + assert_eq!(1, analysis.results.results[0].counts.clusters); + assert_eq!(10, analysis.results.results[0].counts.clustered); + assert_eq!(1, analysis.qc_clusters.len()); + assert_eq!(10, analysis.qc_clusters[0].count); + Ok(()) + } + + #[test] + fn test_mutation_patterns_qc_config_disabled() -> Result<(), Report> { + let subs: Vec = (0..10).map(|i| make_sub(i * 5, Nuc::T, Nuc::C)).collect(); + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let legacy = QcRulesConfigSnpClusters { + enabled: false, + score_weight: ordered_float::OrderedFloat(50.0), + window_size: 100, + cluster_cut_off: 5, + }; + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, None, Some(&legacy))?; + assert!(analysis.results.results.is_empty()); + assert!(analysis.qc_clusters.is_empty()); + Ok(()) + } + + #[test] + fn test_mutation_patterns_type_counts_use_matches() -> Result<(), Report> { + let subs = vec![ + make_sub(5, Nuc::T, Nuc::C), + make_sub(10, Nuc::A, Nuc::G), + make_sub(15, Nuc::T, Nuc::C), + ]; + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let config = wrap(vec![pattern_substitution( + "tc", + "T>C", + vec![Nuc::T], + vec![Nuc::C], + vec![], + 100, + 1, + )]); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), None)?; + assert_eq!( + 2, + event_type_counts_total(&analysis.results.results[0].event_type_counts) + ); + assert_eq!(2, analysis.results.results[0].counts.matches); + Ok(()) + } + + #[test] + fn test_mutation_patterns_pattern_cluster_does_not_override_qc_config() -> Result<(), Report> { + let subs: Vec = (0..10).map(|i| make_sub(i * 5, Nuc::T, Nuc::C)).collect(); + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let config = wrap(vec![pattern_all(10, 5)]); + let legacy = QcRulesConfigSnpClusters { + enabled: true, + score_weight: ordered_float::OrderedFloat(50.0), + window_size: 100, + cluster_cut_off: 5, + }; + let with_new = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&legacy))?; + let without_new = analyze_mutation_patterns(&private_muts, &ref_seq, None, Some(&legacy))?; + assert_eq!(without_new.qc_clusters.len(), with_new.qc_clusters.len()); + assert_eq!(without_new.qc_clusters[0].start, with_new.qc_clusters[0].start); + assert_eq!(without_new.qc_clusters[0].end, with_new.qc_clusters[0].end); + assert_eq!(without_new.qc_clusters[0].count, with_new.qc_clusters[0].count); + assert_eq!(0, with_new.results.results[0].counts.clusters); + assert_eq!(1, with_new.qc_clusters.len()); + Ok(()) + } + + #[test] + fn test_mutation_patterns_multiple_configs() -> Result<(), Report> { + let subs = vec![ + make_sub(5, Nuc::T, Nuc::C), + make_sub(10, Nuc::A, Nuc::G), + make_sub(15, Nuc::T, Nuc::C), + make_sub(20, Nuc::T, Nuc::C), + make_sub(25, Nuc::A, Nuc::G), + make_sub(30, Nuc::T, Nuc::C), + make_sub(35, Nuc::T, Nuc::C), + make_sub(40, Nuc::T, Nuc::C), + ]; + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let config = wrap(vec![ + MutationPatternConfig { + description: Some("T>C editing".to_owned()), + ..pattern_substitution("tc", "T>C", vec![Nuc::T], vec![Nuc::C], vec![], 100, 5) + }, + MutationPatternConfig { + description: Some("A>G editing".to_owned()), + ..pattern_substitution("ag", "A>G", vec![Nuc::A], vec![Nuc::G], vec![], 100, 1) + }, + ]); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), None)?; + let results = &analysis.results.results; + assert_eq!(2, results.len()); + assert_eq!(1, results[0].counts.clusters); + assert_eq!(Some("T>C editing"), results[0].description.as_deref()); + assert_eq!(1, results[1].counts.clusters); + assert_eq!(Some("A>G editing"), results[1].description.as_deref()); + for cluster in &results[0].clusters { + assert_cluster_events_are_nuc_substitutions(cluster, Nuc::T, Nuc::C); + } + for cluster in &results[1].clusters { + assert_cluster_events_are_nuc_substitutions(cluster, Nuc::A, Nuc::G); + } + Ok(()) + } +} diff --git a/packages/nextclade/src/analyze/nuc_sub_context.rs b/packages/nextclade/src/analyze/nuc_sub_context.rs new file mode 100644 index 000000000..ba0b89330 --- /dev/null +++ b/packages/nextclade/src/analyze/nuc_sub_context.rs @@ -0,0 +1,75 @@ +use crate::alphabet::nuc::Nuc; +use crate::analyze::nuc_sub::NucSub; +use crate::coord::position::PositionLike; +use serde::{Deserialize, Serialize}; + +/// Nucleotide substitution with local reference nucleotide context centered on the substituted position. +/// +/// Current output contains `[upstream, current, downstream]`. `current` is the substituted reference nucleotide. +/// Boundary positions use gap characters for absent flanking nucleotides. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Hash)] +#[serde(rename_all = "camelCase")] +pub struct NucSubWithContext { + /// Nucleotide substitution. + #[serde(flatten)] + pub sub: NucSub, + + /// Reference nucleotide context around the substituted position. + pub ref_context: Vec, +} + +impl NucSubWithContext { + pub fn from_sub(sub: &NucSub, ref_seq: &[Nuc]) -> Self { + let pos = sub.pos.as_usize(); + let upstream = if pos > 0 { ref_seq[pos - 1] } else { Nuc::Gap }; + let current = ref_seq[pos]; + let downstream = if pos + 1 < ref_seq.len() { + ref_seq[pos + 1] + } else { + Nuc::Gap + }; + Self { + sub: sub.clone(), + ref_context: vec![upstream, current, downstream], + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::alphabet::nuc::to_nuc_seq; + use crate::coord::position::NucRefGlobalPosition; + use eyre::Report; + use pretty_assertions::assert_eq; + use rstest::rstest; + + fn make_sub(pos: usize, ref_nuc: Nuc, qry_nuc: Nuc) -> NucSub { + NucSub { + pos: NucRefGlobalPosition::from(pos), + ref_nuc, + qry_nuc, + } + } + + #[rustfmt::skip] + #[rstest] + #[case::interior( 5, "ACGTACGT", "ACG")] + #[case::first_pos( 0, "TAC", "-TA")] + #[case::last_pos( 2, "ACG", "CG-")] + #[case::single_base(0, "T", "-T-")] + #[trace] + fn test_nuc_sub_context_from_sub( + #[case] pos: usize, + #[case] ref_seq: &str, + #[case] expected_context: &str, + ) -> Result<(), Report> { + let ref_seq = to_nuc_seq(ref_seq)?; + let expected_context = to_nuc_seq(expected_context)?; + let sub = make_sub(pos, ref_seq[pos], Nuc::A); + let result = NucSubWithContext::from_sub(&sub, &ref_seq); + assert_eq!(expected_context, result.ref_context); + assert_eq!(ref_seq[pos], result.ref_context[1]); + Ok(()) + } +} diff --git a/packages/nextclade/src/analyze/virus_properties.rs b/packages/nextclade/src/analyze/virus_properties.rs index 59d447f7a..a9f6c7866 100644 --- a/packages/nextclade/src/analyze/virus_properties.rs +++ b/packages/nextclade/src/analyze/virus_properties.rs @@ -19,9 +19,9 @@ use crate::{o, vec_of_owned}; use eyre::{Report, WrapErr}; use maplit::btreemap; use ordered_float::OrderedFloat; -use schemars; use semver::Version; use serde::{Deserialize, Serialize}; +use smart_default::SmartDefault; use std::collections::BTreeMap; use std::path::Path; use validator::Validate; @@ -55,6 +55,169 @@ impl PathogenAttributes { } } +/// Mutation event filter in mutation pattern analysis. +/// +/// Each event selects one class of private mutations to include in a pattern. More event variants can be added without +/// changing the surrounding `mutationPatterns.patterns[]` structure. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(tag = "type", rename_all = "camelCase")] +#[schemars(example = "MutationPatternEvent::example")] +pub enum MutationPatternEvent { + /// Nucleotide substitution event, selected by reference nucleotide, query nucleotide, and optional reference motifs. + NucSubstitution(MutationPatternNucSubstitution), +} + +impl MutationPatternEvent { + pub fn example() -> Self { + Self::NucSubstitution(MutationPatternNucSubstitution::example()) + } + + pub const fn matches_all_contexts(&self) -> bool { + match self { + Self::NucSubstitution(event) => event.motifs.is_empty(), + } + } +} + +/// Filter for selecting nucleotide substitutions in mutation pattern analysis. +/// +/// A substitution matches this event when its reference nucleotide is in `ref`, its query nucleotide is in `qry`, and at +/// least one motif matches the surrounding reference sequence. If `motifs` is empty, the event matches without a sequence +/// context restriction. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternNucSubstitution::example")] +pub struct MutationPatternNucSubstitution { + /// Reference nucleotides to match at the mutated position. + #[serde(rename = "ref")] + pub ref_nucs: Vec, + + /// Query nucleotides to match at the mutated position. + pub qry: Vec, + + /// Regular expressions matched against the reference sequence. A mutation matches a motif when the regex match spans + /// the mutated position. Use IUPAC nucleotide letters directly in the regex, for example `TC[AT]` for a TCW motif. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub motifs: Vec, +} + +impl MutationPatternNucSubstitution { + pub fn example() -> Self { + Self { + ref_nucs: vec![Nuc::A, Nuc::T], + qry: vec![Nuc::G, Nuc::C], + motifs: vec_of_owned!["A[ACGT]G", "T[ACGT]C"], + } + } +} + +/// Clustering rule applied to events matching one mutation pattern. +/// +/// Clustering is pattern-local. It decides which matched events are reported as dense clusters in this pattern. It does +/// not change the global `qc.snpClusters` rule unless the QC rule itself is configured separately. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, SmartDefault, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[serde(default)] +#[schemars(example = "MutationPatternClusterConfig::example")] +pub struct MutationPatternClusterConfig { + /// Sliding nucleotide window size. A cluster is detected when `cutoff` matched events fall within this many reference + /// nucleotides. + #[default = 100] + pub window_size: usize, + + /// Minimum number of matched events in one sliding window required to report a cluster. + #[default = 5] + pub cutoff: usize, +} + +impl MutationPatternClusterConfig { + pub const fn example() -> Self { + Self { + window_size: 100, + cutoff: 5, + } + } +} + +/// Named mutation pattern: event filters, optional clustering, and display metadata. +/// +/// Dataset authors can define multiple patterns to separate biologically different mutation processes, such as +/// ADAR-like A-to-I editing and APOBEC-like cytosine deamination. +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternConfig::example")] +pub struct MutationPatternConfig { + /// Stable machine-readable identifier. This value appears in JSON and TSV output. + pub id: String, + + /// Human-readable name shown in Nextclade Web tooltips and reports. + pub name: String, + + /// Optional explanatory text shown together with the pattern result. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub description: Option, + + /// Event filters included in this pattern. If empty, the pattern matches all private nucleotide substitutions. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub events: Vec, + + /// Optional pattern-local clustering rule. If omitted, Nextclade reports matches and type counts, but no clusters for + /// this pattern. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub cluster: Option, +} + +impl MutationPatternConfig { + pub fn example() -> Self { + Self { + id: o!("adar"), + name: o!("ADAR-like RNA editing"), + description: Some(o!("ADAR-mediated A-to-I editing observed as A>G and complementary T>C")), + events: vec![MutationPatternEvent::NucSubstitution( + MutationPatternNucSubstitution::example(), + )], + cluster: Some(MutationPatternClusterConfig::example()), + } + } +} + +/// Configuration for mutation pattern analysis. Detects private mutations matching biologically meaningful mutation +/// type and reference-context rules. Pattern-specific clustering is independent from global `qc.snpClusters`. +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternsConfig::example")] +pub struct MutationPatternsConfig { + /// Mutation patterns evaluated independently for every analyzed sequence. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub patterns: Vec, +} + +impl MutationPatternsConfig { + pub fn example() -> Self { + Self { + patterns: vec![ + MutationPatternConfig::example(), + MutationPatternConfig { + id: o!("apobec"), + name: o!("APOBEC-like cytosine deamination"), + description: Some(o!( + "APOBEC-like cytosine deamination observed as C>T and complementary G>A" + )), + events: vec![MutationPatternEvent::NucSubstitution(MutationPatternNucSubstitution { + ref_nucs: vec![Nuc::C, Nuc::G], + qry: vec![Nuc::T, Nuc::A], + motifs: vec_of_owned!["TC[AT]", "[AT]GA"], + })], + cluster: Some(MutationPatternClusterConfig { + window_size: 50, + cutoff: 4, + }), + }, + ], + } + } +} + /// pathogen.json dataset file. Contains external configuration and data specific for a particular pathogen. #[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] @@ -95,6 +258,11 @@ pub struct VirusProperties { #[serde(default, skip_serializing_if = "Option::is_none")] pub qc: Option, + /// Mutation pattern analysis configuration. When present, detects private mutation patterns such as enzyme-associated + /// clustered substitution signatures. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub mutation_patterns: Option, + /// General analysis parameters (e.g. includeReference, inOrder, replaceUnknown). #[serde(default, skip_serializing_if = "Option::is_none")] pub general_params: Option, @@ -416,6 +584,7 @@ impl VirusProperties { cds_order_preference: vec_of_owned!["HA1", "HA2"], mut_labels: LabelledMutationsConfig::example(), qc: Some(QcConfig::example()), + mutation_patterns: Some(MutationPatternsConfig::example()), general_params: None, alignment_params: None, tree_builder_params: None, diff --git a/packages/nextclade/src/io/nextclade_csv.rs b/packages/nextclade/src/io/nextclade_csv.rs index d7ea92f57..5b4eaab64 100644 --- a/packages/nextclade/src/io/nextclade_csv.rs +++ b/packages/nextclade/src/io/nextclade_csv.rs @@ -15,6 +15,7 @@ pub fn prepare_headers( phenotype_attr_keys: &[String], ref_nodes: &AuspiceRefNodesDesc, aa_motifs_keys: &[String], + mutation_pattern_keys: &[String], column_config: &CsvColumnConfig, ) -> Vec { // Get names of enabled columns @@ -60,6 +61,21 @@ pub fn prepare_headers( }); } + if column_config.include_mut_patterns { + let mut insert_custom_cols_at_index = headers + .iter() + .position(|header| header == "clade") + .unwrap_or_else(|| headers.len().saturating_sub(1)) + .clamp(0, headers.len()); + + for pattern_key in mutation_pattern_keys { + for col in &mut_pattern_cols(pattern_key) { + insert_after(&mut headers, insert_custom_cols_at_index, col.to_owned()); + insert_custom_cols_at_index += 1; + } + } + } + if column_config.include_rel_muts { // Insert columns after this column index let mut insert_custom_cols_at_index = headers @@ -119,6 +135,15 @@ fn sort_headers_by_canonical_order(headers: Vec) -> Vec { .collect() } +pub fn mut_pattern_cols(pattern_id: &str) -> [String; 4] { + [ + format!("mutationPatterns['{pattern_id}'].counts.matches"), + format!("mutationPatterns['{pattern_id}'].counts.clustered"), + format!("mutationPatterns['{pattern_id}'].counts.clusters"), + format!("mutationPatterns['{pattern_id}'].eventTypeCounts"), + ] +} + fn clade_founder_cols(name: impl AsRef) -> [String; 5] { let name = name.as_ref(); [ @@ -196,6 +221,7 @@ impl NextcladeResultsCsvFileWriter { phenotype_attr_keys: &[String], ref_nodes: &AuspiceRefNodesDesc, aa_motifs_keys: &[String], + mutation_pattern_keys: &[String], column_config: &CsvColumnConfig, ) -> Result { let headers: Vec = prepare_headers( @@ -203,6 +229,7 @@ impl NextcladeResultsCsvFileWriter { phenotype_attr_keys, ref_nodes, aa_motifs_keys, + mutation_pattern_keys, column_config, ); let csv_writer = CsvVecFileWriter::new(filepath, delimiter, &headers)?; @@ -227,6 +254,7 @@ pub fn results_to_csv_string( phenotype_attr_keys: &[String], ref_nodes: &AuspiceRefNodesDesc, aa_motifs_keys: &[String], + mutation_pattern_keys: &[String], delimiter: u8, column_config: &CsvColumnConfig, ) -> Result { @@ -238,6 +266,7 @@ pub fn results_to_csv_string( phenotype_attr_keys, ref_nodes, aa_motifs_keys, + mutation_pattern_keys, column_config, ); let csv_writer = CsvVecWriter::new(&mut buf, delimiter, &headers)?; @@ -283,9 +312,10 @@ mod tests { include_dynamic: false, include_rel_muts: false, include_clade_founder_muts: false, + include_mut_patterns: false, }; - let headers = prepare_headers(&[], &[], &AuspiceRefNodesDesc::default(), &[], &column_config); + let headers = prepare_headers(&[], &[], &AuspiceRefNodesDesc::default(), &[], &[], &column_config); // Verify headers are in canonical order as defined in CSV_COLUMN_CONFIG_MAP_DEFAULT let expected_order = vec![ @@ -324,4 +354,152 @@ mod tests { assert_eq!(sorted, expected_order); } + + #[test] + fn test_nextclade_csv_mut_pattern_cols_single() { + let cols = mut_pattern_cols("adar"); + assert_eq!( + cols, + [ + "mutationPatterns['adar'].counts.matches", + "mutationPatterns['adar'].counts.clustered", + "mutationPatterns['adar'].counts.clusters", + "mutationPatterns['adar'].eventTypeCounts", + ] + ); + } + + #[test] + fn test_nextclade_csv_mut_pattern_cols_special_chars() { + let cols = mut_pattern_cols("rna_editing"); + assert_eq!(cols[0], "mutationPatterns['rna_editing'].counts.matches"); + } + + #[test] + fn test_nextclade_csv_prepare_headers_no_patterns() { + let column_config = CsvColumnConfig { + categories: indexmap! { + CsvColumnCategory::General => indexmap! { + o!("seqName") => true, + o!("clade") => true, + }, + }, + individual: vec![], + include_dynamic: false, + include_rel_muts: false, + include_clade_founder_muts: false, + include_mut_patterns: true, + }; + + let headers = prepare_headers(&[], &[], &AuspiceRefNodesDesc::default(), &[], &[], &column_config); + + assert!(!headers.iter().any(|h| h.contains("mutationPatterns"))); + } + + #[test] + fn test_nextclade_csv_prepare_headers_single_pattern() { + let column_config = CsvColumnConfig { + categories: indexmap! { + CsvColumnCategory::General => indexmap! { + o!("seqName") => true, + o!("clade") => true, + }, + }, + individual: vec![], + include_dynamic: false, + include_rel_muts: false, + include_clade_founder_muts: false, + include_mut_patterns: true, + }; + + let pattern_keys = vec![o!("adar")]; + let headers = prepare_headers( + &[], + &[], + &AuspiceRefNodesDesc::default(), + &[], + &pattern_keys, + &column_config, + ); + + let mut_cols: Vec<&str> = headers + .iter() + .filter(|h| h.contains("mutationPatterns")) + .map(String::as_str) + .collect(); + assert_eq!( + mut_cols, + vec![ + "mutationPatterns['adar'].counts.matches", + "mutationPatterns['adar'].counts.clustered", + "mutationPatterns['adar'].counts.clusters", + "mutationPatterns['adar'].eventTypeCounts", + ] + ); + } + + #[test] + fn test_nextclade_csv_prepare_headers_two_patterns() { + let column_config = CsvColumnConfig { + categories: indexmap! { + CsvColumnCategory::General => indexmap! { + o!("seqName") => true, + o!("clade") => true, + }, + }, + individual: vec![], + include_dynamic: false, + include_rel_muts: false, + include_clade_founder_muts: false, + include_mut_patterns: true, + }; + + let pattern_keys = vec![o!("adar"), o!("apobec")]; + let headers = prepare_headers( + &[], + &[], + &AuspiceRefNodesDesc::default(), + &[], + &pattern_keys, + &column_config, + ); + + let mut_cols: Vec<&str> = headers + .iter() + .filter(|h| h.contains("mutationPatterns")) + .map(String::as_str) + .collect(); + assert_eq!(mut_cols.len(), 8); + assert_eq!(mut_cols[0], "mutationPatterns['adar'].counts.matches"); + assert_eq!(mut_cols[4], "mutationPatterns['apobec'].counts.matches"); + } + + #[test] + fn test_nextclade_csv_prepare_headers_patterns_disabled() { + let column_config = CsvColumnConfig { + categories: indexmap! { + CsvColumnCategory::General => indexmap! { + o!("seqName") => true, + o!("clade") => true, + }, + }, + individual: vec![], + include_dynamic: false, + include_rel_muts: false, + include_clade_founder_muts: false, + include_mut_patterns: false, + }; + + let pattern_keys = vec![o!("adar")]; + let headers = prepare_headers( + &[], + &[], + &AuspiceRefNodesDesc::default(), + &[], + &pattern_keys, + &column_config, + ); + + assert!(!headers.iter().any(|h| h.contains("mutationPatterns"))); + } } diff --git a/packages/nextclade/src/io/nextclade_csv_column_config.rs b/packages/nextclade/src/io/nextclade_csv_column_config.rs index d9a4e2d33..540bab39d 100644 --- a/packages/nextclade/src/io/nextclade_csv_column_config.rs +++ b/packages/nextclade/src/io/nextclade_csv_column_config.rs @@ -26,6 +26,7 @@ pub enum CsvColumnCategory { RelMuts, ErrsWarns, Qc, + MutPatterns, Primers, Dynamic, } @@ -35,12 +36,14 @@ pub type CsvColumnConfigMap = IndexMap // Configuration for enabling/disabling CSV columns or categories of them #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] +#[allow(clippy::struct_excessive_bools)] pub struct CsvColumnConfig { pub categories: CsvColumnConfigMap, pub individual: Vec, pub include_dynamic: bool, pub include_clade_founder_muts: bool, pub include_rel_muts: bool, + pub include_mut_patterns: bool, } impl CsvColumnConfig { @@ -80,6 +83,8 @@ impl CsvColumnConfig { let include_clade_founder_muts = categories.contains(&CsvColumnCategory::CladeFounderMuts); + let include_mut_patterns = categories.contains(&CsvColumnCategory::MutPatterns); + let categories = categories .into_iter() .filter_map(|category| { @@ -94,6 +99,7 @@ impl CsvColumnConfig { include_dynamic, include_clade_founder_muts, include_rel_muts, + include_mut_patterns, }) } } @@ -107,6 +113,7 @@ impl Default for CsvColumnConfig { include_dynamic: true, include_clade_founder_muts: true, include_rel_muts: true, + include_mut_patterns: true, } } } diff --git a/packages/nextclade/src/io/nextclade_csv_row.rs b/packages/nextclade/src/io/nextclade_csv_row.rs index 9dc0b25e7..8325fa64b 100644 --- a/packages/nextclade/src/io/nextclade_csv_row.rs +++ b/packages/nextclade/src/io/nextclade_csv_row.rs @@ -6,10 +6,12 @@ use crate::analyze::aa_sub::{AaSub, AaSubLabeled}; use crate::analyze::find_aa_motifs::AaMotif; use crate::analyze::find_clade_founder::CladeNodeAttrFounderInfo; use crate::analyze::letter_ranges::{CdsAaRange, NucRange}; +use crate::analyze::mutation_patterns::MutationPatternEventTypeCount; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::nuc_sub::{NucSub, NucSubLabeled}; use crate::analyze::pcr_primer_changes::PcrPrimerChange; use crate::coord::range::NucRefGlobalRange; +use crate::io::nextclade_csv::mut_pattern_cols; use crate::o; use crate::qc::qc_config::StopCodonLocation; use crate::qc::qc_rule_snp_clusters::ClusteredSnp; @@ -66,6 +68,7 @@ impl NextcladeResultsCsvRow { clade, private_nuc_mutations, private_aa_mutations, + mutation_patterns, missing_cdses, // divergence, coverage, @@ -429,6 +432,16 @@ impl NextcladeResultsCsvRow { "qc.stopCodons.status", qc.stop_codons.as_ref().map(|sc| sc.status.to_string()), )?; + for pattern in &mutation_patterns.results { + let [col_matches, col_clustered, col_clusters, col_event_type_counts] = mut_pattern_cols(&pattern.id); + self.add_entry(col_matches, &pattern.counts.matches)?; + self.add_entry(col_clustered, &pattern.counts.clustered)?; + self.add_entry(col_clusters, &pattern.counts.clusters)?; + self.add_entry( + col_event_type_counts, + &format_mutation_pattern_event_type_counts(&pattern.event_type_counts, ARRAY_ITEM_DELIMITER), + )?; + } self.add_entry("isReverseComplement", &is_reverse_complement.to_string())?; self.add_entry("failedCdses", &format_failed_cdses(missing_cdses, ARRAY_ITEM_DELIMITER))?; self.add_entry( @@ -671,9 +684,26 @@ pub fn format_clustered_snps(snps: &[ClusteredSnp], delimiter: &str) -> String { snps .iter() .map(|snp| { - let range = NucRefGlobalRange::from_usize(snp.start, snp.end).to_string(); - let number_of_snps = snp.number_of_snps; - format!("{range}:{number_of_snps}") + let range = NucRefGlobalRange::from_usize(snp.start, snp.end + 1).to_string(); + let count = snp.number_of_snps; + format!("{range}:{count}") + }) + .join(delimiter) +} + +#[inline] +pub fn format_mutation_pattern_event_type_counts(counts: &[MutationPatternEventTypeCount], delimiter: &str) -> String { + counts + .iter() + .map(|c| match c { + MutationPatternEventTypeCount::NucSubstitution(c) => { + format!( + "nucSubstitution:{}>{}:{}", + from_nuc(c.ref_nuc), + from_nuc(c.qry_nuc), + c.count + ) + } }) .join(delimiter) } diff --git a/packages/nextclade/src/io/xlsx.rs b/packages/nextclade/src/io/xlsx.rs index dd2126bed..608e94dae 100644 --- a/packages/nextclade/src/io/xlsx.rs +++ b/packages/nextclade/src/io/xlsx.rs @@ -24,6 +24,7 @@ pub fn results_to_excel_sheet( &initial_data.phenotype_attr_keys, &initial_data.ref_nodes, &initial_data.aa_motif_keys, + &initial_data.mutation_pattern_keys, column_config, ); diff --git a/packages/nextclade/src/qc/qc_config.rs b/packages/nextclade/src/qc/qc_config.rs index 6a4efe6be..a2f329355 100644 --- a/packages/nextclade/src/qc/qc_config.rs +++ b/packages/nextclade/src/qc/qc_config.rs @@ -118,21 +118,25 @@ const fn one() -> OrderedFloat { #[schemars(example = "QcRulesConfigSnpClusters::example")] pub struct QcRulesConfigSnpClusters { pub enabled: bool, - /// Size of the sliding window (in nucleotides) used to count private mutations - pub window_size: usize, - /// Number of private mutations within a window above which the window is flagged as a cluster - pub cluster_cut_off: usize, /// QC score added per detected cluster pub score_weight: OrderedFloat, + + /// Size of the sliding nucleotide window for global SNP cluster detection + #[serde(default)] + pub window_size: usize, + + /// Number of substitutions within a window required to count as a global SNP cluster + #[serde(default)] + pub cluster_cut_off: usize, } impl QcRulesConfigSnpClusters { pub const fn example() -> Self { Self { enabled: true, + score_weight: OrderedFloat(50.0), window_size: 100, cluster_cut_off: 5, - score_weight: OrderedFloat(50.0), } } } diff --git a/packages/nextclade/src/qc/qc_rule_snp_clusters.rs b/packages/nextclade/src/qc/qc_rule_snp_clusters.rs index 9c0a66d24..f75c308a0 100644 --- a/packages/nextclade/src/qc/qc_rule_snp_clusters.rs +++ b/packages/nextclade/src/qc/qc_rule_snp_clusters.rs @@ -1,12 +1,8 @@ -use crate::analyze::find_private_nuc_mutations::PrivateNucMutations; -use crate::analyze::nuc_sub::NucSub; -use crate::coord::position::PositionLike; +use crate::analyze::mutation_patterns::MutationPatternCluster; use crate::qc::qc_config::QcRulesConfigSnpClusters; use crate::qc::qc_run::{QcRule, QcStatus}; -use itertools::Itertools; use num::traits::clamp_min; use serde::{Deserialize, Serialize}; -use std::collections::VecDeque; /// A cluster of private nucleotide substitutions within a genomic window. /// @@ -51,22 +47,24 @@ impl QcRule for QcResultSnpClusters { } pub fn rule_snp_clusters( - private_nuc_mutations: &PrivateNucMutations, + qc_clusters: &[MutationPatternCluster], config: &QcRulesConfigSnpClusters, ) -> Option { if !config.enabled { return None; } - let mut snp_clusters = find_snp_clusters(&private_nuc_mutations.private_substitutions, config); - for cluster in &mut snp_clusters { - cluster.sort_unstable(); - } - - let total_clusters = snp_clusters.len(); + let clustered_snps: Vec = qc_clusters + .iter() + .map(|c| ClusteredSnp { + start: c.start, + end: c.end, + number_of_snps: c.count, + }) + .collect(); - let clustered_snps = process_snp_clusters(snp_clusters); - let total_snps = clustered_snps.iter().map(|cluster| cluster.number_of_snps).sum(); + let total_clusters = clustered_snps.len(); + let total_snps = clustered_snps.iter().map(|c| c.number_of_snps).sum(); let score = clamp_min(total_clusters as f64 * *config.score_weight, 0.0); let status = QcStatus::from_score(score); @@ -79,49 +77,68 @@ pub fn rule_snp_clusters( }) } -fn find_snp_clusters(private_nuc_mutations: &[NucSub], config: &QcRulesConfigSnpClusters) -> Vec> { - let mut current_cluster = VecDeque::::new(); - let mut all_clusters = Vec::>::new(); - let mut previous_pos: isize = -1; - for mutation in private_nuc_mutations { - let pos = mutation.pos.as_isize(); - current_cluster.push_back(pos); - - while current_cluster[0] < (pos - config.window_size as isize) { - current_cluster.pop_front(); +#[cfg(test)] +#[allow( + clippy::float_cmp, + reason = "SNP cluster QC scores are exact products of integer cluster counts and fixed score weights" +)] +mod tests { + use super::*; + use ordered_float::OrderedFloat; + use pretty_assertions::assert_eq; + + fn make_config(enabled: bool, score_weight: f64) -> QcRulesConfigSnpClusters { + QcRulesConfigSnpClusters { + enabled, + score_weight: OrderedFloat(score_weight), + window_size: 0, + cluster_cut_off: 0, } + } - if current_cluster.len() > config.cluster_cut_off { - let n_clusters = all_clusters.len(); - - if !all_clusters.is_empty() && current_cluster.len() > 1 { - let i = n_clusters - 1; - let j = all_clusters[i].len() - 1; - let p = all_clusters[i][j]; - - if p == previous_pos { - all_clusters[i].push(pos); - } else { - all_clusters.push(current_cluster.iter().copied().collect_vec()); - } - } else { - all_clusters.push(current_cluster.iter().copied().collect_vec()); - } + fn make_cluster(start: usize, end: usize, n: usize) -> MutationPatternCluster { + MutationPatternCluster { + start, + end, + count: n, + ..MutationPatternCluster::default() } - previous_pos = pos; } - all_clusters -} + #[test] + fn test_rule_snp_clusters_disabled() { + let result = rule_snp_clusters(&[], &make_config(false, 50.0)); + assert!(result.is_none()); + } + + #[test] + fn test_rule_snp_clusters_empty() { + let result = rule_snp_clusters(&[], &make_config(true, 50.0)).expect("should be Some"); + assert_eq!(0.0, result.score); + assert!(matches!(result.status, QcStatus::Good)); + assert_eq!(0, result.total_snps); + assert!(result.clustered_snps.is_empty()); + } + + #[test] + fn test_rule_snp_clusters_one_cluster() { + let clusters = vec![make_cluster(100, 200, 6)]; + let result = rule_snp_clusters(&clusters, &make_config(true, 50.0)).expect("should be Some"); + assert_eq!(50.0, result.score); + assert!(matches!(result.status, QcStatus::Mediocre)); + assert_eq!(6, result.total_snps); + assert_eq!(1, result.clustered_snps.len()); + assert_eq!(100, result.clustered_snps[0].start); + assert_eq!(200, result.clustered_snps[0].end); + assert_eq!(6, result.clustered_snps[0].number_of_snps); + } -fn process_snp_clusters(snp_clusters: Vec>) -> Vec { - let mut result = Vec::with_capacity(snp_clusters.len()); - for cluster in snp_clusters { - result.push(ClusteredSnp { - start: cluster[0] as usize, - end: cluster[cluster.len() - 1] as usize, - number_of_snps: cluster.len(), - }); + #[test] + fn test_rule_snp_clusters_two_clusters_bad() { + let clusters = vec![make_cluster(100, 200, 6), make_cluster(500, 600, 8)]; + let result = rule_snp_clusters(&clusters, &make_config(true, 50.0)).expect("should be Some"); + assert_eq!(100.0, result.score); + assert!(matches!(result.status, QcStatus::Bad)); + assert_eq!(14, result.total_snps); } - result } diff --git a/packages/nextclade/src/qc/qc_run.rs b/packages/nextclade/src/qc/qc_run.rs index e74640779..c47722fdf 100644 --- a/packages/nextclade/src/qc/qc_run.rs +++ b/packages/nextclade/src/qc/qc_run.rs @@ -1,5 +1,6 @@ use crate::alphabet::nuc::Nuc; use crate::analyze::find_private_nuc_mutations::PrivateNucMutations; +use crate::analyze::mutation_patterns::MutationPatternCluster; use crate::qc::qc_config::QcConfig; use crate::qc::qc_rule_frame_shifts::{QcResultFrameShifts, rule_frame_shifts}; use crate::qc::qc_rule_missing_data::{QcResultMissingData, rule_missing_data}; @@ -82,6 +83,7 @@ pub trait QcRule { pub fn qc_run( private_nuc_mutations: &PrivateNucMutations, + qc_clusters: &[MutationPatternCluster], nucleotide_composition: &BTreeMap, total_missing: usize, translation: &Translation, @@ -92,7 +94,7 @@ pub fn qc_run( missing_data: rule_missing_data(total_missing, &config.missing_data), mixed_sites: rule_mixed_sites(nucleotide_composition, &config.mixed_sites), private_mutations: rule_private_mutations(private_nuc_mutations, &config.private_mutations), - snp_clusters: rule_snp_clusters(private_nuc_mutations, &config.snp_clusters), + snp_clusters: rule_snp_clusters(qc_clusters, &config.snp_clusters), frame_shifts: rule_frame_shifts(frame_shifts, &config.frame_shifts), stop_codons: rule_stop_codons(translation, &config.stop_codons), overall_score: 0.0, diff --git a/packages/nextclade/src/run/nextclade_run_one.rs b/packages/nextclade/src/run/nextclade_run_one.rs index cdc647fc4..a90aa967b 100644 --- a/packages/nextclade/src/run/nextclade_run_one.rs +++ b/packages/nextclade/src/run/nextclade_run_one.rs @@ -26,6 +26,7 @@ use crate::analyze::letter_composition::get_letter_composition; use crate::analyze::letter_ranges::{ CdsAaRange, NucRange, find_aa_letter_ranges, find_letter_ranges, find_letter_ranges_by, }; +use crate::analyze::mutation_patterns::analyze_mutation_patterns; use crate::analyze::nuc_alignment::NucAlignment; use crate::analyze::nuc_changes::{FindNucChangesOutput, find_nuc_changes}; use crate::analyze::nuc_del::NucDelRange; @@ -428,12 +429,21 @@ pub fn nextclade_run_one( let aa_motifs = find_aa_motifs(&virus_properties.aa_motifs, &translation)?; let aa_motifs_changes = find_aa_motifs_changes(aa_motifs_ref, &aa_motifs, ref_translation, &translation)?; + let mutation_pattern_analysis = analyze_mutation_patterns( + &private_nuc_mutations, + ref_seq, + virus_properties.mutation_patterns.as_ref(), + virus_properties.qc.as_ref().map(|qc| &qc.snp_clusters), + )?; + let mutation_patterns = mutation_pattern_analysis.results; + let qc = virus_properties .qc .as_ref() .map(|qc_config| { qc_run( &private_nuc_mutations, + &mutation_pattern_analysis.qc_clusters, &nucleotide_composition, total_missing, &translation, @@ -511,6 +521,7 @@ pub fn nextclade_run_one( clade, private_nuc_mutations, private_aa_mutations, + mutation_patterns, clade_founder_info, clade_node_attr_founder_info, ref_nodes: ref_nodes.to_owned(), diff --git a/packages/nextclade/src/run/nextclade_wasm.rs b/packages/nextclade/src/run/nextclade_wasm.rs index f091cf9d1..2a487db26 100644 --- a/packages/nextclade/src/run/nextclade_wasm.rs +++ b/packages/nextclade/src/run/nextclade_wasm.rs @@ -246,6 +246,7 @@ pub struct AnalysisInitialData { pub ref_nodes: AuspiceRefNodesDesc, pub aa_motifs_descs: Vec, pub aa_motif_keys: Vec, + pub mutation_pattern_keys: Vec, pub csv_column_config_default: CsvColumnConfig, } @@ -430,6 +431,12 @@ impl Nextclade { ref_nodes: self.ref_nodes.clone(), aa_motifs_descs: self.aa_motifs_descs.clone(), aa_motif_keys: self.aa_motifs_keys.clone(), + mutation_pattern_keys: self + .virus_properties + .mutation_patterns + .as_ref() + .map(|mp| mp.patterns.iter().map(|p| p.id.clone()).collect()) + .unwrap_or_default(), csv_column_config_default: CsvColumnConfig::default(), } } diff --git a/packages/nextclade/src/types/outputs.rs b/packages/nextclade/src/types/outputs.rs index 255290228..c7791fc1c 100644 --- a/packages/nextclade/src/types/outputs.rs +++ b/packages/nextclade/src/types/outputs.rs @@ -10,6 +10,7 @@ use crate::analyze::find_private_nuc_mutations::PrivateNucMutations; use crate::analyze::find_relative_aa_mutations::RelativeAaMutations; use crate::analyze::find_relative_nuc_mutations::RelativeNucMutations; use crate::analyze::letter_ranges::{CdsAaRange, NucRange}; +use crate::analyze::mutation_patterns::MutationPatternsResults; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::nuc_sub::NucSub; use crate::analyze::pcr_primer_changes::PcrPrimerChange; @@ -137,6 +138,8 @@ pub struct NextcladeOutputs { pub private_nuc_mutations: PrivateNucMutations, /// Per-CDS amino acid mutations not shared with the nearest reference tree node pub private_aa_mutations: BTreeMap, + /// Per-type mutation statistics, local reference context, and detected mutation clusters + pub mutation_patterns: MutationPatternsResults, /// Mutations relative to the clade founder node pub clade_founder_info: Option, /// Per-attribute mutations relative to founder nodes, keyed by attribute name