Skip to main content

gapsmith_find/
classify.rs

1//! Hit classification — port of `src/analyse_alignments.R:108–143`.
2//!
3//! Inputs:
4//! - per-reaction reference-FASTA presence (did we have sequences to search?)
5//! - raw alignment hits for this reaction (possibly many, possibly none)
6//! - bitscore / identity / coverage / exception-EC cutoffs
7//!
8//! Output: a [`HitStatus`] plus the best hit (largest bitscore) when present.
9
10use crate::types::HitStatus;
11use gapsmith_align::Hit;
12use std::collections::HashSet;
13
14#[derive(Debug, Clone)]
15pub struct ClassifyOptions<'a> {
16    pub bitcutoff: f32,
17    pub identcutoff: f32,
18    /// Identity cutoff applied only when the reaction's EC is listed in
19    /// `dat/exception.tbl` (known "false-friend" enzymes).
20    pub ident_exception: f32,
21    /// Precomputed set of EC strings that require the exception cutoff.
22    pub exception_ecs: &'a HashSet<String>,
23}
24
25/// Classify a single reaction's hits. Returns `(status, is_exception,
26/// best_hit_index)` — the index points back into `hits` so the caller can
27/// attach the winning hit's fields to the reaction row.
28pub fn classify_reaction(
29    hits: &[Hit],
30    ec: &str,
31    has_seq_data: bool,
32    spont: bool,
33    opts: &ClassifyOptions<'_>,
34) -> (HitStatus, bool, Option<usize>) {
35    let exception = ec_is_exception(ec, opts.exception_ecs);
36
37    if hits.is_empty() {
38        let status = if !has_seq_data && spont {
39            HitStatus::Spontaneous
40        } else if has_seq_data {
41            HitStatus::NoBlast
42        } else {
43            HitStatus::NoSeqData
44        };
45        return (status, exception, None);
46    }
47
48    // Pick the top-bitscore hit. In the R pipeline ties would sort by
49    // (-bitscore, stitle, complex) after the complex join; we keep a
50    // single representative for M5's no-complex regime.
51    let (idx, best) = hits
52        .iter()
53        .enumerate()
54        .max_by(|a, b| a.1.bitscore.partial_cmp(&b.1.bitscore).unwrap_or(std::cmp::Ordering::Equal))
55        .unwrap();
56
57    let pass_main = best.bitscore >= opts.bitcutoff && best.pident >= opts.identcutoff;
58    let pass_exc = if exception { best.pident >= opts.ident_exception } else { true };
59
60    let status = if pass_main && pass_exc {
61        HitStatus::GoodBlast
62    } else {
63        HitStatus::BadBlast
64    };
65    (status, exception, Some(idx))
66}
67
68/// Decide whether an EC belongs in the exception table. Handles both
69/// exact-match EC codes and EC/EC-style entries (`enzyme/reaction` column
70/// in `dat/exception.tbl`).
71pub fn ec_is_exception(ec: &str, exception_set: &HashSet<String>) -> bool {
72    for token in ec.split('/').map(str::trim).filter(|s| !s.is_empty()) {
73        if exception_set.contains(token) {
74            return true;
75        }
76    }
77    false
78}
79
80/// Legacy one-shot classifier preserved for API callers that operate on a
81/// slice of `(bitscore, pident)` pairs. Not used by the runner.
82pub fn classify_hits(
83    scores: &[(f32, f32)],
84    ec: &str,
85    has_seq_data: bool,
86    spont: bool,
87    opts: &ClassifyOptions<'_>,
88) -> HitStatus {
89    let exception = ec_is_exception(ec, opts.exception_ecs);
90    if scores.is_empty() {
91        if !has_seq_data && spont {
92            return HitStatus::Spontaneous;
93        }
94        if has_seq_data {
95            return HitStatus::NoBlast;
96        }
97        return HitStatus::NoSeqData;
98    }
99    let (b, p) = scores
100        .iter()
101        .cloned()
102        .fold((f32::MIN, f32::MIN), |(ab, ap), (nb, np)| (ab.max(nb), ap.max(np)));
103    if b >= opts.bitcutoff && p >= opts.identcutoff {
104        if exception && p < opts.ident_exception {
105            HitStatus::BadBlast
106        } else {
107            HitStatus::GoodBlast
108        }
109    } else {
110        HitStatus::BadBlast
111    }
112}
113
114#[cfg(test)]
115mod tests {
116    use super::*;
117
118    fn opts(exc: &HashSet<String>) -> ClassifyOptions<'_> {
119        ClassifyOptions {
120            bitcutoff: 200.0,
121            identcutoff: 30.0,
122            ident_exception: 70.0,
123            exception_ecs: exc,
124        }
125    }
126
127    fn hit(bit: f32, pid: f32) -> Hit {
128        Hit {
129            qseqid: "q".into(),
130            pident: pid,
131            evalue: 0.0,
132            bitscore: bit,
133            qcov: 100.0,
134            stitle: "t".into(),
135            sstart: 1,
136            send: 100,
137        }
138    }
139
140    #[test]
141    fn good_blast_above_cutoffs() {
142        let e = HashSet::new();
143        let (s, _, _) = classify_reaction(&[hit(250.0, 50.0)], "1.1.1.1", true, false, &opts(&e));
144        assert_eq!(s, HitStatus::GoodBlast);
145    }
146
147    #[test]
148    fn bad_blast_below_bitscore() {
149        let e = HashSet::new();
150        let (s, _, _) = classify_reaction(&[hit(100.0, 50.0)], "1.1.1.1", true, false, &opts(&e));
151        assert_eq!(s, HitStatus::BadBlast);
152    }
153
154    #[test]
155    fn exception_requires_higher_identity() {
156        let mut e = HashSet::new();
157        e.insert("7.1.1.9".into());
158        let (s, exc, _) =
159            classify_reaction(&[hit(250.0, 50.0)], "7.1.1.9", true, false, &opts(&e));
160        assert_eq!(s, HitStatus::BadBlast);
161        assert!(exc);
162        let (s, _, _) = classify_reaction(&[hit(250.0, 80.0)], "7.1.1.9", true, false, &opts(&e));
163        assert_eq!(s, HitStatus::GoodBlast);
164    }
165
166    #[test]
167    fn no_blast_when_seq_data_but_no_hit() {
168        let e = HashSet::new();
169        let (s, _, _) = classify_reaction(&[], "1.1.1.1", true, false, &opts(&e));
170        assert_eq!(s, HitStatus::NoBlast);
171    }
172
173    #[test]
174    fn no_seq_data_when_nothing() {
175        let e = HashSet::new();
176        let (s, _, _) = classify_reaction(&[], "1.1.1.1", false, false, &opts(&e));
177        assert_eq!(s, HitStatus::NoSeqData);
178    }
179
180    #[test]
181    fn spontaneous_when_marked_and_no_data() {
182        let e = HashSet::new();
183        let (s, _, _) = classify_reaction(&[], "", false, true, &opts(&e));
184        assert_eq!(s, HitStatus::Spontaneous);
185    }
186}