Skip to main content

gapsmith_find/
runner.rs

1//! End-to-end driver for `gapsmith find`.
2//!
3//! Orchestrates pathway expansion, seqfile resolution, one concatenated
4//! alignment, per-reaction classification, and pathway completeness
5//! scoring. Returns a [`FindReport`] that downstream code (CLI + output
6//! writers) consumes.
7//!
8//! Complex / subunit detection lives in [`crate::complex`]; this runner
9//! invokes it per-reaction to populate `is_complex` / `complex_status`.
10
11use crate::classify::ClassifyOptions;
12use crate::complex::detect_subunits;
13use crate::dbhit::DbhitIndex;
14use crate::pathways::{self, ExpandedReaction, MatchMode, PathwaySelectOptions};
15use crate::seqfile::{self, SeqfileOptions};
16use crate::types::{HitStatus, PathwayResult, PwyStatus, ReactionHit};
17use gapsmith_align::{AlignOpts, Aligner, Hit};
18use gapsmith_db::{ComplexSubunitTable, ExceptionRow, PathwayTable};
19use std::collections::{BTreeMap, HashMap, HashSet};
20use std::fs::File;
21use std::io::{BufRead, BufReader, BufWriter, Read, Write};
22use std::path::Path;
23
24#[derive(Debug, thiserror::Error)]
25pub enum FindError {
26    #[error("alignment error: {0}")]
27    Align(#[from] gapsmith_align::AlignError),
28    #[error("i/o error on `{path}`: {source}")]
29    Io { path: std::path::PathBuf, #[source] source: std::io::Error },
30    #[error("no reference FASTAs resolved — check --seq-dir and --taxonomy")]
31    NoReferences,
32}
33
34#[derive(Debug, Clone)]
35pub struct FindOptions<'a> {
36    pub keyword: &'a str,
37    pub match_mode: MatchMode,
38    pub exclude_superpathways: bool,
39    pub only_active: bool,
40    pub bitcutoff: f32,
41    pub identcutoff: f32,
42    pub ident_exception: f32,
43    /// Coverage cutoff passed through to the aligner. Not re-checked in
44    /// classification — we trust the aligner to filter.
45    pub coverage_pct: u32,
46    /// Fraction-of-reactions threshold above which vague (no_seq_data)
47    /// reactions dilute completeness. Matches the `vagueCutoff` arg in
48    /// analyse_alignments.R.
49    pub vague_cutoff: f32,
50    /// Main completeness threshold, 0–1 (e.g. 0.80).
51    pub completeness_hint_off: f32,
52    /// Relaxed threshold applied when all key reactions are present, 0–1.
53    pub completeness_hint_on: f32,
54    pub strict_candidates: bool,
55    /// Fraction of known subunits needed for a complex to count as
56    /// present (0–1). Default 0.5.
57    pub subunit_cutoff: f32,
58    /// Tax IDs kept when filtering by `taxrange`. Empty → no filter.
59    pub valid_tax_ids: &'a [String],
60}
61
62impl<'a> Default for FindOptions<'a> {
63    fn default() -> Self {
64        Self {
65            keyword: "all",
66            match_mode: MatchMode::Regex,
67            exclude_superpathways: true,
68            only_active: true,
69            bitcutoff: 200.0,
70            identcutoff: 0.0,
71            ident_exception: 70.0,
72            coverage_pct: 75,
73            vague_cutoff: 0.3,
74            completeness_hint_off: 0.80,
75            completeness_hint_on: 0.66,
76            strict_candidates: false,
77            subunit_cutoff: 0.5,
78            valid_tax_ids: &[],
79        }
80    }
81}
82
83#[derive(Debug, Clone)]
84pub struct FindReport {
85    pub reactions: Vec<ReactionHit>,
86    pub pathways: Vec<PathwayResult>,
87}
88
89/// Run the full find pipeline.
90#[allow(clippy::too_many_arguments)]
91pub fn run_find(
92    table: &PathwayTable,
93    exceptions: &[ExceptionRow],
94    subunit_dict: &ComplexSubunitTable,
95    dbhit_index: &DbhitIndex,
96    seq_opts: &SeqfileOptions,
97    genome_fasta: &Path,
98    aligner: &dyn Aligner,
99    align_opts: &AlignOpts,
100    opts: &FindOptions<'_>,
101    workdir: &Path,
102) -> Result<FindReport, FindError> {
103    std::fs::create_dir_all(workdir).map_err(|e| FindError::Io {
104        path: workdir.to_path_buf(),
105        source: e,
106    })?;
107
108    // 1. Pick pathways and expand to reactions.
109    let expanded = pathways::select(
110        table,
111        &PathwaySelectOptions {
112            keyword: opts.keyword,
113            mode: opts.match_mode,
114            exclude_superpathways: opts.exclude_superpathways,
115            only_active: opts.only_active,
116            valid_tax_ids: opts.valid_tax_ids,
117        },
118    );
119    tracing::info!(reactions = expanded.len(), "pathways expanded to reactions");
120    if expanded.is_empty() {
121        return Ok(FindReport { reactions: Vec::new(), pathways: Vec::new() });
122    }
123
124    // 2. For each unique reaction, resolve reference FASTAs and scan each
125    //    one for subunit assignments.
126    //
127    // Multiple reactions frequently share the same reference FASTA (two
128    // reactions with the same EC → same `rev/<EC>.fasta`). Cache the
129    // parsed header list keyed by absolute path so each file is read
130    // and parsed exactly once per `find` invocation. Entries are wrapped
131    // in `Arc` so the cache holds ownership while callers borrow the
132    // parsed headers freely.
133    let unique_reactions = dedup_reactions(&expanded);
134    let mut seq_by_key: HashMap<ReactionKey, Vec<seqfile::ResolvedSeq>> = HashMap::new();
135    let mut subunit_by_qseqid: HashMap<(String, String), Option<String>> = HashMap::new();
136    let mut subunit_count_by_key: HashMap<ReactionKey, (bool, u32, String)> = HashMap::new();
137    let mut fasta_header_cache: HashMap<std::path::PathBuf, std::sync::Arc<Vec<(String, String)>>> =
138        HashMap::new();
139    for k in &unique_reactions {
140        let resolved = seqfile::resolve_for_reaction(seq_opts, &k.rxn, &k.ec, &k.name);
141        // Scan every resolved fasta's headers for subunit clues.
142        let mut desc_owned: Vec<(String, String, String)> = Vec::new();
143        for r in &resolved {
144            let entries = fasta_header_cache
145                .entry(r.path.clone())
146                .or_insert_with(|| {
147                    std::sync::Arc::new(read_fasta_headers(&r.path).unwrap_or_default())
148                })
149                .clone();
150            for (acc, full) in entries.iter() {
151                desc_owned.push((r.label.clone(), acc.clone(), full.clone()));
152            }
153        }
154        // Run detect_subunits across the combined descriptor list (gapseq's
155        // R code operates on the unioned set per reaction).
156        let refs: Vec<&str> =
157            desc_owned.iter().map(|(_, _, full)| full.as_str()).collect();
158        let assigns = detect_subunits(&k.rxn, &refs, subunit_dict);
159        let mut distinct: HashSet<String> = HashSet::new();
160        for ((label, acc, _), su) in desc_owned.iter().zip(assigns.iter()) {
161            subunit_by_qseqid.insert((label.clone(), acc.clone()), su.clone());
162            if let Some(name) = su {
163                if name != "Subunit undefined" {
164                    distinct.insert(name.clone());
165                }
166            }
167        }
168        let subunit_count = distinct.len() as u32;
169        let is_complex = subunit_count >= 2;
170        let mut list: Vec<String> = distinct.into_iter().collect();
171        list.sort();
172        subunit_count_by_key
173            .insert(k.clone(), (is_complex, subunit_count, list.join(",")));
174        seq_by_key.insert(k.clone(), resolved);
175    }
176
177    // 3. Concatenate all reference fastas into one query.faa.
178    let query_path = workdir.join("query.faa");
179    let n_concat = concat_refs(&query_path, &seq_by_key)?;
180    tracing::info!(seqfiles = n_concat, "concatenated reference fastas");
181
182    // 4. If nothing to align, emit empty results.
183    let hits_by_file: HashMap<String, Vec<Hit>> = if n_concat > 0 {
184        let raw_hits = aligner.align(&query_path, genome_fasta, align_opts)?;
185        tracing::info!(hits = raw_hits.len(), "alignment complete");
186        group_by_file(raw_hits)
187    } else {
188        HashMap::new()
189    };
190
191    // 5. Build per-reaction classified results.
192    let exception_set: HashSet<String> = exceptions.iter().map(|e| e.id.clone()).collect();
193    let classify_opts = ClassifyOptions {
194        bitcutoff: opts.bitcutoff,
195        identcutoff: opts.identcutoff,
196        ident_exception: opts.ident_exception,
197        exception_ecs: &exception_set,
198    };
199
200    let mut reactions: Vec<ReactionHit> = Vec::with_capacity(expanded.len());
201    for e in &expanded {
202        // R emits one row per unique (rxn, name, ec, stitle, complex). We
203        // reproduce that here: if the reaction has ≥ 1 hit, emit one row
204        // per hit; otherwise emit one placeholder row.
205        let rows = build_reaction_rows(
206            e,
207            &seq_by_key,
208            &hits_by_file,
209            &subunit_by_qseqid,
210            &subunit_count_by_key,
211            dbhit_index,
212            &classify_opts,
213            opts.subunit_cutoff,
214        );
215        reactions.extend(rows);
216    }
217
218    // 5b. Sort reactions by (pathway, rxn, complex, -bitscore). Matches
219    //     R's `rxndt <- rxndt[order(pathway, rxn, complex, -bitscore)]`.
220    reactions.sort_by(|a, b| {
221        a.pathway
222            .cmp(&b.pathway)
223            .then_with(|| a.rxn.cmp(&b.rxn))
224            .then_with(|| {
225                a.complex
226                    .as_deref()
227                    .unwrap_or("")
228                    .cmp(b.complex.as_deref().unwrap_or(""))
229            })
230            .then_with(|| {
231                b.bitscore
232                    .partial_cmp(&a.bitscore)
233                    .unwrap_or(std::cmp::Ordering::Equal)
234            })
235    });
236
237    // 6. Pathway scoring.
238    let pathway_results = score_pathways(&reactions, opts);
239
240    // 7. Stamp each reaction row with its pathway status and `has_dbhit`.
241    let pwy_status_map: HashMap<&str, Option<PwyStatus>> =
242        pathway_results.iter().map(|p| (p.id.as_str(), p.status)).collect();
243    for r in &mut reactions {
244        r.pathway_status = pwy_status_map.get(r.pathway.as_str()).copied().unwrap_or(None);
245        r.has_dbhit = !r.dbhit.is_empty();
246    }
247
248    Ok(FindReport { reactions, pathways: pathway_results })
249}
250
251// -- Internal plumbing --
252
253#[derive(Clone, Hash, PartialEq, Eq)]
254struct ReactionKey {
255    rxn: String,
256    name: String,
257    ec: String,
258}
259
260fn dedup_reactions(expanded: &[ExpandedReaction]) -> Vec<ReactionKey> {
261    let mut seen = HashSet::new();
262    let mut out = Vec::new();
263    for e in expanded {
264        let k = ReactionKey { rxn: e.rxn.clone(), name: e.name.clone(), ec: e.ec.clone() };
265        if seen.insert(k.clone()) {
266            out.push(k);
267        }
268    }
269    out
270}
271
272/// Concatenate every resolved reference FASTA into one big `query.faa`,
273/// prepending `<label>|` to each sequence header so the downstream hit
274/// parser can trace a hit back to its originating reference file (matching
275/// `prepare_batch_alignments.R:374-407`).
276fn concat_refs(
277    out: &Path,
278    seq_by_key: &HashMap<ReactionKey, Vec<seqfile::ResolvedSeq>>,
279) -> Result<usize, FindError> {
280    let f = File::create(out).map_err(|e| FindError::Io {
281        path: out.to_path_buf(),
282        source: e,
283    })?;
284    let mut w = BufWriter::new(f);
285    let mut emitted: HashSet<&str> = HashSet::new();
286    let mut n = 0usize;
287    for list in seq_by_key.values() {
288        for r in list {
289            if !emitted.insert(r.label.as_str()) {
290                continue;
291            }
292            let mut buf = Vec::new();
293            File::open(&r.path)
294                .and_then(|mut f| f.read_to_end(&mut buf))
295                .map_err(|e| FindError::Io { path: r.path.clone(), source: e })?;
296            // rewrite headers `>...` to `>LABEL|...`
297            for line in buf.split(|b| *b == b'\n') {
298                if line.is_empty() {
299                    continue;
300                }
301                if line.first() == Some(&b'>') {
302                    write!(w, ">{}|", r.label).map_err(|e| FindError::Io {
303                        path: out.to_path_buf(),
304                        source: e,
305                    })?;
306                    w.write_all(&line[1..]).map_err(|e| FindError::Io {
307                        path: out.to_path_buf(),
308                        source: e,
309                    })?;
310                    writeln!(w).map_err(|e| FindError::Io {
311                        path: out.to_path_buf(),
312                        source: e,
313                    })?;
314                } else {
315                    w.write_all(line).map_err(|e| FindError::Io {
316                        path: out.to_path_buf(),
317                        source: e,
318                    })?;
319                    writeln!(w).map_err(|e| FindError::Io {
320                        path: out.to_path_buf(),
321                        source: e,
322                    })?;
323                }
324            }
325            n += 1;
326        }
327    }
328    w.flush().map_err(|e| FindError::Io {
329        path: out.to_path_buf(),
330        source: e,
331    })?;
332    Ok(n)
333}
334
335fn group_by_file(hits: Vec<Hit>) -> HashMap<String, Vec<Hit>> {
336    let mut map: HashMap<String, Vec<Hit>> = HashMap::new();
337    for h in hits {
338        // The qseqid is the full FASTA header first-token, i.e.
339        // `<label>|<orig_acc>` (BLAST/diamond preserve first-token only).
340        let (file, orig) = match h.qseqid.split_once('|') {
341            Some((f, rest)) => (f.to_string(), rest.to_string()),
342            None => (String::new(), h.qseqid.clone()),
343        };
344        let mut hh = h;
345        hh.qseqid = orig;
346        map.entry(file).or_default().push(hh);
347    }
348    map
349}
350
351#[allow(clippy::too_many_arguments)]
352fn build_reaction_rows(
353    e: &ExpandedReaction,
354    seq_by_key: &HashMap<ReactionKey, Vec<seqfile::ResolvedSeq>>,
355    hits_by_file: &HashMap<String, Vec<Hit>>,
356    subunit_by_qseqid: &HashMap<(String, String), Option<String>>,
357    subunit_count_by_key: &HashMap<ReactionKey, (bool, u32, String)>,
358    dbhit_index: &DbhitIndex,
359    classify_opts: &ClassifyOptions<'_>,
360    subunit_cutoff: f32,
361) -> Vec<ReactionHit> {
362    let key = ReactionKey { rxn: e.rxn.clone(), name: e.name.clone(), ec: e.ec.clone() };
363    let resolved = seq_by_key.get(&key);
364    let files: Vec<String> = resolved
365        .map(|v| v.iter().map(|r| r.label.clone()).collect())
366        .unwrap_or_default();
367    let has_seq_data = !files.is_empty();
368
369    // Collect per-file hit list (preserves file attribution).
370    let mut all_hits: Vec<(String, Hit)> = Vec::new();
371    for f in &files {
372        if let Some(v) = hits_by_file.get(f) {
373            for h in v {
374                all_hits.push((f.clone(), h.clone()));
375            }
376        }
377    }
378
379    // Pre-compute dbhit once per reaction.
380    let dbhit = dbhit_index.lookup(&e.rxn, &e.name, &e.ec);
381
382    // Subunit metadata (per-reaction, applied to every emitted row).
383    // `complex_scanned` is true iff we had at least one reference FASTA
384    // header to analyse — matches gapseq's convention of printing `NA`
385    // only when complex_detection actually ran.
386    let (is_complex, subunit_count, subunits_str) = subunit_count_by_key
387        .get(&key)
388        .cloned()
389        .unwrap_or((false, 0, String::new()));
390    let complex_scanned = has_seq_data;
391
392    // Per-reaction exception flag — computed once so every emitted row
393    // carries the same value.
394    let exception = crate::classify::ec_is_exception(&e.ec, classify_opts.exception_ecs);
395
396    // Complex-status computation across the full hit list.
397    let (complex_subunits_found, complex_undef_found, complex_status) = if is_complex {
398        let mut found: HashSet<String> = HashSet::new();
399        let mut undef = false;
400        for (f, h) in &all_hits {
401            if h.bitscore < classify_opts.bitcutoff || h.pident < classify_opts.identcutoff {
402                continue;
403            }
404            if exception && h.pident < classify_opts.ident_exception {
405                continue;
406            }
407            if let Some(Some(sub)) = subunit_by_qseqid.get(&(f.clone(), h.qseqid.clone())) {
408                if sub == "Subunit undefined" {
409                    undef = true;
410                } else {
411                    found.insert(sub.clone());
412                }
413            }
414        }
415        let sf = found.len() as u32;
416        let ratio = if subunit_count == 0 { 0.0 } else { sf as f32 / subunit_count as f32 };
417        let st = if ratio > subunit_cutoff || (ratio == subunit_cutoff && undef) {
418            Some(1u8)
419        } else {
420            None
421        };
422        (Some(sf), Some(undef), st)
423    } else {
424        (None, None, None)
425    };
426
427    // Emit one row per hit. When there are no hits, emit a single
428    // placeholder row with status ∈ {NoBlast, NoSeqData, Spontaneous}.
429    if all_hits.is_empty() {
430        let status = if !has_seq_data && e.spont {
431            HitStatus::Spontaneous
432        } else if has_seq_data {
433            HitStatus::NoBlast
434        } else {
435            HitStatus::NoSeqData
436        };
437        let label_for_meta = files.first().cloned().unwrap_or_default();
438        let (src, reftype) = derive_src_and_type(&label_for_meta);
439        return vec![ReactionHit {
440            pathway: e.pathway.clone(),
441            pathway_status: None,
442            rxn: e.rxn.clone(),
443            name: e.name.clone(),
444            ec: e.ec.clone(),
445            keyrea: e.keyrea,
446            spont: e.spont,
447            is_complex,
448            subunit_count,
449            // gapseq leaves the column blank when complex detection didn't
450            // run (no reference fastas). The output layer turns a blank
451            // with complex_scanned=true into `NA`; keep blank otherwise.
452            subunits: if complex_scanned { subunits_str.clone() } else { String::new() },
453            complex: None,
454            subunits_found: if complex_scanned { complex_subunits_found.or(Some(0)) } else { None },
455            subunit_undefined_found: complex_undef_found,
456            complex_status,
457            file: files.first().cloned(),
458            dbhit,
459            has_dbhit: false,
460            src,
461            reftype,
462            qseqid: None,
463            pident: None,
464            evalue: None,
465            bitscore: None,
466            qcov: None,
467            stitle: None,
468            sstart: None,
469            send: None,
470            exception,
471            status,
472        }];
473    }
474
475    // Per-hit rows, sorted by (-bitscore, stitle) to mirror R's
476    // `order(rxn, name, ec, stitle, complex, -bitscore)` then unique().
477    let mut rows: Vec<ReactionHit> = Vec::with_capacity(all_hits.len());
478    for (f, h) in &all_hits {
479        let pass_main = h.bitscore >= classify_opts.bitcutoff && h.pident >= classify_opts.identcutoff;
480        let pass_exc = if exception { h.pident >= classify_opts.ident_exception } else { true };
481        let status = if pass_main && pass_exc {
482            HitStatus::GoodBlast
483        } else {
484            HitStatus::BadBlast
485        };
486        let complex = subunit_by_qseqid.get(&(f.clone(), h.qseqid.clone())).cloned().flatten();
487        let (src, reftype) = derive_src_and_type(f);
488        rows.push(ReactionHit {
489            pathway: e.pathway.clone(),
490            pathway_status: None,
491            rxn: e.rxn.clone(),
492            name: e.name.clone(),
493            ec: e.ec.clone(),
494            keyrea: e.keyrea,
495            spont: e.spont,
496            is_complex,
497            subunit_count,
498            subunits: subunits_str.clone(),
499            complex,
500            subunits_found: complex_subunits_found,
501            subunit_undefined_found: complex_undef_found,
502            complex_status,
503            file: Some(f.clone()),
504            dbhit: dbhit.clone(),
505            has_dbhit: false,
506            src,
507            reftype,
508            qseqid: Some(h.qseqid.clone()),
509            pident: Some(h.pident),
510            evalue: Some(h.evalue),
511            bitscore: Some(h.bitscore),
512            qcov: Some(h.qcov),
513            stitle: Some(h.stitle.clone()),
514            sstart: Some(h.sstart),
515            send: Some(h.send),
516            exception,
517            status,
518        });
519    }
520
521    // R dedups by (rxn, name, ec, stitle, complex) keeping the row with
522    // the highest bitscore (because the preceding sort puts -bitscore last).
523    rows.sort_by(|a, b| {
524        let key_a = (a.stitle.as_deref().unwrap_or(""), a.complex.as_deref().unwrap_or(""));
525        let key_b = (b.stitle.as_deref().unwrap_or(""), b.complex.as_deref().unwrap_or(""));
526        key_a
527            .cmp(&key_b)
528            .then_with(|| b.bitscore.partial_cmp(&a.bitscore).unwrap_or(std::cmp::Ordering::Equal))
529    });
530    let mut seen: HashSet<(String, String)> = HashSet::new();
531    rows.retain(|r| {
532        seen.insert((
533            r.stitle.clone().unwrap_or_default(),
534            r.complex.clone().unwrap_or_default(),
535        ))
536    });
537    rows
538}
539
540/// Map a file label like `rev/1.1.1.1.fasta` into (`src`, `type`).
541/// - `rxn/*.fasta` → src=rxn, type=metacyc
542/// - `rev/*.fasta` / `unrev/*.fasta` with `<EC>.fasta` → type=EC
543/// - `rev/*.fasta` / `unrev/*.fasta` with MD5 stem → type=reaName
544/// - `user/*.fasta` → src=user, type preserved from filename if distinguishable
545fn derive_src_and_type(label: &str) -> (String, String) {
546    if label.is_empty() {
547        return (String::new(), String::new());
548    }
549    let (src, rest) = match label.split_once('/') {
550        Some(p) => p,
551        None => return (String::new(), String::new()),
552    };
553    let stem = rest.trim_end_matches(".fasta");
554    let rtype = match src {
555        "rxn" => "metacyc",
556        "user" => "user",
557        _ => {
558            if is_ec_stem(stem) {
559                "EC"
560            } else {
561                "reaName"
562            }
563        }
564    };
565    (src.to_string(), rtype.to_string())
566}
567
568/// True if `stem` looks like an EC number: `N.N.N.N` with digits and
569/// optional `n` in each slot.
570fn is_ec_stem(stem: &str) -> bool {
571    let parts: Vec<&str> = stem.split('.').collect();
572    if parts.len() != 4 {
573        return false;
574    }
575    parts.iter().all(|p| !p.is_empty() && p.chars().all(|c| c.is_ascii_digit() || c == 'n'))
576}
577
578/// Scan a FASTA file and return `(accession, full_descriptor)` per header.
579/// `accession` is the first whitespace-delimited token of the header;
580/// `full_descriptor` is the entire header line after `>`.
581fn read_fasta_headers(path: &Path) -> Result<Vec<(String, String)>, std::io::Error> {
582    let f = File::open(path)?;
583    let r = BufReader::new(f);
584    let mut out = Vec::new();
585    for line in r.lines() {
586        let line = line?;
587        if let Some(rest) = line.strip_prefix('>') {
588            let acc = rest
589                .split_whitespace()
590                .next()
591                .unwrap_or("")
592                .to_string();
593            out.push((acc, rest.to_string()));
594        }
595    }
596    Ok(out)
597}
598
599fn score_pathways(reactions: &[ReactionHit], opts: &FindOptions<'_>) -> Vec<PathwayResult> {
600    // One entry per pathway; reactions are already one-per-(pathway,rxn).
601    let mut by_pwy: BTreeMap<String, (String, Vec<&ReactionHit>)> = BTreeMap::new();
602    for r in reactions {
603        let entry = by_pwy.entry(r.pathway.clone()).or_insert_with(|| (String::new(), Vec::new()));
604        entry.1.push(r);
605    }
606
607    // Reactions were emitted with pathway_name on the ExpandedReaction,
608    // but not stored on ReactionHit. We infer from the first pass by
609    // keeping a separate lookup elsewhere; here we leave blank and let
610    // the caller supply it. Simplification: we stash nothing and the
611    // caller downstream can join with the pathway table by id.
612    let _ = &mut by_pwy; // placeholder for name injection below
613
614    let mut out = Vec::new();
615    for (pwy_id, (_name, rxns)) in &by_pwy {
616        // Deduplicate by reaction id within a pathway (same rxn may
617        // appear twice if reaEc has duplicates). Sort alphabetically by
618        // rxn id to mirror `analyse_alignments.R:143` which runs
619        // `order(pathway, rxn, complex, -bitscore)` before aggregation.
620        let mut seen = HashSet::new();
621        let mut rxns: Vec<&ReactionHit> = rxns
622            .iter()
623            .copied()
624            .filter(|r| seen.insert(r.rxn.clone()))
625            .collect();
626        rxns.sort_by(|a, b| a.rxn.cmp(&b.rxn));
627
628        let n_reaction = rxns.len() as u32;
629        let n_spont = rxns.iter().filter(|r| r.spont).count() as u32;
630        let n_vague = rxns.iter().filter(|r| r.status == HitStatus::NoSeqData).count() as u32;
631        let n_key = rxns
632            .iter()
633            .filter(|r| r.keyrea && r.status != HitStatus::NoSeqData)
634            .count() as u32;
635        let reaction_found = |r: &&ReactionHit| {
636            (!r.is_complex && r.status == HitStatus::GoodBlast)
637                || (r.is_complex && r.complex_status.is_some())
638        };
639        let n_found = rxns.iter().filter(|r| reaction_found(r)).count() as u32;
640        let n_key_found = rxns
641            .iter()
642            .filter(|r| reaction_found(r) && r.keyrea)
643            .count() as u32;
644
645        let found_ids: Vec<String> = rxns
646            .iter()
647            .filter(|r| reaction_found(r))
648            .map(|r| r.rxn.clone())
649            .collect();
650        let spont_ids: Vec<String> =
651            rxns.iter().filter(|r| r.spont).map(|r| r.rxn.clone()).collect();
652        let key_ids: Vec<String> =
653            rxns.iter().filter(|r| r.keyrea).map(|r| r.rxn.clone()).collect();
654
655        // Match R's analyse_alignments.R:164-165 exactly — R uses the ratio
656        // `NrVague/(NrReaction-NrSpontaneous)` with 0/0 guarded by data.table
657        // (coerces to NaN which is then neither < nor >=). We handle that
658        // explicitly: if the denominator is 0, force completeness to 0.
659        let denom_no_spont = n_reaction.saturating_sub(n_spont);
660        let hint_off = opts.completeness_hint_off as f64;
661        let hint_on = opts.completeness_hint_on as f64;
662        let completeness: f64 = if denom_no_spont == 0 {
663            0.0
664        } else {
665            let vague_frac = n_vague as f64 / denom_no_spont as f64;
666            if vague_frac < opts.vague_cutoff as f64 {
667                let denom = n_reaction.saturating_sub(n_vague).saturating_sub(n_spont);
668                if denom == 0 {
669                    0.0
670                } else {
671                    n_found as f64 / denom as f64 * 100.0
672                }
673            } else {
674                n_found as f64 / denom_no_spont as f64 * 100.0
675            }
676        };
677
678        let all_key_found = n_key == n_key_found;
679        let mut prediction = if !opts.strict_candidates {
680            completeness >= hint_off * 100.0 && all_key_found
681        } else {
682            completeness >= hint_off * 100.0
683        };
684        if !opts.strict_candidates
685            && n_key > 0
686            && all_key_found
687            && completeness >= hint_on * 100.0
688        {
689            prediction = true;
690        }
691        if n_reaction == n_vague + n_spont {
692            prediction = false;
693        }
694
695        let status = if (completeness - 100.0).abs() < 1e-9 {
696            Some(PwyStatus::Full)
697        } else if prediction && all_key_found {
698            Some(PwyStatus::Threshold)
699        } else if prediction && completeness < hint_off * 100.0 {
700            Some(PwyStatus::Keyenzyme)
701        } else {
702            None
703        };
704
705        out.push(PathwayResult {
706            id: pwy_id.clone(),
707            name: String::new(), // stamped below
708            prediction,
709            completeness,
710            status,
711            n_reaction,
712            n_spontaneous: n_spont,
713            n_vague,
714            n_key_reaction: n_key,
715            n_reaction_found: n_found,
716            n_key_reaction_found: n_key_found,
717            reactions_found: found_ids,
718            spontaneous_reactions: spont_ids,
719            key_reactions: key_ids,
720        });
721    }
722    out
723}