1use crate::classify::ClassifyOptions;
12use crate::complex::detect_subunits;
13use crate::dbhit::DbhitIndex;
14use crate::pathways::{self, ExpandedReaction, MatchMode, PathwaySelectOptions};
15use crate::seqfile::{self, SeqfileOptions};
16use crate::types::{HitStatus, PathwayResult, PwyStatus, ReactionHit};
17use gapsmith_align::{AlignOpts, Aligner, Hit};
18use gapsmith_db::{ComplexSubunitTable, ExceptionRow, PathwayTable};
19use std::collections::{BTreeMap, HashMap, HashSet};
20use std::fs::File;
21use std::io::{BufRead, BufReader, BufWriter, Read, Write};
22use std::path::Path;
23
24#[derive(Debug, thiserror::Error)]
25pub enum FindError {
26 #[error("alignment error: {0}")]
27 Align(#[from] gapsmith_align::AlignError),
28 #[error("i/o error on `{path}`: {source}")]
29 Io { path: std::path::PathBuf, #[source] source: std::io::Error },
30 #[error("no reference FASTAs resolved — check --seq-dir and --taxonomy")]
31 NoReferences,
32}
33
34#[derive(Debug, Clone)]
35pub struct FindOptions<'a> {
36 pub keyword: &'a str,
37 pub match_mode: MatchMode,
38 pub exclude_superpathways: bool,
39 pub only_active: bool,
40 pub bitcutoff: f32,
41 pub identcutoff: f32,
42 pub ident_exception: f32,
43 pub coverage_pct: u32,
46 pub vague_cutoff: f32,
50 pub completeness_hint_off: f32,
52 pub completeness_hint_on: f32,
54 pub strict_candidates: bool,
55 pub subunit_cutoff: f32,
58 pub valid_tax_ids: &'a [String],
60}
61
62impl<'a> Default for FindOptions<'a> {
63 fn default() -> Self {
64 Self {
65 keyword: "all",
66 match_mode: MatchMode::Regex,
67 exclude_superpathways: true,
68 only_active: true,
69 bitcutoff: 200.0,
70 identcutoff: 0.0,
71 ident_exception: 70.0,
72 coverage_pct: 75,
73 vague_cutoff: 0.3,
74 completeness_hint_off: 0.80,
75 completeness_hint_on: 0.66,
76 strict_candidates: false,
77 subunit_cutoff: 0.5,
78 valid_tax_ids: &[],
79 }
80 }
81}
82
83#[derive(Debug, Clone)]
84pub struct FindReport {
85 pub reactions: Vec<ReactionHit>,
86 pub pathways: Vec<PathwayResult>,
87}
88
89#[allow(clippy::too_many_arguments)]
91pub fn run_find(
92 table: &PathwayTable,
93 exceptions: &[ExceptionRow],
94 subunit_dict: &ComplexSubunitTable,
95 dbhit_index: &DbhitIndex,
96 seq_opts: &SeqfileOptions,
97 genome_fasta: &Path,
98 aligner: &dyn Aligner,
99 align_opts: &AlignOpts,
100 opts: &FindOptions<'_>,
101 workdir: &Path,
102) -> Result<FindReport, FindError> {
103 std::fs::create_dir_all(workdir).map_err(|e| FindError::Io {
104 path: workdir.to_path_buf(),
105 source: e,
106 })?;
107
108 let expanded = pathways::select(
110 table,
111 &PathwaySelectOptions {
112 keyword: opts.keyword,
113 mode: opts.match_mode,
114 exclude_superpathways: opts.exclude_superpathways,
115 only_active: opts.only_active,
116 valid_tax_ids: opts.valid_tax_ids,
117 },
118 );
119 tracing::info!(reactions = expanded.len(), "pathways expanded to reactions");
120 if expanded.is_empty() {
121 return Ok(FindReport { reactions: Vec::new(), pathways: Vec::new() });
122 }
123
124 let unique_reactions = dedup_reactions(&expanded);
134 let mut seq_by_key: HashMap<ReactionKey, Vec<seqfile::ResolvedSeq>> = HashMap::new();
135 let mut subunit_by_qseqid: HashMap<(String, String), Option<String>> = HashMap::new();
136 let mut subunit_count_by_key: HashMap<ReactionKey, (bool, u32, String)> = HashMap::new();
137 let mut fasta_header_cache: HashMap<std::path::PathBuf, std::sync::Arc<Vec<(String, String)>>> =
138 HashMap::new();
139 for k in &unique_reactions {
140 let resolved = seqfile::resolve_for_reaction(seq_opts, &k.rxn, &k.ec, &k.name);
141 let mut desc_owned: Vec<(String, String, String)> = Vec::new();
143 for r in &resolved {
144 let entries = fasta_header_cache
145 .entry(r.path.clone())
146 .or_insert_with(|| {
147 std::sync::Arc::new(read_fasta_headers(&r.path).unwrap_or_default())
148 })
149 .clone();
150 for (acc, full) in entries.iter() {
151 desc_owned.push((r.label.clone(), acc.clone(), full.clone()));
152 }
153 }
154 let refs: Vec<&str> =
157 desc_owned.iter().map(|(_, _, full)| full.as_str()).collect();
158 let assigns = detect_subunits(&k.rxn, &refs, subunit_dict);
159 let mut distinct: HashSet<String> = HashSet::new();
160 for ((label, acc, _), su) in desc_owned.iter().zip(assigns.iter()) {
161 subunit_by_qseqid.insert((label.clone(), acc.clone()), su.clone());
162 if let Some(name) = su {
163 if name != "Subunit undefined" {
164 distinct.insert(name.clone());
165 }
166 }
167 }
168 let subunit_count = distinct.len() as u32;
169 let is_complex = subunit_count >= 2;
170 let mut list: Vec<String> = distinct.into_iter().collect();
171 list.sort();
172 subunit_count_by_key
173 .insert(k.clone(), (is_complex, subunit_count, list.join(",")));
174 seq_by_key.insert(k.clone(), resolved);
175 }
176
177 let query_path = workdir.join("query.faa");
179 let n_concat = concat_refs(&query_path, &seq_by_key)?;
180 tracing::info!(seqfiles = n_concat, "concatenated reference fastas");
181
182 let hits_by_file: HashMap<String, Vec<Hit>> = if n_concat > 0 {
184 let raw_hits = aligner.align(&query_path, genome_fasta, align_opts)?;
185 tracing::info!(hits = raw_hits.len(), "alignment complete");
186 group_by_file(raw_hits)
187 } else {
188 HashMap::new()
189 };
190
191 let exception_set: HashSet<String> = exceptions.iter().map(|e| e.id.clone()).collect();
193 let classify_opts = ClassifyOptions {
194 bitcutoff: opts.bitcutoff,
195 identcutoff: opts.identcutoff,
196 ident_exception: opts.ident_exception,
197 exception_ecs: &exception_set,
198 };
199
200 let mut reactions: Vec<ReactionHit> = Vec::with_capacity(expanded.len());
201 for e in &expanded {
202 let rows = build_reaction_rows(
206 e,
207 &seq_by_key,
208 &hits_by_file,
209 &subunit_by_qseqid,
210 &subunit_count_by_key,
211 dbhit_index,
212 &classify_opts,
213 opts.subunit_cutoff,
214 );
215 reactions.extend(rows);
216 }
217
218 reactions.sort_by(|a, b| {
221 a.pathway
222 .cmp(&b.pathway)
223 .then_with(|| a.rxn.cmp(&b.rxn))
224 .then_with(|| {
225 a.complex
226 .as_deref()
227 .unwrap_or("")
228 .cmp(b.complex.as_deref().unwrap_or(""))
229 })
230 .then_with(|| {
231 b.bitscore
232 .partial_cmp(&a.bitscore)
233 .unwrap_or(std::cmp::Ordering::Equal)
234 })
235 });
236
237 let pathway_results = score_pathways(&reactions, opts);
239
240 let pwy_status_map: HashMap<&str, Option<PwyStatus>> =
242 pathway_results.iter().map(|p| (p.id.as_str(), p.status)).collect();
243 for r in &mut reactions {
244 r.pathway_status = pwy_status_map.get(r.pathway.as_str()).copied().unwrap_or(None);
245 r.has_dbhit = !r.dbhit.is_empty();
246 }
247
248 Ok(FindReport { reactions, pathways: pathway_results })
249}
250
251#[derive(Clone, Hash, PartialEq, Eq)]
254struct ReactionKey {
255 rxn: String,
256 name: String,
257 ec: String,
258}
259
260fn dedup_reactions(expanded: &[ExpandedReaction]) -> Vec<ReactionKey> {
261 let mut seen = HashSet::new();
262 let mut out = Vec::new();
263 for e in expanded {
264 let k = ReactionKey { rxn: e.rxn.clone(), name: e.name.clone(), ec: e.ec.clone() };
265 if seen.insert(k.clone()) {
266 out.push(k);
267 }
268 }
269 out
270}
271
272fn concat_refs(
277 out: &Path,
278 seq_by_key: &HashMap<ReactionKey, Vec<seqfile::ResolvedSeq>>,
279) -> Result<usize, FindError> {
280 let f = File::create(out).map_err(|e| FindError::Io {
281 path: out.to_path_buf(),
282 source: e,
283 })?;
284 let mut w = BufWriter::new(f);
285 let mut emitted: HashSet<&str> = HashSet::new();
286 let mut n = 0usize;
287 for list in seq_by_key.values() {
288 for r in list {
289 if !emitted.insert(r.label.as_str()) {
290 continue;
291 }
292 let mut buf = Vec::new();
293 File::open(&r.path)
294 .and_then(|mut f| f.read_to_end(&mut buf))
295 .map_err(|e| FindError::Io { path: r.path.clone(), source: e })?;
296 for line in buf.split(|b| *b == b'\n') {
298 if line.is_empty() {
299 continue;
300 }
301 if line.first() == Some(&b'>') {
302 write!(w, ">{}|", r.label).map_err(|e| FindError::Io {
303 path: out.to_path_buf(),
304 source: e,
305 })?;
306 w.write_all(&line[1..]).map_err(|e| FindError::Io {
307 path: out.to_path_buf(),
308 source: e,
309 })?;
310 writeln!(w).map_err(|e| FindError::Io {
311 path: out.to_path_buf(),
312 source: e,
313 })?;
314 } else {
315 w.write_all(line).map_err(|e| FindError::Io {
316 path: out.to_path_buf(),
317 source: e,
318 })?;
319 writeln!(w).map_err(|e| FindError::Io {
320 path: out.to_path_buf(),
321 source: e,
322 })?;
323 }
324 }
325 n += 1;
326 }
327 }
328 w.flush().map_err(|e| FindError::Io {
329 path: out.to_path_buf(),
330 source: e,
331 })?;
332 Ok(n)
333}
334
335fn group_by_file(hits: Vec<Hit>) -> HashMap<String, Vec<Hit>> {
336 let mut map: HashMap<String, Vec<Hit>> = HashMap::new();
337 for h in hits {
338 let (file, orig) = match h.qseqid.split_once('|') {
341 Some((f, rest)) => (f.to_string(), rest.to_string()),
342 None => (String::new(), h.qseqid.clone()),
343 };
344 let mut hh = h;
345 hh.qseqid = orig;
346 map.entry(file).or_default().push(hh);
347 }
348 map
349}
350
351#[allow(clippy::too_many_arguments)]
352fn build_reaction_rows(
353 e: &ExpandedReaction,
354 seq_by_key: &HashMap<ReactionKey, Vec<seqfile::ResolvedSeq>>,
355 hits_by_file: &HashMap<String, Vec<Hit>>,
356 subunit_by_qseqid: &HashMap<(String, String), Option<String>>,
357 subunit_count_by_key: &HashMap<ReactionKey, (bool, u32, String)>,
358 dbhit_index: &DbhitIndex,
359 classify_opts: &ClassifyOptions<'_>,
360 subunit_cutoff: f32,
361) -> Vec<ReactionHit> {
362 let key = ReactionKey { rxn: e.rxn.clone(), name: e.name.clone(), ec: e.ec.clone() };
363 let resolved = seq_by_key.get(&key);
364 let files: Vec<String> = resolved
365 .map(|v| v.iter().map(|r| r.label.clone()).collect())
366 .unwrap_or_default();
367 let has_seq_data = !files.is_empty();
368
369 let mut all_hits: Vec<(String, Hit)> = Vec::new();
371 for f in &files {
372 if let Some(v) = hits_by_file.get(f) {
373 for h in v {
374 all_hits.push((f.clone(), h.clone()));
375 }
376 }
377 }
378
379 let dbhit = dbhit_index.lookup(&e.rxn, &e.name, &e.ec);
381
382 let (is_complex, subunit_count, subunits_str) = subunit_count_by_key
387 .get(&key)
388 .cloned()
389 .unwrap_or((false, 0, String::new()));
390 let complex_scanned = has_seq_data;
391
392 let exception = crate::classify::ec_is_exception(&e.ec, classify_opts.exception_ecs);
395
396 let (complex_subunits_found, complex_undef_found, complex_status) = if is_complex {
398 let mut found: HashSet<String> = HashSet::new();
399 let mut undef = false;
400 for (f, h) in &all_hits {
401 if h.bitscore < classify_opts.bitcutoff || h.pident < classify_opts.identcutoff {
402 continue;
403 }
404 if exception && h.pident < classify_opts.ident_exception {
405 continue;
406 }
407 if let Some(Some(sub)) = subunit_by_qseqid.get(&(f.clone(), h.qseqid.clone())) {
408 if sub == "Subunit undefined" {
409 undef = true;
410 } else {
411 found.insert(sub.clone());
412 }
413 }
414 }
415 let sf = found.len() as u32;
416 let ratio = if subunit_count == 0 { 0.0 } else { sf as f32 / subunit_count as f32 };
417 let st = if ratio > subunit_cutoff || (ratio == subunit_cutoff && undef) {
418 Some(1u8)
419 } else {
420 None
421 };
422 (Some(sf), Some(undef), st)
423 } else {
424 (None, None, None)
425 };
426
427 if all_hits.is_empty() {
430 let status = if !has_seq_data && e.spont {
431 HitStatus::Spontaneous
432 } else if has_seq_data {
433 HitStatus::NoBlast
434 } else {
435 HitStatus::NoSeqData
436 };
437 let label_for_meta = files.first().cloned().unwrap_or_default();
438 let (src, reftype) = derive_src_and_type(&label_for_meta);
439 return vec![ReactionHit {
440 pathway: e.pathway.clone(),
441 pathway_status: None,
442 rxn: e.rxn.clone(),
443 name: e.name.clone(),
444 ec: e.ec.clone(),
445 keyrea: e.keyrea,
446 spont: e.spont,
447 is_complex,
448 subunit_count,
449 subunits: if complex_scanned { subunits_str.clone() } else { String::new() },
453 complex: None,
454 subunits_found: if complex_scanned { complex_subunits_found.or(Some(0)) } else { None },
455 subunit_undefined_found: complex_undef_found,
456 complex_status,
457 file: files.first().cloned(),
458 dbhit,
459 has_dbhit: false,
460 src,
461 reftype,
462 qseqid: None,
463 pident: None,
464 evalue: None,
465 bitscore: None,
466 qcov: None,
467 stitle: None,
468 sstart: None,
469 send: None,
470 exception,
471 status,
472 }];
473 }
474
475 let mut rows: Vec<ReactionHit> = Vec::with_capacity(all_hits.len());
478 for (f, h) in &all_hits {
479 let pass_main = h.bitscore >= classify_opts.bitcutoff && h.pident >= classify_opts.identcutoff;
480 let pass_exc = if exception { h.pident >= classify_opts.ident_exception } else { true };
481 let status = if pass_main && pass_exc {
482 HitStatus::GoodBlast
483 } else {
484 HitStatus::BadBlast
485 };
486 let complex = subunit_by_qseqid.get(&(f.clone(), h.qseqid.clone())).cloned().flatten();
487 let (src, reftype) = derive_src_and_type(f);
488 rows.push(ReactionHit {
489 pathway: e.pathway.clone(),
490 pathway_status: None,
491 rxn: e.rxn.clone(),
492 name: e.name.clone(),
493 ec: e.ec.clone(),
494 keyrea: e.keyrea,
495 spont: e.spont,
496 is_complex,
497 subunit_count,
498 subunits: subunits_str.clone(),
499 complex,
500 subunits_found: complex_subunits_found,
501 subunit_undefined_found: complex_undef_found,
502 complex_status,
503 file: Some(f.clone()),
504 dbhit: dbhit.clone(),
505 has_dbhit: false,
506 src,
507 reftype,
508 qseqid: Some(h.qseqid.clone()),
509 pident: Some(h.pident),
510 evalue: Some(h.evalue),
511 bitscore: Some(h.bitscore),
512 qcov: Some(h.qcov),
513 stitle: Some(h.stitle.clone()),
514 sstart: Some(h.sstart),
515 send: Some(h.send),
516 exception,
517 status,
518 });
519 }
520
521 rows.sort_by(|a, b| {
524 let key_a = (a.stitle.as_deref().unwrap_or(""), a.complex.as_deref().unwrap_or(""));
525 let key_b = (b.stitle.as_deref().unwrap_or(""), b.complex.as_deref().unwrap_or(""));
526 key_a
527 .cmp(&key_b)
528 .then_with(|| b.bitscore.partial_cmp(&a.bitscore).unwrap_or(std::cmp::Ordering::Equal))
529 });
530 let mut seen: HashSet<(String, String)> = HashSet::new();
531 rows.retain(|r| {
532 seen.insert((
533 r.stitle.clone().unwrap_or_default(),
534 r.complex.clone().unwrap_or_default(),
535 ))
536 });
537 rows
538}
539
540fn derive_src_and_type(label: &str) -> (String, String) {
546 if label.is_empty() {
547 return (String::new(), String::new());
548 }
549 let (src, rest) = match label.split_once('/') {
550 Some(p) => p,
551 None => return (String::new(), String::new()),
552 };
553 let stem = rest.trim_end_matches(".fasta");
554 let rtype = match src {
555 "rxn" => "metacyc",
556 "user" => "user",
557 _ => {
558 if is_ec_stem(stem) {
559 "EC"
560 } else {
561 "reaName"
562 }
563 }
564 };
565 (src.to_string(), rtype.to_string())
566}
567
568fn is_ec_stem(stem: &str) -> bool {
571 let parts: Vec<&str> = stem.split('.').collect();
572 if parts.len() != 4 {
573 return false;
574 }
575 parts.iter().all(|p| !p.is_empty() && p.chars().all(|c| c.is_ascii_digit() || c == 'n'))
576}
577
578fn read_fasta_headers(path: &Path) -> Result<Vec<(String, String)>, std::io::Error> {
582 let f = File::open(path)?;
583 let r = BufReader::new(f);
584 let mut out = Vec::new();
585 for line in r.lines() {
586 let line = line?;
587 if let Some(rest) = line.strip_prefix('>') {
588 let acc = rest
589 .split_whitespace()
590 .next()
591 .unwrap_or("")
592 .to_string();
593 out.push((acc, rest.to_string()));
594 }
595 }
596 Ok(out)
597}
598
599fn score_pathways(reactions: &[ReactionHit], opts: &FindOptions<'_>) -> Vec<PathwayResult> {
600 let mut by_pwy: BTreeMap<String, (String, Vec<&ReactionHit>)> = BTreeMap::new();
602 for r in reactions {
603 let entry = by_pwy.entry(r.pathway.clone()).or_insert_with(|| (String::new(), Vec::new()));
604 entry.1.push(r);
605 }
606
607 let _ = &mut by_pwy; let mut out = Vec::new();
615 for (pwy_id, (_name, rxns)) in &by_pwy {
616 let mut seen = HashSet::new();
621 let mut rxns: Vec<&ReactionHit> = rxns
622 .iter()
623 .copied()
624 .filter(|r| seen.insert(r.rxn.clone()))
625 .collect();
626 rxns.sort_by(|a, b| a.rxn.cmp(&b.rxn));
627
628 let n_reaction = rxns.len() as u32;
629 let n_spont = rxns.iter().filter(|r| r.spont).count() as u32;
630 let n_vague = rxns.iter().filter(|r| r.status == HitStatus::NoSeqData).count() as u32;
631 let n_key = rxns
632 .iter()
633 .filter(|r| r.keyrea && r.status != HitStatus::NoSeqData)
634 .count() as u32;
635 let reaction_found = |r: &&ReactionHit| {
636 (!r.is_complex && r.status == HitStatus::GoodBlast)
637 || (r.is_complex && r.complex_status.is_some())
638 };
639 let n_found = rxns.iter().filter(|r| reaction_found(r)).count() as u32;
640 let n_key_found = rxns
641 .iter()
642 .filter(|r| reaction_found(r) && r.keyrea)
643 .count() as u32;
644
645 let found_ids: Vec<String> = rxns
646 .iter()
647 .filter(|r| reaction_found(r))
648 .map(|r| r.rxn.clone())
649 .collect();
650 let spont_ids: Vec<String> =
651 rxns.iter().filter(|r| r.spont).map(|r| r.rxn.clone()).collect();
652 let key_ids: Vec<String> =
653 rxns.iter().filter(|r| r.keyrea).map(|r| r.rxn.clone()).collect();
654
655 let denom_no_spont = n_reaction.saturating_sub(n_spont);
660 let hint_off = opts.completeness_hint_off as f64;
661 let hint_on = opts.completeness_hint_on as f64;
662 let completeness: f64 = if denom_no_spont == 0 {
663 0.0
664 } else {
665 let vague_frac = n_vague as f64 / denom_no_spont as f64;
666 if vague_frac < opts.vague_cutoff as f64 {
667 let denom = n_reaction.saturating_sub(n_vague).saturating_sub(n_spont);
668 if denom == 0 {
669 0.0
670 } else {
671 n_found as f64 / denom as f64 * 100.0
672 }
673 } else {
674 n_found as f64 / denom_no_spont as f64 * 100.0
675 }
676 };
677
678 let all_key_found = n_key == n_key_found;
679 let mut prediction = if !opts.strict_candidates {
680 completeness >= hint_off * 100.0 && all_key_found
681 } else {
682 completeness >= hint_off * 100.0
683 };
684 if !opts.strict_candidates
685 && n_key > 0
686 && all_key_found
687 && completeness >= hint_on * 100.0
688 {
689 prediction = true;
690 }
691 if n_reaction == n_vague + n_spont {
692 prediction = false;
693 }
694
695 let status = if (completeness - 100.0).abs() < 1e-9 {
696 Some(PwyStatus::Full)
697 } else if prediction && all_key_found {
698 Some(PwyStatus::Threshold)
699 } else if prediction && completeness < hint_off * 100.0 {
700 Some(PwyStatus::Keyenzyme)
701 } else {
702 None
703 };
704
705 out.push(PathwayResult {
706 id: pwy_id.clone(),
707 name: String::new(), prediction,
709 completeness,
710 status,
711 n_reaction,
712 n_spontaneous: n_spont,
713 n_vague,
714 n_key_reaction: n_key,
715 n_reaction_found: n_found,
716 n_key_reaction_found: n_key_found,
717 reactions_found: found_ids,
718 spontaneous_reactions: spont_ids,
719 key_reactions: key_ids,
720 });
721 }
722 out
723}