Skip to main content

gapsmith_find/
seqfile.rs

1//! Reference-FASTA resolver.
2//!
3//! Given a `(rxn_id, ec, name)` tuple, walk `dat/seq/<taxonomy>/` in the
4//! priority order specified by `prepare_batch_alignments.R:150-234, 315-366`:
5//!
6//! 1. `user/<file>.fasta` — user-supplied overrides win.
7//! 2. `rxn/<RXN>.fasta` — direct per-reaction links (MetaCyc style).
8//! 3. `rev/<EC>.fasta` — UniProt reviewed clustered at 0.9.
9//! 4. `unrev/<EC>.fasta` — UniProt unreviewed clustered at 0.5.
10//! 5. `rev/<MD5(name)>.fasta` / `unrev/<MD5(name)>.fasta` — reaction-name
11//!    fallback. The MD5 of the bare reaction-name UTF-8 bytes, no trailing
12//!    newline, lower-case 32-char hex. Matches `uniprot.sh:155`:
13//!    `reaNameHash=$(echo -n "$rea" | md5sum | awk '{print $1}')`.
14//!
15//! The reaction-name fallback is SKIPPED when the name looks like a
16//! synthetic rxn id (`rxn00001`, `Rxn00001`, `RXN-...`) because those
17//! are not real enzyme names and gapseq's R code explicitly filters them
18//! (`prepare_batch_alignments.R:209`).
19//!
20//! `SeqfileOptions::seq_src` controls the reviewed-vs-unreviewed policy:
21//!
22//! - `SeqSrc::Reviewed`    → reviewed only
23//! - `SeqSrc::PreferRev`   → reviewed if present, else unreviewed
24//! - `SeqSrc::Both`        → reviewed and unreviewed both, concatenated
25//! - `SeqSrc::Unreviewed`  → unreviewed only
26//!
27//! Empty files (0 bytes) are treated as absent — the R pipeline does the
28//! same.
29
30use md5;
31use std::path::{Path, PathBuf};
32
33#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
34pub enum SeqSrc {
35    Reviewed,
36    #[default]
37    PreferRev,
38    Both,
39    Unreviewed,
40}
41
42#[derive(Debug, Clone)]
43pub struct SeqfileOptions {
44    /// Root of the per-taxonomy sequence tree, e.g. `<seq-dir>/Bacteria`.
45    /// Used for `rev/`, `unrev/`, `rxn/` resolution.
46    pub tax_root: PathBuf,
47    /// Separate root for `user/` files — gapseq's R code hard-codes this
48    /// to the built-in `<gapseq>/dat/seq/<tax>/` regardless of `-D`
49    /// (`prepare_batch_alignments.R:217`). Typically the same as
50    /// `tax_root` unless the caller passed an override seqdb.
51    pub user_root: PathBuf,
52    pub seq_src: SeqSrc,
53}
54
55#[derive(Debug, Clone)]
56pub struct ResolvedSeq {
57    /// Path to the reference FASTA.
58    pub path: PathBuf,
59    /// Short relative label used as the `file` field in reaction rows
60    /// (e.g. `rxn/FOO-RXN.fasta` or `rev/1.1.1.1.fasta`). Matches what
61    /// gapseq stamps in the concatenated `query.faa` headers.
62    pub label: String,
63}
64
65/// Resolve every applicable reference fasta for one reaction. Empty means
66/// the reaction has `no_seq_data`.
67///
68/// Priority order (mirrors `prepare_batch_alignments.R:150-234`):
69///
70/// 1. `user/<EC>.fasta`, `user/<MD5 name>.fasta`, `user/<rxn>.fasta` —
71///    all probed against `opts.user_root` (the built-in gapseq data dir).
72/// 2. `rxn/<RXN>.fasta` under `opts.tax_root`.
73/// 3. `rev/<EC>.fasta` / `unrev/<EC>.fasta` under `opts.tax_root`.
74/// 4. `rev/<MD5 name>.fasta` / `unrev/<MD5 name>.fasta` under `opts.tax_root`.
75pub fn resolve_for_reaction(
76    opts: &SeqfileOptions,
77    rxn_id: &str,
78    ec: &str,
79    name: &str,
80) -> Vec<ResolvedSeq> {
81    let mut out: Vec<ResolvedSeq> = Vec::new();
82
83    // 1a. user/<EC>.fasta — one probe per top-level EC (handles `/`-sep).
84    for one_ec in ec.split('/').map(str::trim).filter(|s| !s.is_empty()) {
85        if let Some(r) = probe(&opts.user_root, "user", &format!("{one_ec}.fasta")) {
86            out.push(r);
87        }
88    }
89    // 1b. user/<MD5(name)>.fasta if the name is a real enzyme label.
90    let name_trim = name.trim();
91    if !name_trim.is_empty() && !looks_like_rxn_id(name_trim) {
92        let hash = md5_hex(name_trim);
93        if let Some(r) = probe(&opts.user_root, "user", &format!("{hash}.fasta")) {
94            out.push(r);
95        }
96    }
97    // 1c. user/<rxn>.fasta.
98    if !rxn_id.is_empty() {
99        if let Some(r) = probe(&opts.user_root, "user", &format!("{rxn_id}.fasta")) {
100            out.push(r);
101        }
102    }
103    if !out.is_empty() {
104        return out;
105    }
106
107    // 2. Direct MetaCyc-style link.
108    if let Some(r) = probe(&opts.tax_root, "rxn", &format!("{rxn_id}.fasta")) {
109        return vec![r];
110    }
111
112    // 3. EC-based resolution (reviewed / unreviewed per seq_src).
113    let ec_trim = ec.trim();
114    if !ec_trim.is_empty() {
115        let mut ec_paths = Vec::new();
116        for one_ec in ec_trim.split('/').map(str::trim).filter(|s| !s.is_empty()) {
117            ec_paths.extend(resolve_by_stem(opts, one_ec));
118        }
119        if !ec_paths.is_empty() {
120            return ec_paths;
121        }
122    }
123
124    // 4. Reaction-name fallback via MD5 hash of the name bytes.
125    if !name_trim.is_empty() && !looks_like_rxn_id(name_trim) {
126        let hash = md5_hex(name_trim);
127        let hits = resolve_by_stem(opts, &hash);
128        if !hits.is_empty() {
129            return hits;
130        }
131    }
132
133    Vec::new()
134}
135
136/// `true` when a string looks like a synthetic reaction id rather than a
137/// human enzyme name. Mirrors the filter in
138/// `src/prepare_batch_alignments.R:209`:
139/// `^rxn[0-9]+$ | ^Rxn[0-9]+$ | ^RXN`.
140pub fn looks_like_rxn_id(name: &str) -> bool {
141    if let Some(rest) = name.strip_prefix("rxn") {
142        return !rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit());
143    }
144    if let Some(rest) = name.strip_prefix("Rxn") {
145        return !rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit());
146    }
147    name.starts_with("RXN")
148}
149
150/// MD5 hex digest of the UTF-8 bytes of `s` (no trailing newline).
151/// Matches `echo -n "$s" | md5sum`.
152pub fn md5_hex(s: &str) -> String {
153    use md5::{Digest, Md5};
154    let mut h = Md5::new();
155    h.update(s.as_bytes());
156    let out = h.finalize();
157    let mut hex = String::with_capacity(32);
158    for b in out.iter() {
159        hex.push_str(&format!("{b:02x}"));
160    }
161    hex
162}
163
164fn resolve_by_stem(opts: &SeqfileOptions, ec: &str) -> Vec<ResolvedSeq> {
165    let f = format!("{ec}.fasta");
166    let rev = probe(&opts.tax_root, "rev", &f);
167    let unrev = probe(&opts.tax_root, "unrev", &f);
168    match opts.seq_src {
169        SeqSrc::Reviewed => rev.into_iter().collect(),
170        SeqSrc::PreferRev => rev.or(unrev).into_iter().collect(),
171        SeqSrc::Both => vec![rev, unrev].into_iter().flatten().collect(),
172        SeqSrc::Unreviewed => unrev.into_iter().collect(),
173    }
174}
175
176fn probe(tax_root: &Path, dir: &str, filename: &str) -> Option<ResolvedSeq> {
177    let p = tax_root.join(dir).join(filename);
178    if !p.is_file() {
179        return None;
180    }
181    let meta = std::fs::metadata(&p).ok()?;
182    if meta.len() == 0 {
183        return None;
184    }
185    Some(ResolvedSeq { path: p, label: format!("{dir}/{filename}") })
186}
187
188#[cfg(test)]
189mod tests {
190    use super::*;
191    use std::fs;
192
193    #[test]
194    fn prefers_rxn_dir() {
195        let d = tempfile::tempdir().unwrap();
196        let root = d.path();
197        for sub in ["rxn", "rev", "unrev", "user"] {
198            fs::create_dir_all(root.join(sub)).unwrap();
199        }
200        fs::write(root.join("rxn/RXN-1.fasta"), ">a\nA\n").unwrap();
201        fs::write(root.join("rev/1.1.1.1.fasta"), ">b\nB\n").unwrap();
202
203        let opts = SeqfileOptions {
204            tax_root: root.into(),
205            user_root: root.into(),
206            seq_src: SeqSrc::PreferRev,
207        };
208        let r = resolve_for_reaction(&opts, "RXN-1", "1.1.1.1", "");
209        assert_eq!(r.len(), 1);
210        assert_eq!(r[0].label, "rxn/RXN-1.fasta");
211    }
212
213    #[test]
214    fn falls_back_to_reviewed_ec() {
215        let d = tempfile::tempdir().unwrap();
216        let root = d.path();
217        fs::create_dir_all(root.join("rev")).unwrap();
218        fs::write(root.join("rev/1.1.1.1.fasta"), ">b\nB\n").unwrap();
219        let opts = SeqfileOptions {
220            tax_root: root.into(),
221            user_root: root.into(),
222            seq_src: SeqSrc::PreferRev,
223        };
224        let r = resolve_for_reaction(&opts, "RXN-missing", "1.1.1.1", "");
225        assert_eq!(r.len(), 1);
226        assert_eq!(r[0].label, "rev/1.1.1.1.fasta");
227    }
228
229    #[test]
230    fn both_mode_returns_rev_and_unrev() {
231        let d = tempfile::tempdir().unwrap();
232        let root = d.path();
233        for sub in ["rev", "unrev"] {
234            fs::create_dir_all(root.join(sub)).unwrap();
235        }
236        fs::write(root.join("rev/1.1.1.1.fasta"), ">r\nR\n").unwrap();
237        fs::write(root.join("unrev/1.1.1.1.fasta"), ">u\nU\n").unwrap();
238        let opts = SeqfileOptions {
239            tax_root: root.into(),
240            user_root: root.into(),
241            seq_src: SeqSrc::Both,
242        };
243        let r = resolve_for_reaction(&opts, "RXN-missing", "1.1.1.1", "");
244        assert_eq!(r.len(), 2);
245    }
246
247    #[test]
248    fn user_override_wins() {
249        let d = tempfile::tempdir().unwrap();
250        let root = d.path();
251        for sub in ["rxn", "rev", "user"] {
252            fs::create_dir_all(root.join(sub)).unwrap();
253        }
254        fs::write(root.join("rxn/RXN-1.fasta"), ">a\n").unwrap();
255        fs::write(root.join("rev/1.1.1.1.fasta"), ">b\n").unwrap();
256        fs::write(root.join("user/RXN-1.fasta"), ">c\nC\n").unwrap();
257        let opts = SeqfileOptions {
258            tax_root: root.into(),
259            user_root: root.into(),
260            seq_src: SeqSrc::PreferRev,
261        };
262        let r = resolve_for_reaction(&opts, "RXN-1", "1.1.1.1", "");
263        assert_eq!(r[0].label, "user/RXN-1.fasta");
264    }
265
266    #[test]
267    fn md5_hex_matches_gnu_md5sum() {
268        // Verified with `echo -n "Acetaldehyde dehydrogenase" | md5sum`.
269        assert_eq!(md5_hex("Acetaldehyde dehydrogenase"), "1390704749ddc17f2b61599cf204ac4a");
270        // Empty input — well-known.
271        assert_eq!(md5_hex(""), "d41d8cd98f00b204e9800998ecf8427e");
272    }
273
274    #[test]
275    fn looks_like_rxn_id_filter() {
276        assert!(looks_like_rxn_id("rxn00001"));
277        assert!(looks_like_rxn_id("Rxn00001"));
278        assert!(looks_like_rxn_id("RXN-8099"));
279        assert!(!looks_like_rxn_id("Acetaldehyde dehydrogenase"));
280        assert!(!looks_like_rxn_id("ATP synthase"));
281        assert!(!looks_like_rxn_id(""));
282    }
283
284    #[test]
285    fn reaname_md5_fallback() {
286        let d = tempfile::tempdir().unwrap();
287        let root = d.path();
288        fs::create_dir_all(root.join("rev")).unwrap();
289        let hash = md5_hex("Acetaldehyde dehydrogenase");
290        fs::write(root.join(format!("rev/{hash}.fasta")), ">x\nX\n").unwrap();
291        let opts = SeqfileOptions {
292            tax_root: root.into(),
293            user_root: root.into(),
294            seq_src: SeqSrc::PreferRev,
295        };
296        let r = resolve_for_reaction(&opts, "RXN-missing", "", "Acetaldehyde dehydrogenase");
297        assert_eq!(r.len(), 1);
298        assert_eq!(r[0].label, format!("rev/{hash}.fasta"));
299    }
300
301    #[test]
302    fn reaname_fallback_skips_rxn_id_names() {
303        let d = tempfile::tempdir().unwrap();
304        let root = d.path();
305        fs::create_dir_all(root.join("rev")).unwrap();
306        let hash = md5_hex("rxn00001");
307        fs::write(root.join(format!("rev/{hash}.fasta")), ">x\nX\n").unwrap();
308        let opts = SeqfileOptions {
309            tax_root: root.into(),
310            user_root: root.into(),
311            seq_src: SeqSrc::PreferRev,
312        };
313        let r = resolve_for_reaction(&opts, "", "", "rxn00001");
314        // Even though the md5 file exists, we shouldn't pick it — "rxn00001"
315        // is a synthetic id.
316        assert!(r.is_empty());
317    }
318
319    #[test]
320    fn empty_file_treated_as_missing() {
321        let d = tempfile::tempdir().unwrap();
322        let root = d.path();
323        fs::create_dir_all(root.join("rxn")).unwrap();
324        fs::write(root.join("rxn/RXN-1.fasta"), "").unwrap();
325        let opts = SeqfileOptions {
326            tax_root: root.into(),
327            user_root: root.into(),
328            seq_src: SeqSrc::PreferRev,
329        };
330        let r = resolve_for_reaction(&opts, "RXN-1", "", "");
331        assert!(r.is_empty());
332    }
333}