Skip to main content

gapsmith_find/
complex.rs

1//! Port of `src/complex_detection.R`.
2//!
3//! Input: a vector of FASTA sequence descriptors (the full header text after
4//! `>`) taken from one reaction's reference files, plus the reaction id (so
5//! per-reaction subunit dictionaries apply).
6//!
7//! Output: a parallel vector of `Option<String>` giving each sequence's
8//! canonical subunit assignment, or `None` when the extractor decided the
9//! sequence is unlabeled.
10//!
11//! Algorithm (faithful to the R original, line numbers from
12//! `src/complex_detection.R`):
13//!
14//! 1. Apply 8 alternating regex patterns (com.pat1..com.pat8) to extract a
15//!    raw subunit phrase.
16//! 2. Normalize "subunit/chain/polypeptide/component" → "Subunit".
17//! 3. Reorder so the label comes after `Subunit` (`alpha Subunit` →
18//!    `Subunit alpha`).
19//! 4. Subunit-dict translation (`dat/complex_subunit_dict.tsv`): `Subunit
20//!    <synonym>` → `Subunit <canonical>`.
21//! 5. Numeral mapping: Latin I..XV → 1..15, single letters A..Z → 1..26,
22//!    greek alpha..sigma → 1..17, small/medium/large → 1..3.
23//! 6. Strip any trailing `[A-z]` on `Subunit N<letter>` (R drops
24//!    sub-sub-complexes).
25//! 7. Low-count filter: if mean count ≥ 10, drop subunits with count < 5.
26//! 8. High-quality selection: if ≥ 66% of hits are numbered subunits,
27//!    drop everything else.
28//! 9. Global cutoff: if ≤ 20% of inputs produced any hit at all, blank the
29//!    whole vector.
30
31use gapsmith_db::ComplexSubunitTable;
32use regex::Regex;
33use std::collections::HashMap;
34use std::sync::OnceLock;
35
36/// Detect one subunit per sequence descriptor for a given reaction. A
37/// `None` entry means "no subunit recognized".
38pub fn detect_subunits(
39    rxn_id: &str,
40    descriptors: &[&str],
41    dict: &ComplexSubunitTable,
42) -> Vec<Option<String>> {
43    if descriptors.is_empty() {
44        return Vec::new();
45    }
46
47    // Step 1: regex extraction.
48    let mut hits: Vec<Option<String>> = descriptors
49        .iter()
50        .map(|s| extract_subunit_phrase(s))
51        .collect();
52
53    // Steps 2-3: canonicalization.
54    for h in hits.iter_mut().flatten() {
55        *h = canonicalize_subunit(h);
56    }
57
58    // Step 4: per-reaction dictionary translation.
59    if let Some(entries) = dict.for_rxn(rxn_id) {
60        let lookup: HashMap<String, String> = entries
61            .iter()
62            .map(|(synonym, canonical)| {
63                (format!("Subunit {synonym}"), format!("Subunit {canonical}"))
64            })
65            .collect();
66        for h in hits.iter_mut().flatten() {
67            if let Some(repl) = lookup.get(h.as_str()) {
68                *h = repl.clone();
69            }
70        }
71    }
72
73    // Step 5: numeral mapping.
74    for h in hits.iter_mut().flatten() {
75        *h = apply_numeral_maps(h);
76    }
77
78    // Step 6: strip trailing letter on numeric subunits.
79    let re_sub_num_letter = sub_num_letter_re();
80    for h in hits.iter_mut().flatten() {
81        if let Some(caps) = re_sub_num_letter.captures(h) {
82            *h = caps.get(1).unwrap().as_str().to_string();
83        }
84    }
85
86    // Step 7: low-count filter.
87    apply_low_count_filter(&mut hits);
88
89    // Step 8: high-quality numbered-subunit selection.
90    apply_numbered_quality_filter(&mut hits);
91
92    // Step 9: ≤ 20% coverage ⇒ blank everything.
93    let n = hits.len();
94    let any = hits.iter().filter(|h| h.is_some()).count();
95    if any * 5 <= n {
96        hits.fill(None);
97    }
98
99    hits
100}
101
102/// Apply the union of the 8 patterns from `complex_detection.R:11-19` and
103/// return the first match.
104fn extract_subunit_phrase(descriptor: &str) -> Option<String> {
105    // The R original uses case-sensitive matches — subunit words are always
106    // lowercase in UniProt descriptors. We replicate that here.
107    static COMPILED: OnceLock<Regex> = OnceLock::new();
108    let re = COMPILED.get_or_init(|| {
109        // synonymes = (subunit|chain|polypeptide|component)
110        let syn = r"(?:subunit|chain|polypeptide|component)";
111        let numeric = format!(r"{syn} [1-9]+(?:[A-Z])?\b");
112        let caps_letters = format!(r"{syn} [A-Z]+\b");
113        let pre_caps = format!(r"\b[A-Z]+ {syn}");
114        let greek_pre = format!(
115            r"(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|my|ny|omikron|pi|rho|sigma) {syn}"
116        );
117        let greek_post = format!(
118            r"{syn} (?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|my|ny|omikron|pi|rho|sigma)"
119        );
120        let titled = format!(r"{syn} [A-Z][A-Za-z]+\b");
121        let size_pre = format!(r"(?:large|medium|small) {syn}");
122        let greek_dash = format!(
123            r"(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|my|ny|omikron|pi|rho|sigma)-{syn}"
124        );
125        // Order matters only marginally because `|` picks the leftmost; we
126        // keep the same order as the R file.
127        let combined = format!(
128            r"({numeric}|{caps_letters}|{pre_caps}|{greek_pre}|{greek_post}|{titled}|{size_pre}|{greek_dash})"
129        );
130        Regex::new(&combined).expect("complex pattern should compile")
131    });
132    re.find(descriptor).map(|m| m.as_str().to_string())
133}
134
135/// Rewrite synonyms (subunit/chain/polypeptide/component) to the canonical
136/// "Subunit" and reorder so the tag comes first.
137fn canonicalize_subunit(raw: &str) -> String {
138    static SYN: OnceLock<Regex> = OnceLock::new();
139    let re_syn = SYN.get_or_init(|| {
140        Regex::new(r"subunit|chain|polypeptide|component").expect("syn regex")
141    });
142    let mut out = re_syn.replace_all(raw, "Subunit").to_string();
143
144    // `X Subunit` → `Subunit X` (for ALL-CAPS, greek, small/medium/large).
145    static PRE_CAPS: OnceLock<Regex> = OnceLock::new();
146    let re_cap = PRE_CAPS
147        .get_or_init(|| Regex::new(r"([A-Z]+) Subunit").expect("pre-caps regex"));
148    out = re_cap.replace_all(&out, "Subunit $1").to_string();
149
150    static PRE_GREEK: OnceLock<Regex> = OnceLock::new();
151    let re_greek = PRE_GREEK.get_or_init(|| {
152        Regex::new(
153            r"(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|my|ny|omikron|pi|rho|sigma) Subunit",
154        )
155        .expect("greek pre-regex")
156    });
157    out = re_greek.replace_all(&out, "Subunit $1").to_string();
158
159    static DASH_GREEK: OnceLock<Regex> = OnceLock::new();
160    let re_dash = DASH_GREEK.get_or_init(|| {
161        Regex::new(
162            r"(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|my|ny|omikron|pi|rho|sigma)-Subunit",
163        )
164        .expect("dash greek regex")
165    });
166    out = re_dash.replace_all(&out, "Subunit $1").to_string();
167
168    static PRE_SIZE: OnceLock<Regex> = OnceLock::new();
169    let re_size =
170        PRE_SIZE.get_or_init(|| Regex::new(r"(small|medium|large) Subunit").expect("size regex"));
171    out = re_size.replace_all(&out, "Subunit $1").to_string();
172
173    out
174}
175
176fn apply_numeral_maps(raw: &str) -> String {
177    // Order matters: latin numerals first, then single letters, then greek,
178    // then size words (largest-first to avoid substring shadowing — e.g.
179    // `XIII` should match before `XII` / `XI`).
180
181    // Latin I..XV → 1..15 (case-insensitive, word-boundary only on the
182    // leading side).
183    const LATIN: &[(&str, &str)] = &[
184        ("XV", "15"),
185        ("XIV", "14"),
186        ("XIII", "13"),
187        ("XII", "12"),
188        ("XI", "11"),
189        ("X", "10"),
190        ("IX", "9"),
191        ("VIII", "8"),
192        ("VII", "7"),
193        ("VI", "6"),
194        ("V", "5"),
195        ("IV", "4"),
196        ("III", "3"),
197        ("II", "2"),
198        ("I", "1"),
199    ];
200    static LATIN_RES: OnceLock<Vec<(Regex, String)>> = OnceLock::new();
201    let latin_rs = LATIN_RES.get_or_init(|| {
202        LATIN
203            .iter()
204            .map(|(pat, rep)| {
205                (Regex::new(&format!(r"(?i)\b{pat}")).expect("latin re"), (*rep).to_string())
206            })
207            .collect()
208    });
209    let mut out = raw.to_string();
210    for (re, rep) in latin_rs {
211        out = re.replace_all(&out, rep).to_string();
212    }
213
214    // A..Z → 1..26 (case-insensitive, full word).
215    static LETTER_RES: OnceLock<Vec<(Regex, String)>> = OnceLock::new();
216    let letter_rs = LETTER_RES.get_or_init(|| {
217        (b'A'..=b'Z')
218            .enumerate()
219            .map(|(i, c)| {
220                (
221                    Regex::new(&format!(r"(?i)\b{}\b", c as char)).expect("letter re"),
222                    (i + 1).to_string(),
223                )
224            })
225            .collect()
226    });
227    for (re, rep) in letter_rs {
228        out = re.replace_all(&out, rep).to_string();
229    }
230
231    // Greek alpha..sigma → 1..17 (case-insensitive, full word).
232    const GREEK: &[&str] = &[
233        "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa",
234        "lambda", "my", "ny", "omikron", "pi", "rho", "sigma",
235    ];
236    static GREEK_RES: OnceLock<Vec<(Regex, String)>> = OnceLock::new();
237    let greek_rs = GREEK_RES.get_or_init(|| {
238        GREEK
239            .iter()
240            .enumerate()
241            .map(|(i, g)| {
242                (
243                    Regex::new(&format!(r"(?i)\b{g}\b")).expect("greek re"),
244                    (i + 1).to_string(),
245                )
246            })
247            .collect()
248    });
249    for (re, rep) in greek_rs {
250        out = re.replace_all(&out, rep).to_string();
251    }
252
253    // small/medium/large → 1/2/3. Note: R uses case-sensitive substring
254    // replacement here (no word boundaries) — we mirror that.
255    out = out.replace("small", "1").replace("medium", "2").replace("large", "3");
256    out
257}
258
259fn sub_num_letter_re() -> &'static Regex {
260    static RE: OnceLock<Regex> = OnceLock::new();
261    RE.get_or_init(|| Regex::new(r"^(Subunit [0-9]+)[A-Za-z]$").expect("sub-num-letter re"))
262}
263
264fn apply_low_count_filter(hits: &mut [Option<String>]) {
265    let mut tally: HashMap<&str, usize> = HashMap::new();
266    for h in hits.iter().flatten() {
267        *tally.entry(h.as_str()).or_insert(0) += 1;
268    }
269    if tally.is_empty() {
270        return;
271    }
272    let sum: usize = tally.values().sum();
273    let mean = sum as f64 / tally.len() as f64;
274    if mean < 10.0 {
275        return;
276    }
277    // Snapshot the low-count keys as owned strings to avoid borrow conflict.
278    let low: Vec<String> =
279        tally.iter().filter(|(_, c)| **c < 5).map(|(k, _)| (*k).to_string()).collect();
280    for h in hits.iter_mut() {
281        if let Some(inner) = h {
282            if low.iter().any(|lo| lo == inner) {
283                *h = None;
284            }
285        }
286    }
287}
288
289fn apply_numbered_quality_filter(hits: &mut [Option<String>]) {
290    static NUMBERED: OnceLock<Regex> = OnceLock::new();
291    let re = NUMBERED.get_or_init(|| Regex::new(r"^Subunit [0-9]+").expect("numbered re"));
292
293    let numbered_count = hits
294        .iter()
295        .flatten()
296        .filter(|s| re.is_match(s))
297        .count();
298    if hits.is_empty() {
299        return;
300    }
301    let cov = numbered_count as f64 / hits.len() as f64;
302    if cov < 0.66 {
303        return;
304    }
305    for h in hits.iter_mut() {
306        if let Some(inner) = h {
307            if !re.is_match(inner) {
308                *h = None;
309            }
310        }
311    }
312}
313
314#[cfg(test)]
315mod tests {
316    use super::*;
317
318    fn empty_dict() -> ComplexSubunitTable {
319        ComplexSubunitTable::default()
320    }
321
322    #[test]
323    fn extract_numeric_subunit() {
324        let r = extract_subunit_phrase("DNA-directed RNA polymerase subunit 2");
325        assert_eq!(r.as_deref(), Some("subunit 2"));
326    }
327
328    #[test]
329    fn extract_greek_subunit() {
330        let r = extract_subunit_phrase("ATP synthase alpha chain");
331        assert_eq!(r.as_deref(), Some("alpha chain"));
332    }
333
334    #[test]
335    fn extract_size_subunit() {
336        let r = extract_subunit_phrase("Ribulose bisphosphate carboxylase large chain");
337        assert_eq!(r.as_deref(), Some("large chain"));
338    }
339
340    #[test]
341    fn no_match() {
342        assert_eq!(extract_subunit_phrase("Acetaldehyde dehydrogenase"), None);
343    }
344
345    #[test]
346    fn canonicalize_reorders() {
347        assert_eq!(canonicalize_subunit("alpha chain"), "Subunit alpha");
348        assert_eq!(canonicalize_subunit("small subunit"), "Subunit small");
349        assert_eq!(canonicalize_subunit("ABC component"), "Subunit ABC");
350    }
351
352    #[test]
353    fn numeral_mapping_greek() {
354        assert_eq!(apply_numeral_maps("Subunit alpha"), "Subunit 1");
355        assert_eq!(apply_numeral_maps("Subunit sigma"), "Subunit 17");
356    }
357
358    #[test]
359    fn numeral_mapping_latin() {
360        assert_eq!(apply_numeral_maps("Subunit XIII"), "Subunit 13");
361        assert_eq!(apply_numeral_maps("Subunit IV"), "Subunit 4");
362    }
363
364    #[test]
365    fn numeral_mapping_letters() {
366        assert_eq!(apply_numeral_maps("Subunit A"), "Subunit 1");
367        assert_eq!(apply_numeral_maps("Subunit C"), "Subunit 3");
368    }
369
370    #[test]
371    fn numeral_mapping_size() {
372        assert_eq!(apply_numeral_maps("Subunit small"), "Subunit 1");
373        assert_eq!(apply_numeral_maps("Subunit large"), "Subunit 3");
374    }
375
376    #[test]
377    fn full_pipeline_on_mixed_headers() {
378        let dict = empty_dict();
379        let descriptors = &[
380            "DNA polymerase subunit alpha OS=...",
381            "DNA polymerase subunit beta OS=...",
382            "DNA polymerase subunit gamma OS=...",
383            "Unrelated protein OS=...",
384        ];
385        let out = detect_subunits("RXN-TEST", descriptors, &dict);
386        assert_eq!(out.len(), 4);
387        // With only 4 inputs and 3 matched, we're well above the 20%
388        // threshold — expect all three labeled, fourth None.
389        assert_eq!(out[0].as_deref(), Some("Subunit 1"));
390        assert_eq!(out[1].as_deref(), Some("Subunit 2"));
391        assert_eq!(out[2].as_deref(), Some("Subunit 3"));
392        assert_eq!(out[3], None);
393    }
394
395    #[test]
396    fn coverage_under_20_percent_blanks_all() {
397        let dict = empty_dict();
398        // 1 in 10 has subunit info (10%). Should blank.
399        let mut headers = vec!["random protein"; 9];
400        headers.push("DNA pol subunit alpha");
401        let out = detect_subunits("RXN", &headers, &dict);
402        assert!(out.iter().all(|h| h.is_none()));
403    }
404
405    #[test]
406    fn numbered_quality_filter_drops_non_numbered() {
407        let dict = empty_dict();
408        // 7 numbered subunits (1..7) + 1 greek + 1 random. 7/9 = 77% > 66%
409        // ⇒ non-numbered should be filtered out by step 8.
410        let hs = (1..=7)
411            .map(|i| format!("DNA pol subunit {i}"))
412            .collect::<Vec<_>>();
413        let mut input: Vec<String> = hs.clone();
414        input.push("Ribonuclease subunit alpha".into());
415        input.push("random".into());
416        let refs: Vec<&str> = input.iter().map(|s| s.as_str()).collect();
417        let out = detect_subunits("RXN", &refs, &dict);
418        // First 7 should be numbered.
419        for (i, h) in out.iter().take(7).enumerate() {
420            assert!(h.is_some(), "numbered input {} dropped: {h:?}", i + 1);
421            let s = h.as_ref().unwrap();
422            assert!(s.starts_with("Subunit "), "expected Subunit prefix: {s}");
423        }
424        // alpha becomes `Subunit 1` after numeral mapping, so it survives
425        // the numbered-quality filter (matches R's behavior exactly).
426        assert_eq!(out[7].as_deref(), Some("Subunit 1"));
427        // "random" has no subunit token at all.
428        assert_eq!(out[8], None);
429    }
430}