1use gapsmith_db::ComplexSubunitTable;
32use regex::Regex;
33use std::collections::HashMap;
34use std::sync::OnceLock;
35
36pub fn detect_subunits(
39 rxn_id: &str,
40 descriptors: &[&str],
41 dict: &ComplexSubunitTable,
42) -> Vec<Option<String>> {
43 if descriptors.is_empty() {
44 return Vec::new();
45 }
46
47 let mut hits: Vec<Option<String>> = descriptors
49 .iter()
50 .map(|s| extract_subunit_phrase(s))
51 .collect();
52
53 for h in hits.iter_mut().flatten() {
55 *h = canonicalize_subunit(h);
56 }
57
58 if let Some(entries) = dict.for_rxn(rxn_id) {
60 let lookup: HashMap<String, String> = entries
61 .iter()
62 .map(|(synonym, canonical)| {
63 (format!("Subunit {synonym}"), format!("Subunit {canonical}"))
64 })
65 .collect();
66 for h in hits.iter_mut().flatten() {
67 if let Some(repl) = lookup.get(h.as_str()) {
68 *h = repl.clone();
69 }
70 }
71 }
72
73 for h in hits.iter_mut().flatten() {
75 *h = apply_numeral_maps(h);
76 }
77
78 let re_sub_num_letter = sub_num_letter_re();
80 for h in hits.iter_mut().flatten() {
81 if let Some(caps) = re_sub_num_letter.captures(h) {
82 *h = caps.get(1).unwrap().as_str().to_string();
83 }
84 }
85
86 apply_low_count_filter(&mut hits);
88
89 apply_numbered_quality_filter(&mut hits);
91
92 let n = hits.len();
94 let any = hits.iter().filter(|h| h.is_some()).count();
95 if any * 5 <= n {
96 hits.fill(None);
97 }
98
99 hits
100}
101
102fn extract_subunit_phrase(descriptor: &str) -> Option<String> {
105 static COMPILED: OnceLock<Regex> = OnceLock::new();
108 let re = COMPILED.get_or_init(|| {
109 let syn = r"(?:subunit|chain|polypeptide|component)";
111 let numeric = format!(r"{syn} [1-9]+(?:[A-Z])?\b");
112 let caps_letters = format!(r"{syn} [A-Z]+\b");
113 let pre_caps = format!(r"\b[A-Z]+ {syn}");
114 let greek_pre = format!(
115 r"(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|my|ny|omikron|pi|rho|sigma) {syn}"
116 );
117 let greek_post = format!(
118 r"{syn} (?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|my|ny|omikron|pi|rho|sigma)"
119 );
120 let titled = format!(r"{syn} [A-Z][A-Za-z]+\b");
121 let size_pre = format!(r"(?:large|medium|small) {syn}");
122 let greek_dash = format!(
123 r"(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|my|ny|omikron|pi|rho|sigma)-{syn}"
124 );
125 let combined = format!(
128 r"({numeric}|{caps_letters}|{pre_caps}|{greek_pre}|{greek_post}|{titled}|{size_pre}|{greek_dash})"
129 );
130 Regex::new(&combined).expect("complex pattern should compile")
131 });
132 re.find(descriptor).map(|m| m.as_str().to_string())
133}
134
135fn canonicalize_subunit(raw: &str) -> String {
138 static SYN: OnceLock<Regex> = OnceLock::new();
139 let re_syn = SYN.get_or_init(|| {
140 Regex::new(r"subunit|chain|polypeptide|component").expect("syn regex")
141 });
142 let mut out = re_syn.replace_all(raw, "Subunit").to_string();
143
144 static PRE_CAPS: OnceLock<Regex> = OnceLock::new();
146 let re_cap = PRE_CAPS
147 .get_or_init(|| Regex::new(r"([A-Z]+) Subunit").expect("pre-caps regex"));
148 out = re_cap.replace_all(&out, "Subunit $1").to_string();
149
150 static PRE_GREEK: OnceLock<Regex> = OnceLock::new();
151 let re_greek = PRE_GREEK.get_or_init(|| {
152 Regex::new(
153 r"(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|my|ny|omikron|pi|rho|sigma) Subunit",
154 )
155 .expect("greek pre-regex")
156 });
157 out = re_greek.replace_all(&out, "Subunit $1").to_string();
158
159 static DASH_GREEK: OnceLock<Regex> = OnceLock::new();
160 let re_dash = DASH_GREEK.get_or_init(|| {
161 Regex::new(
162 r"(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|my|ny|omikron|pi|rho|sigma)-Subunit",
163 )
164 .expect("dash greek regex")
165 });
166 out = re_dash.replace_all(&out, "Subunit $1").to_string();
167
168 static PRE_SIZE: OnceLock<Regex> = OnceLock::new();
169 let re_size =
170 PRE_SIZE.get_or_init(|| Regex::new(r"(small|medium|large) Subunit").expect("size regex"));
171 out = re_size.replace_all(&out, "Subunit $1").to_string();
172
173 out
174}
175
176fn apply_numeral_maps(raw: &str) -> String {
177 const LATIN: &[(&str, &str)] = &[
184 ("XV", "15"),
185 ("XIV", "14"),
186 ("XIII", "13"),
187 ("XII", "12"),
188 ("XI", "11"),
189 ("X", "10"),
190 ("IX", "9"),
191 ("VIII", "8"),
192 ("VII", "7"),
193 ("VI", "6"),
194 ("V", "5"),
195 ("IV", "4"),
196 ("III", "3"),
197 ("II", "2"),
198 ("I", "1"),
199 ];
200 static LATIN_RES: OnceLock<Vec<(Regex, String)>> = OnceLock::new();
201 let latin_rs = LATIN_RES.get_or_init(|| {
202 LATIN
203 .iter()
204 .map(|(pat, rep)| {
205 (Regex::new(&format!(r"(?i)\b{pat}")).expect("latin re"), (*rep).to_string())
206 })
207 .collect()
208 });
209 let mut out = raw.to_string();
210 for (re, rep) in latin_rs {
211 out = re.replace_all(&out, rep).to_string();
212 }
213
214 static LETTER_RES: OnceLock<Vec<(Regex, String)>> = OnceLock::new();
216 let letter_rs = LETTER_RES.get_or_init(|| {
217 (b'A'..=b'Z')
218 .enumerate()
219 .map(|(i, c)| {
220 (
221 Regex::new(&format!(r"(?i)\b{}\b", c as char)).expect("letter re"),
222 (i + 1).to_string(),
223 )
224 })
225 .collect()
226 });
227 for (re, rep) in letter_rs {
228 out = re.replace_all(&out, rep).to_string();
229 }
230
231 const GREEK: &[&str] = &[
233 "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa",
234 "lambda", "my", "ny", "omikron", "pi", "rho", "sigma",
235 ];
236 static GREEK_RES: OnceLock<Vec<(Regex, String)>> = OnceLock::new();
237 let greek_rs = GREEK_RES.get_or_init(|| {
238 GREEK
239 .iter()
240 .enumerate()
241 .map(|(i, g)| {
242 (
243 Regex::new(&format!(r"(?i)\b{g}\b")).expect("greek re"),
244 (i + 1).to_string(),
245 )
246 })
247 .collect()
248 });
249 for (re, rep) in greek_rs {
250 out = re.replace_all(&out, rep).to_string();
251 }
252
253 out = out.replace("small", "1").replace("medium", "2").replace("large", "3");
256 out
257}
258
259fn sub_num_letter_re() -> &'static Regex {
260 static RE: OnceLock<Regex> = OnceLock::new();
261 RE.get_or_init(|| Regex::new(r"^(Subunit [0-9]+)[A-Za-z]$").expect("sub-num-letter re"))
262}
263
264fn apply_low_count_filter(hits: &mut [Option<String>]) {
265 let mut tally: HashMap<&str, usize> = HashMap::new();
266 for h in hits.iter().flatten() {
267 *tally.entry(h.as_str()).or_insert(0) += 1;
268 }
269 if tally.is_empty() {
270 return;
271 }
272 let sum: usize = tally.values().sum();
273 let mean = sum as f64 / tally.len() as f64;
274 if mean < 10.0 {
275 return;
276 }
277 let low: Vec<String> =
279 tally.iter().filter(|(_, c)| **c < 5).map(|(k, _)| (*k).to_string()).collect();
280 for h in hits.iter_mut() {
281 if let Some(inner) = h {
282 if low.iter().any(|lo| lo == inner) {
283 *h = None;
284 }
285 }
286 }
287}
288
289fn apply_numbered_quality_filter(hits: &mut [Option<String>]) {
290 static NUMBERED: OnceLock<Regex> = OnceLock::new();
291 let re = NUMBERED.get_or_init(|| Regex::new(r"^Subunit [0-9]+").expect("numbered re"));
292
293 let numbered_count = hits
294 .iter()
295 .flatten()
296 .filter(|s| re.is_match(s))
297 .count();
298 if hits.is_empty() {
299 return;
300 }
301 let cov = numbered_count as f64 / hits.len() as f64;
302 if cov < 0.66 {
303 return;
304 }
305 for h in hits.iter_mut() {
306 if let Some(inner) = h {
307 if !re.is_match(inner) {
308 *h = None;
309 }
310 }
311 }
312}
313
314#[cfg(test)]
315mod tests {
316 use super::*;
317
318 fn empty_dict() -> ComplexSubunitTable {
319 ComplexSubunitTable::default()
320 }
321
322 #[test]
323 fn extract_numeric_subunit() {
324 let r = extract_subunit_phrase("DNA-directed RNA polymerase subunit 2");
325 assert_eq!(r.as_deref(), Some("subunit 2"));
326 }
327
328 #[test]
329 fn extract_greek_subunit() {
330 let r = extract_subunit_phrase("ATP synthase alpha chain");
331 assert_eq!(r.as_deref(), Some("alpha chain"));
332 }
333
334 #[test]
335 fn extract_size_subunit() {
336 let r = extract_subunit_phrase("Ribulose bisphosphate carboxylase large chain");
337 assert_eq!(r.as_deref(), Some("large chain"));
338 }
339
340 #[test]
341 fn no_match() {
342 assert_eq!(extract_subunit_phrase("Acetaldehyde dehydrogenase"), None);
343 }
344
345 #[test]
346 fn canonicalize_reorders() {
347 assert_eq!(canonicalize_subunit("alpha chain"), "Subunit alpha");
348 assert_eq!(canonicalize_subunit("small subunit"), "Subunit small");
349 assert_eq!(canonicalize_subunit("ABC component"), "Subunit ABC");
350 }
351
352 #[test]
353 fn numeral_mapping_greek() {
354 assert_eq!(apply_numeral_maps("Subunit alpha"), "Subunit 1");
355 assert_eq!(apply_numeral_maps("Subunit sigma"), "Subunit 17");
356 }
357
358 #[test]
359 fn numeral_mapping_latin() {
360 assert_eq!(apply_numeral_maps("Subunit XIII"), "Subunit 13");
361 assert_eq!(apply_numeral_maps("Subunit IV"), "Subunit 4");
362 }
363
364 #[test]
365 fn numeral_mapping_letters() {
366 assert_eq!(apply_numeral_maps("Subunit A"), "Subunit 1");
367 assert_eq!(apply_numeral_maps("Subunit C"), "Subunit 3");
368 }
369
370 #[test]
371 fn numeral_mapping_size() {
372 assert_eq!(apply_numeral_maps("Subunit small"), "Subunit 1");
373 assert_eq!(apply_numeral_maps("Subunit large"), "Subunit 3");
374 }
375
376 #[test]
377 fn full_pipeline_on_mixed_headers() {
378 let dict = empty_dict();
379 let descriptors = &[
380 "DNA polymerase subunit alpha OS=...",
381 "DNA polymerase subunit beta OS=...",
382 "DNA polymerase subunit gamma OS=...",
383 "Unrelated protein OS=...",
384 ];
385 let out = detect_subunits("RXN-TEST", descriptors, &dict);
386 assert_eq!(out.len(), 4);
387 assert_eq!(out[0].as_deref(), Some("Subunit 1"));
390 assert_eq!(out[1].as_deref(), Some("Subunit 2"));
391 assert_eq!(out[2].as_deref(), Some("Subunit 3"));
392 assert_eq!(out[3], None);
393 }
394
395 #[test]
396 fn coverage_under_20_percent_blanks_all() {
397 let dict = empty_dict();
398 let mut headers = vec!["random protein"; 9];
400 headers.push("DNA pol subunit alpha");
401 let out = detect_subunits("RXN", &headers, &dict);
402 assert!(out.iter().all(|h| h.is_none()));
403 }
404
405 #[test]
406 fn numbered_quality_filter_drops_non_numbered() {
407 let dict = empty_dict();
408 let hs = (1..=7)
411 .map(|i| format!("DNA pol subunit {i}"))
412 .collect::<Vec<_>>();
413 let mut input: Vec<String> = hs.clone();
414 input.push("Ribonuclease subunit alpha".into());
415 input.push("random".into());
416 let refs: Vec<&str> = input.iter().map(|s| s.as_str()).collect();
417 let out = detect_subunits("RXN", &refs, &dict);
418 for (i, h) in out.iter().take(7).enumerate() {
420 assert!(h.is_some(), "numbered input {} dropped: {h:?}", i + 1);
421 let s = h.as_ref().unwrap();
422 assert!(s.starts_with("Subunit "), "expected Subunit prefix: {s}");
423 }
424 assert_eq!(out[7].as_deref(), Some("Subunit 1"));
427 assert_eq!(out[8], None);
429 }
430}