1use gapsmith_db::{PathwayRow, PathwayTable};
27use regex::Regex;
28use std::collections::HashSet;
29
30#[derive(Debug, Clone)]
32pub struct ExpandedReaction {
33 pub pathway: String,
34 pub pathway_name: String,
35 pub rxn: String,
36 pub name: String,
37 pub ec: String,
38 pub keyrea: bool,
39 pub spont: bool,
40}
41
42#[derive(Debug, Clone, Copy, Eq, PartialEq)]
44pub enum MatchMode {
45 Id,
47 Hierarchy,
50 Regex,
53}
54
55pub struct PathwaySelectOptions<'a> {
56 pub keyword: &'a str,
58 pub mode: MatchMode,
59 pub exclude_superpathways: bool,
62 pub only_active: bool,
64 pub valid_tax_ids: &'a [String],
68}
69
70impl<'a> Default for PathwaySelectOptions<'a> {
71 fn default() -> Self {
72 Self {
73 keyword: "all",
74 mode: MatchMode::Regex,
75 exclude_superpathways: true,
76 only_active: true,
77 valid_tax_ids: &[],
78 }
79 }
80}
81
82pub fn select(table: &PathwayTable, opts: &PathwaySelectOptions<'_>) -> Vec<ExpandedReaction> {
85 let pattern = resolve_keyword(opts.keyword);
86 let keys_lc: Vec<String> = pattern
89 .split('|')
90 .map(|s| s.trim().to_ascii_lowercase())
91 .filter(|s| !s.is_empty())
92 .collect();
93
94 let kw_re: Option<Regex> = if keys_lc.is_empty() || matches!(opts.mode, MatchMode::Hierarchy | MatchMode::Id) {
95 None
96 } else {
97 let alt = keys_lc.iter().map(|s| regex::escape(s)).collect::<Vec<_>>().join("|");
98 Some(
99 regex::RegexBuilder::new(&format!(r"\b(?:{alt})\b"))
100 .case_insensitive(true)
101 .build()
102 .expect("keyword regex compiles"),
103 )
104 };
105
106 let tax_re: Option<Regex> = if opts.valid_tax_ids.is_empty() {
107 None
108 } else {
109 let alt = opts
110 .valid_tax_ids
111 .iter()
112 .map(|s| regex::escape(s))
113 .collect::<Vec<_>>()
114 .join("|");
115 Some(Regex::new(&format!(r"TAX-(?:{alt})\b")).expect("tax regex compiles"))
116 };
117
118 let mut out = Vec::new();
119 let mut seen: HashSet<(String, String, String, String)> = HashSet::new();
120
121 for row in &table.rows {
122 if opts.only_active && !is_active(row) {
123 continue;
124 }
125 if !keys_lc.is_empty() {
126 let matched = match opts.mode {
127 MatchMode::Id => keys_lc.iter().any(|k| row.id.to_ascii_lowercase() == *k),
128 MatchMode::Hierarchy => hierarchy_matches_any(&row.hierarchy, &keys_lc),
129 MatchMode::Regex => kw_re.as_ref().unwrap().is_match(&row.id)
130 || kw_re.as_ref().unwrap().is_match(&row.hierarchy)
131 || kw_re.as_ref().unwrap().is_match(&row.name)
132 || kw_re.as_ref().unwrap().is_match(&row.altname),
133 };
134 if !matched {
135 continue;
136 }
137 }
138 if opts.exclude_superpathways && is_superpathway(row) {
140 continue;
141 }
142 if row.rea_id.trim().is_empty() {
144 continue;
145 }
146 if let Some(ref t_re) = tax_re {
148 if !row.taxrange.is_empty() && !t_re.is_match(&row.taxrange) {
149 continue;
150 }
151 }
152 expand_row(row, &mut out, &mut seen);
153 }
154 out
155}
156
157fn hierarchy_matches_any(hierarchy: &str, keys_lc: &[String]) -> bool {
158 for token in hierarchy.split(['|', ';']) {
161 let t = token.trim().to_ascii_lowercase();
162 if keys_lc.iter().any(|k| k == &t) {
163 return true;
164 }
165 }
166 false
167}
168
169fn is_superpathway(row: &PathwayRow) -> bool {
170 matches!(row.superpathway.to_ascii_lowercase().as_str(), "true" | "t" | "1")
171 || row.hierarchy.contains("Super-Pathways")
172}
173
174pub fn resolve_keyword(kw: &str) -> String {
177 let aa_syn =
178 "Amino-Acid-Biosynthesis|Nucleotide-Biosynthesis|Cofactor-Biosynthesis|Carbohydrates-Degradation|CARBO-BIOSYNTHESIS|Polyamine-Biosynthesis|Fatty-acid-biosynthesis|Energy-Metabolism|Terpenoid-Biosynthesis|Chorismate-Biosynthesis";
179 let min_pwys = "ETOH-ACETYLCOA-ANA-PWY|GLNSYN-PWY|GLUCONEO-PWY|GLUGLNSYN-PWY|GLUTAMATE-DEG1-PWY|GLUTAMATE-SYN2-PWY|GLUTAMINEFUM-PWY|GLUTSYNIII-PWY|GLYCOLYSIS|GLYOXYLATE-BYPASS|NONOXIPENT-PWY|OXIDATIVEPENT-PWY|P185-PWY|P21-PWY|PWY0-1312|PWY0-1315|PWY0-1329|PWY0-1334|PWY0-1335|PWY0-1353|PWY0-1517|PWY0-1565|PWY0-1567|PWY0-1568|PWY-4341|PWY-5084|PWY-5480|PWY-5482|PWY-5484|PWY-5690|PWY-5766|PWY-5913|PWY-6028|PWY-6333|PWY-6543|PWY-6549|PWY66-21|PWY66-398|PWY-6697|PWY-6964|PWY-7167|PWY-7685|PWY-7686|PWY-7980|PWY-8178|PWY-8215|PWY-8274|PWY-8404|PYRUVDEHYD-PWY|TCA-1|TCA";
180 match kw {
181 "" => String::new(),
182 "all" => "Pathways|Enzyme-Test|seed|kegg".into(),
183 "amino" => "Amino-Acid-Biosynthesis".into(),
184 "nucl" => "Nucleotide-Biosynthesis".into(),
185 "cofactor" => "Cofactor-Biosynthesis".into(),
186 "carbo" => "CARBO-BIOSYNTHESIS".into(),
187 "carbo-deg" => "Carbohydrates-Degradation".into(),
188 "polyamine" => "Polyamine-Biosynthesis".into(),
189 "fatty" => "Fatty-acid-biosynthesis".into(),
190 "energy" => "Energy-Metabolism".into(),
191 "terpenoid" => "Terpenoid-Biosynthesis".into(),
192 "degradation" => "Degradation".into(),
193 "core" => aa_syn.into(),
194 "min" => min_pwys.into(),
195 "kegg" => "kegg".into(),
196 other => other.into(), }
198}
199
200fn is_active(row: &PathwayRow) -> bool {
201 matches!(row.status.to_ascii_lowercase().as_str(), "true" | "t" | "1")
202}
203
204
205fn expand_row(
206 row: &PathwayRow,
207 out: &mut Vec<ExpandedReaction>,
208 seen: &mut HashSet<(String, String, String, String)>,
209) {
210 let rxns = split_positional(&row.rea_id, ',');
214 let ecs = split_positional(&row.rea_ec, ',');
215 let names = split_positional(&row.rea_name, ';');
216 let keys = split_csv(&row.key_rea);
217 let spont = split_csv(&row.spont);
218
219 let key_set: HashSet<&str> = keys.iter().copied().collect();
220 let spont_set: HashSet<&str> = spont.iter().copied().collect();
221
222 for (i, r) in rxns.iter().enumerate() {
226 let r = r.as_str();
227 if r.is_empty() {
228 continue;
229 }
230 let ec = ecs.get(i).map(|s| s.as_str()).unwrap_or("");
231 let name = names.get(i).map(|s| s.as_str()).unwrap_or("");
232 let key = r == row.id || key_set.contains(r);
233 let sp = spont_set.contains(r);
234 let dedup = (row.id.clone(), r.to_string(), name.to_string(), ec.to_string());
235 if seen.insert(dedup) {
236 out.push(ExpandedReaction {
237 pathway: row.id.clone(),
238 pathway_name: row.name.clone(),
239 rxn: r.to_string(),
240 name: name.to_string(),
241 ec: ec.to_string(),
242 keyrea: key,
243 spont: sp,
244 });
245 }
246 }
247}
248
249fn split_csv(s: &str) -> Vec<&str> {
250 s.split(',').map(str::trim).filter(|s| !s.is_empty()).collect()
251}
252
253fn split_positional(s: &str, sep: char) -> Vec<String> {
257 if s.is_empty() {
258 return Vec::new();
259 }
260 s.split(sep).map(|t| t.trim().to_string()).collect()
261}
262
263#[cfg(test)]
264mod tests {
265 use super::*;
266 use gapsmith_db::PwySource;
267
268 fn toy_table() -> PathwayTable {
269 PathwayTable {
270 source: Some(PwySource::MetaCyc),
271 rows: vec![
272 PathwayRow {
273 id: "PWY-A".into(),
274 name: "Glycolysis".into(),
275 altname: "".into(),
276 hierarchy: "Carbohydrate metabolism".into(),
277 taxrange: "".into(),
278 rea_id: "rxn1,rxn2,rxn3".into(),
279 rea_ec: "1.1.1.1,2.2.2.2,3.3.3.3".into(),
280 key_rea: "rxn1".into(),
281 rea_name: "enz1;enz2;enz3".into(),
282 rea_nr: 3,
283 ec_nr: 3,
284 superpathway: "".into(),
285 status: "TRUE".into(),
286 spont: "rxn3".into(),
287 source: PwySource::MetaCyc,
288 },
289 PathwayRow {
290 id: "PWY-B".into(),
291 name: "Lysine biosynthesis".into(),
292 altname: "".into(),
293 hierarchy: "Amino acid".into(),
294 taxrange: "".into(),
295 rea_id: "rxn10".into(),
296 rea_ec: "4.4.4.4".into(),
297 key_rea: "".into(),
298 rea_name: "enz10".into(),
299 rea_nr: 1,
300 ec_nr: 1,
301 superpathway: "".into(),
302 status: "TRUE".into(),
303 spont: "".into(),
304 source: PwySource::MetaCyc,
305 },
306 ],
307 }
308 }
309
310 fn opts_with_keyword(kw: &'static str) -> PathwaySelectOptions<'static> {
311 PathwaySelectOptions {
312 keyword: kw,
313 mode: MatchMode::Regex,
314 exclude_superpathways: true,
315 only_active: true,
316 valid_tax_ids: &[],
317 }
318 }
319
320 #[test]
321 fn keyword_matches_name_word_boundary() {
322 let t = toy_table();
323 let exp = select(&t, &opts_with_keyword("Glycolysis"));
324 assert_eq!(exp.len(), 3);
325 assert_eq!(exp[0].pathway, "PWY-A");
326 }
327
328 #[test]
329 fn keyword_all_expands_every_active_pathway() {
330 let mut t = toy_table();
334 for r in &mut t.rows {
335 r.hierarchy.push_str(",Pathways");
336 }
337 let exp = select(&t, &opts_with_keyword("all"));
338 assert_eq!(exp.len(), 4);
339 }
340
341 #[test]
342 fn keyword_shorthand_resolves_to_hierarchy() {
343 let mut t = toy_table();
344 t.rows[1].hierarchy = "Amino-Acid-Biosynthesis".into();
345 let exp = select(
346 &t,
347 &PathwaySelectOptions {
348 keyword: "amino",
349 mode: MatchMode::Hierarchy,
350 exclude_superpathways: true,
351 only_active: true,
352 valid_tax_ids: &[],
353 },
354 );
355 assert_eq!(exp.len(), 1);
356 assert_eq!(exp[0].pathway, "PWY-B");
357 }
358
359 #[test]
360 fn taxonomy_filter() {
361 let mut t = toy_table();
362 t.rows[0].taxrange = "|TAX-4751|".into(); t.rows[1].taxrange = "|TAX-2|".into(); let tax_ids = vec!["2".to_string(), "1224".to_string()];
365 let exp = select(
366 &t,
367 &PathwaySelectOptions {
368 keyword: "all",
369 mode: MatchMode::Regex,
370 exclude_superpathways: true,
371 only_active: true,
372 valid_tax_ids: &tax_ids,
373 },
374 );
375 let _ = exp; let exp = select(
379 &t,
380 &PathwaySelectOptions {
381 keyword: "Lysine",
382 mode: MatchMode::Regex,
383 exclude_superpathways: true,
384 only_active: true,
385 valid_tax_ids: &tax_ids,
386 },
387 );
388 assert_eq!(exp.len(), 1);
389 assert_eq!(exp[0].pathway, "PWY-B");
390 }
391}