Skip to main content

gapsmith_db/
stoich_parse.rs

1//! Parser for SEED stoichiometry strings.
2//!
3//! Format (one term per reaction participant, `;`-separated):
4//!
5//! ```text
6//! <coef>:<cpd_id>:<compartment>:<reserved>:"<name>"
7//! ```
8//!
9//! Example (from `dat/seed_reactions_corrected.tsv`):
10//!
11//! ```text
12//! -1:cpd00001:0:0:"H2O";-1:cpd00012:0:0:"PPi";2:cpd00009:0:0:"Phosphate";1:cpd00067:0:0:"H+"
13//! ```
14//!
15//! Cross-ref: `src/construct_full_model.R:28–35` in the R reference
16//! implementation.
17//!
18//! Design notes:
19//!
20//! - We split on the top-level `;` and `:` only — a `;` or `:` inside a
21//!   double-quoted `name` field must be preserved. In practice the current
22//!   gapseq data does not contain such cases, but the parser handles them
23//!   anyway so future DB updates don't break us silently.
24//! - The name field is optional (the whole trailing `:"..."` may be absent).
25//! - Leading/trailing whitespace is trimmed.
26
27use gapsmith_core::CpdId;
28
29#[derive(Clone, Debug, PartialEq)]
30pub struct StoichTerm {
31    pub coef: f64,
32    pub cpd: CpdId,
33    pub compartment: u8,
34    /// Reserved field seen as `0` in every real row. Preserved for fidelity.
35    pub reserved: u8,
36    /// Human-readable metabolite name as given in the stoichiometry field.
37    /// May be empty if absent from the source.
38    pub name: String,
39}
40
41#[derive(Debug, thiserror::Error)]
42pub enum StoichParseError {
43    #[error("term {index} (`{fragment}`): {msg}")]
44    BadTerm {
45        index: usize,
46        fragment: String,
47        msg: String,
48    },
49    #[error("empty stoichiometry string")]
50    Empty,
51}
52
53/// Parse a full stoichiometry field.
54pub fn parse_stoichiometry(s: &str) -> Result<Vec<StoichTerm>, StoichParseError> {
55    let s = s.trim();
56    if s.is_empty() {
57        return Err(StoichParseError::Empty);
58    }
59    let mut out = Vec::new();
60    for (i, raw_term) in split_top_level(s, ';').iter().enumerate() {
61        let t = raw_term.trim();
62        if t.is_empty() {
63            continue;
64        }
65        out.push(parse_term(i, t)?);
66    }
67    if out.is_empty() {
68        return Err(StoichParseError::Empty);
69    }
70    Ok(out)
71}
72
73fn parse_term(index: usize, t: &str) -> Result<StoichTerm, StoichParseError> {
74    let parts = split_top_level(t, ':');
75    if parts.len() < 3 {
76        return Err(StoichParseError::BadTerm {
77            index,
78            fragment: t.to_string(),
79            msg: format!("expected at least 3 `:`-separated fields, got {}", parts.len()),
80        });
81    }
82
83    let coef: f64 = parts[0].trim().parse().map_err(|_| StoichParseError::BadTerm {
84        index,
85        fragment: t.to_string(),
86        msg: format!("coefficient `{}` is not a number", parts[0]),
87    })?;
88    let cpd = CpdId::new(parts[1].trim());
89    let compartment: u8 = parts[2].trim().parse().map_err(|_| StoichParseError::BadTerm {
90        index,
91        fragment: t.to_string(),
92        msg: format!("compartment `{}` is not u8", parts[2]),
93    })?;
94    let reserved: u8 = if parts.len() >= 4 {
95        parts[3].trim().parse().unwrap_or(0)
96    } else {
97        0
98    };
99    let name = if parts.len() >= 5 {
100        // Remaining fields may contain `:` inside quoted names — rejoin.
101        let joined = parts[4..].join(":");
102        strip_quotes(joined.trim())
103    } else {
104        String::new()
105    };
106    Ok(StoichTerm { coef, cpd, compartment, reserved, name })
107}
108
/// Split `s` on every `delim` that is not inside a `"..."` double-quoted
/// section.
///
/// An unescaped `"` toggles the quoted state. A backslash and the character
/// right after it are copied through verbatim, so `\"` does not toggle quoting
/// and an escaped delimiter does not split. Quotes and backslashes are kept in
/// the returned pieces. The result always has at least one element (an empty
/// string for empty input).
fn split_top_level(s: &str, delim: char) -> Vec<String> {
    let mut pieces = Vec::new();
    let mut current = String::new();
    let mut quoted = false;
    let mut chars = s.chars();

    while let Some(ch) = chars.next() {
        if ch == '\\' {
            // Copy the escape pair as-is; a trailing lone backslash is kept too.
            current.push(ch);
            if let Some(escaped) = chars.next() {
                current.push(escaped);
            }
        } else if ch == '"' {
            quoted = !quoted;
            current.push(ch);
        } else if ch == delim && !quoted {
            pieces.push(std::mem::take(&mut current));
        } else {
            current.push(ch);
        }
    }

    pieces.push(current);
    pieces
}
140
/// Return `s` without its enclosing pair of double quotes.
///
/// The quotes are removed only when `s` both starts and ends with `"` and
/// those are two distinct characters; anything else (including a lone `"`)
/// is returned unchanged. Quotes in the interior are never touched.
fn strip_quotes(s: &str) -> String {
    s.strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .unwrap_or(s)
        .to_string()
}
149
#[cfg(test)]
mod tests {
    use super::*;

    /// The four-term example row from the module docs parses field by field.
    #[test]
    fn canonical_row() {
        let input = r#"-1:cpd00001:0:0:"H2O";-1:cpd00012:0:0:"PPi";2:cpd00009:0:0:"Phosphate";1:cpd00067:0:0:"H+""#;
        let parsed = parse_stoichiometry(input).unwrap();
        assert_eq!(parsed.len(), 4);

        let water = &parsed[0];
        assert_eq!(water.coef, -1.0);
        assert_eq!(water.cpd.as_str(), "cpd00001");
        assert_eq!(water.compartment, 0);
        assert_eq!(water.name, "H2O");

        assert_eq!(parsed[2].coef, 2.0);
        assert_eq!(parsed[3].name, "H+");
    }

    #[test]
    fn five_term_real_row() {
        // Actual row observed in `dat/seed_reactions_corrected.tsv`.
        let input = r#"-1:cpd00001:0:0:"H2O";-3:cpd00067:0:0:"H+";-1:cpd00742:0:0:"Allophanate";2:cpd00011:0:0:"CO2";2:cpd00013:0:0:"NH3""#;
        let parsed = parse_stoichiometry(input).unwrap();
        assert_eq!(parsed.len(), 5);

        let total: f64 = parsed.iter().map(|term| term.coef).sum();
        assert_eq!(total, -1.0 - 3.0 - 1.0 + 2.0 + 2.0);
    }

    /// A `:` inside the quoted name must not be treated as a field separator.
    #[test]
    fn colon_inside_quoted_name_preserved() {
        let input = r#"-1:cpd00001:0:0:"X: Y: Z";1:cpd00002:0:0:"W""#;
        let parsed = parse_stoichiometry(input).unwrap();
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].name, "X: Y: Z");
    }

    /// A `;` inside the quoted name must not be treated as a term separator.
    #[test]
    fn semicolon_inside_quoted_name_preserved() {
        let input = r#"-1:cpd00001:0:0:"A; B";1:cpd00002:0:0:"C""#;
        let parsed = parse_stoichiometry(input).unwrap();
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].name, "A; B");
        assert_eq!(parsed[1].cpd.as_str(), "cpd00002");
    }

    #[test]
    fn missing_name_is_ok() {
        let parsed = parse_stoichiometry("-1:cpd00001:0:0").unwrap();
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].name, "");
    }

    #[test]
    fn fractional_coef() {
        let parsed = parse_stoichiometry("-0.5:cpd00001:0:0:\"A\"").unwrap();
        assert_eq!(parsed[0].coef, -0.5);
    }

    /// Blank input and input made only of separators both report `Empty`.
    #[test]
    fn empty_is_error() {
        for input in ["", "   ", ";;;"] {
            let err = parse_stoichiometry(input).unwrap_err();
            assert!(matches!(err, StoichParseError::Empty));
        }
    }

    #[test]
    fn bad_term_reports_index() {
        let outcome = parse_stoichiometry("-1:cpd:0:0:X;bad");
        let err = outcome.unwrap_err();
        assert!(matches!(err, StoichParseError::BadTerm { index: 1, .. }));
    }
}