Skip to main content

gapsmith_db/
biomass.rs

1//! Biomass JSON template loader.
2//!
3//! Input: `dat/biomass/biomass_{Gram_pos,Gram_neg,archaea}.json`.
4//!
5//! Schema (inferred from the real files — no schema document ships with
6//! gapseq):
7//!
8//! ```json
9//! {
10//!   "id":        "Gram_neg",
11//!   "name":      "Bacterial Gram-negative biomass reaction",
12//!   "ref":       "derived from ...",
13//!   "energy_GAM": 40,
14//!   "domain":    "Bacteria",
15//!   "met_groups": [
16//!     {
17//!       "group_name":      "DNA",
18//!       "mass":            0.031,
19//!       "unit_group":      "g",
20//!       "unit_components": "MOLFRACTION",
21//!       "components": [
22//!         {"id":"cpd00115","name":"dATP","comp":"c","coef":0.246,
23//!          "link":"cpd00012:-1"}
24//!       ]
25//!     }
26//!   ]
27//! }
28//! ```
29//!
30//! Some fields carry gapseq-specific quirks (see `src/parse_BMjson.R:1–108`):
31//!
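//! - `link` is optional; an empty string means "no coupled product".
//!   Otherwise it encodes one or more coupled metabolites as
//!   `"<cpd>:<coef>"` terms, with `|` separating multiple terms
//!   (e.g. `"cpd01997:-1|cpd03422:-1"`).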
34//! - `comp` is a single character (`c`, `e`, `p`) that maps to the standard
35//!   compartments at model-build time.
36//! - `unit_group` is the measurement unit for the *group* mass; gapseq
37//!   assumes `"g"`.
38//! - `unit_components` describes how component coefficients combine:
39//!   `"MOLFRACTION"` → per-mole fraction, others exist but are rare.
40
41use serde::{Deserialize, Serialize};
42use std::path::Path;
43
/// Errors reported while loading a biomass template from disk.
///
/// Both variants record the path that was being loaded and keep the
/// underlying error as their `source`.
#[derive(Debug, thiserror::Error)]
pub enum BiomassError {
    /// The template file could not be opened.
    #[error("i/o error on `{path}`: {source}")]
    Io {
        /// Path of the file that was being opened.
        path: std::path::PathBuf,
        /// Underlying I/O error.
        #[source]
        source: std::io::Error,
    },
    /// The template file was opened but `serde_json` failed to deserialize it.
    #[error("JSON parse error on `{path}`: {source}")]
    Json {
        /// Path of the file that was being parsed.
        path: std::path::PathBuf,
        /// Underlying `serde_json` error.
        #[source]
        source: serde_json::Error,
    },
}
59
/// One entry of a biomass group's `components` array.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BiomassComponent {
    /// Compound identifier, e.g. `"cpd00115"`.
    pub id: String,
    /// Human-readable compound name; empty when the JSON omits it.
    #[serde(default)]
    pub name: String,
    /// Compartment code: `c` (cytosol), `e` (extracellular), `p` (periplasm).
    pub comp: String,
    /// Coefficient of this component within its group. How coefficients
    /// combine is described by the group's `unit_components`.
    pub coef: f64,
    /// Optional coupled metabolite(s), encoded as `"<cpd>:<coef>"` terms
    /// separated by `|`. Empty when absent, and then omitted on serialization.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub link: String,
}
72
73impl BiomassComponent {
74    /// Decode the `link` field into (metabolite id, stoichiometric coefficient).
75    ///
76    /// Single link form. For entries like `"cpd01997:-1|cpd03422:-1"` with
77    /// multiple coupled metabolites, use [`Self::links`] instead.
78    pub fn link(&self) -> Option<(&str, f64)> {
79        self.links().into_iter().next()
80    }
81
82    /// Decode the `link` field into every `(cpd, coef)` it encodes. The
83    /// `|`-separator is how gapseq chains multiple couplings on one
84    /// biomass component (e.g., Calomide → consume both cpd01997 and
85    /// cpd03422).
86    pub fn links(&self) -> Vec<(&str, f64)> {
87        if self.link.is_empty() {
88            return Vec::new();
89        }
90        self.link
91            .split('|')
92            .filter_map(|term| {
93                let (cpd, coef) = term.split_once(':')?;
94                coef.trim().parse::<f64>().ok().map(|c| (cpd.trim(), c))
95            })
96            .collect()
97    }
98}
99
/// One entry of a template's `met_groups` array: a named group of
/// components (e.g. `"DNA"`) together with the group's mass.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BiomassGroup {
    /// Group label, e.g. `"DNA"`.
    pub group_name: String,
    /// Mass of the group, expressed in `unit_group`.
    pub mass: f64,
    /// Measurement unit for `mass` (gapseq assumes `"g"`); empty when the
    /// JSON omits it.
    #[serde(default)]
    pub unit_group: String,
    /// How component coefficients combine, e.g. `"MOLFRACTION"`; empty when
    /// the JSON omits it.
    #[serde(default)]
    pub unit_components: String,
    /// Components belonging to this group; empty when the JSON omits it.
    #[serde(default)]
    pub components: Vec<BiomassComponent>,
}
111
/// Top-level object of a biomass JSON template file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BiomassTemplate {
    /// Template identifier, e.g. `"Gram_neg"`.
    pub id: String,
    /// Human-readable name of the biomass reaction.
    pub name: String,
    /// Free-text reference, stored under the JSON key `ref`. Defaults to
    /// empty when absent and is omitted on serialization when empty.
    #[serde(default, rename = "ref", skip_serializing_if = "String::is_empty")]
    pub reference: String,
    /// Value of the JSON key `energy_GAM`; defaults to `0.0` when absent.
    /// NOTE(review): presumably the growth-associated maintenance energy
    /// term — confirm against gapseq.
    #[serde(default, rename = "energy_GAM")]
    pub energy_gam: f64,
    /// Domain label, e.g. `"Bacteria"`; empty when the JSON omits it.
    #[serde(default)]
    pub domain: String,
    /// Metabolite groups making up the template; empty when the JSON omits it.
    #[serde(default)]
    pub met_groups: Vec<BiomassGroup>,
}
125
126impl BiomassTemplate {
127    pub fn load(path: impl AsRef<Path>) -> Result<Self, BiomassError> {
128        let path = path.as_ref();
129        let f = std::fs::File::open(path).map_err(|e| BiomassError::Io {
130            path: path.to_path_buf(),
131            source: e,
132        })?;
133        let r = std::io::BufReader::new(f);
134        let t: BiomassTemplate = serde_json::from_reader(r).map_err(|e| BiomassError::Json {
135            path: path.to_path_buf(),
136            source: e,
137        })?;
138        Ok(t)
139    }
140
141    /// Load returning `None` if the file doesn't exist. Useful when the
142    /// user's `dat/` root only ships a subset of biomass templates.
143    pub fn load_opt(path: impl AsRef<Path>) -> Result<Option<Self>, BiomassError> {
144        let path = path.as_ref();
145        if !path.exists() {
146            tracing::warn!(path = %path.display(), "biomass template not found; skipping");
147            return Ok(None);
148        }
149        Self::load(path).map(Some)
150    }
151
152    /// Return every component across all groups (convenience iterator).
153    pub fn iter_components(&self) -> impl Iterator<Item = (&BiomassGroup, &BiomassComponent)> {
154        self.met_groups
155            .iter()
156            .flat_map(|g| g.components.iter().map(move |c| (g, c)))
157    }
158}
159
#[cfg(test)]
mod tests {
    use super::*;

    /// Smallest template used by the tests: one group with two components,
    /// the first carrying a `link` and the second omitting it.
    const MINIMAL_JSON: &str = r#"
    { "id" : "Gram_neg",
      "name" : "Bacterial Gram-negative biomass reaction",
      "ref"  : "test",
      "energy_GAM" : 40,
      "domain" : "Bacteria",
      "met_groups" : [
        { "group_name" : "DNA",
          "mass" : 0.031,
          "unit_group" : "g",
          "unit_components" : "MOLFRACTION",
          "components" : [
            { "id":"cpd00115","name":"dATP","comp":"c","coef":0.246,"link":"cpd00012:-1" },
            { "id":"cpd00357","name":"dTTP","comp":"c","coef":0.246 }
          ]
        }
      ]
    }"#;

    /// A template written to disk loads back with its groups, components
    /// and decoded `link` intact; a component without `link` decodes to `None`.
    #[test]
    fn parses_minimal_biomass() {
        let dir = tempfile::tempdir().unwrap();
        let json_path = dir.path().join("bm.json");
        std::fs::write(&json_path, MINIMAL_JSON.as_bytes()).unwrap();

        let template = BiomassTemplate::load(&json_path).unwrap();
        assert_eq!(template.id, "Gram_neg");
        assert_eq!(template.met_groups.len(), 1);

        let components = &template.met_groups[0].components;
        assert_eq!(components.len(), 2);

        let (cpd, coef) = components[0].link().unwrap();
        assert_eq!(cpd, "cpd00012");
        assert_eq!(coef, -1.0);

        let unlinked = &components[1];
        assert!(unlinked.link.is_empty());
        assert!(unlinked.link().is_none());
    }

    /// `load_opt` reports a missing file as `Ok(None)` rather than an error.
    #[test]
    fn load_opt_missing_file() {
        let dir = tempfile::tempdir().unwrap();
        let missing = dir.path().join("missing.json");
        let loaded = BiomassTemplate::load_opt(missing).unwrap();
        assert!(loaded.is_none());
    }
}
206}