Skip to main content

gapsmith_db/
medium_rules.rs

1//! `dat/medium_prediction_rules.tsv` loader.
2//!
3//! Columns: `Nutrient, cpd.id, rule, maxFlux, proton.balance, Comment, Category`.
4//! The `rule` column is a Boolean expression compiled at medium-prediction
5//! time (see `src/predict_medium.R:46–86`). For now we keep it verbatim —
6//! parsing happens later in the `gapsmith-medium` crate.
7
8use crate::common::{csv_err, io_err, DbError};
9use serde::{Deserialize, Serialize};
10use std::path::Path;
11
/// One data row of `dat/medium_prediction_rules.tsv`, as built by [`load`].
///
/// When built by [`load`], every text field holds the cell content after lossy
/// UTF-8 decoding and trimming of surrounding whitespace.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MediumRule {
    /// Content of the `Nutrient` column.
    pub nutrient: String,
    /// Content of the `cpd.id` column.
    pub cpd_id: String,
    /// Content of the `rule` column, stored as text; it is not parsed here.
    pub rule: String,
    /// Parsed `maxFlux` value. `None` for rows that ship `NA` (e.g. "do not
    /// supply O2 under anaerobic"); `load` also maps blank, `na` and `N/A`
    /// cells to `None`.
    pub max_flux: Option<f64>,
    /// Content of the `proton.balance` column; `load` leaves it empty when the
    /// column is absent from the header.
    pub proton_balance: String,
    /// Content of the `Comment` column. Defaults to empty when deserializing
    /// and is skipped during serialization when empty.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub comment: String,
    /// Content of the `Category` column. Defaults to empty when deserializing
    /// and is skipped during serialization when empty.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub category: String,
}
25
26pub fn load(path: impl AsRef<Path>) -> Result<Vec<MediumRule>, DbError> {
27    let path = path.as_ref();
28    let f = std::fs::File::open(path).map_err(|e| io_err(path, e))?;
29    let mut rdr = csv::ReaderBuilder::new()
30        .delimiter(b'\t')
31        .has_headers(true)
32        .quoting(false)
33        .flexible(true)
34        .from_reader(f);
35
36    // `medium_prediction_rules.tsv` in the reference data is Latin-1, not
37    // UTF-8 (old source material with `“ ”` smart quotes). We therefore read
38    // `byte_records()` and convert each field via `String::from_utf8_lossy`.
39    let headers_raw = rdr.byte_headers().map_err(|e| csv_err(path, e))?.clone();
40    let headers: Vec<String> = headers_raw
41        .iter()
42        .map(|h| String::from_utf8_lossy(h).trim().to_string())
43        .collect();
44    let col = |name: &str| -> Option<usize> {
45        headers.iter().position(|h| h == name)
46    };
47    let c_nut = col("Nutrient").ok_or_else(|| DbError::Parse {
48        path: path.to_path_buf(),
49        line: 1,
50        msg: "missing `Nutrient` column".into(),
51    })?;
52    let c_cpd = col("cpd.id").ok_or_else(|| DbError::Parse {
53        path: path.to_path_buf(),
54        line: 1,
55        msg: "missing `cpd.id` column".into(),
56    })?;
57    let c_rule = col("rule").ok_or_else(|| DbError::Parse {
58        path: path.to_path_buf(),
59        line: 1,
60        msg: "missing `rule` column".into(),
61    })?;
62    let c_flux = col("maxFlux").ok_or_else(|| DbError::Parse {
63        path: path.to_path_buf(),
64        line: 1,
65        msg: "missing `maxFlux` column".into(),
66    })?;
67    let c_proton = col("proton.balance").unwrap_or(usize::MAX);
68    let c_comment = col("Comment").unwrap_or(usize::MAX);
69    let c_cat = col("Category").unwrap_or(usize::MAX);
70
71    let mut out = Vec::new();
72    for rec in rdr.byte_records() {
73        let rec = rec.map_err(|e| csv_err(path, e))?;
74        let get = |c: usize| -> String {
75            if c == usize::MAX {
76                String::new()
77            } else {
78                rec.get(c)
79                    .map(|b| String::from_utf8_lossy(b).trim().to_string())
80                    .unwrap_or_default()
81            }
82        };
83        let nutrient = get(c_nut);
84        let cpd_id = get(c_cpd);
85        let rule = get(c_rule);
86        // Blank separator rows have every field empty — skip them silently.
87        if nutrient.is_empty() && cpd_id.is_empty() && rule.is_empty() {
88            continue;
89        }
90        let flux_raw = get(c_flux);
91        let max_flux = match flux_raw.as_str() {
92            "" | "NA" | "na" | "N/A" => None,
93            other => Some(other.parse::<f64>().map_err(|_| DbError::Parse {
94                path: path.to_path_buf(),
95                line: rec.position().map(|p| p.line()).unwrap_or(0),
96                msg: format!("maxFlux `{other}` is not a number"),
97            })?),
98        };
99        out.push(MediumRule {
100            nutrient,
101            cpd_id,
102            rule,
103            max_flux,
104            proton_balance: get(c_proton),
105            comment: get(c_comment),
106            category: get(c_cat),
107        });
108    }
109    tracing::info!(path = %path.display(), rows = out.len(), "loaded medium rules");
110    Ok(out)
111}
112
#[cfg(test)]
mod tests {
    use super::*;

    /// Writes a small rule table (header, two rules, a blank separator row and
    /// one more rule) and checks that `load` returns the three real rows with
    /// the expected compound ids and flux limits.
    #[test]
    fn parses_rules() {
        let dir = tempfile::tempdir().unwrap();
        let tsv_path = dir.path().join("r.tsv");

        let lines = [
            "Nutrient\tcpd.id\trule\tmaxFlux\tproton.balance\tComment\tCategory",
            "Water\tcpd00001\tTRUE\t100\tFALSE\tCore medium compound\tInorganics",
            "O2\tcpd00007\tpwy1\tNA\tFALSE\tno O2\tInorganics",
            "\t\t\t\t\t\t", // blank separator
            "Glc\tcpd00027\trxn1\t5\tTRUE\t\tSaccharides",
        ];
        // Every line, including the last, ends with a newline.
        let mut contents = lines.join("\n");
        contents.push('\n');
        std::fs::write(&tsv_path, contents).unwrap();

        let rows = load(&tsv_path).unwrap();
        assert_eq!(rows.len(), 3);

        let (water, oxygen, glucose) = (&rows[0], &rows[1], &rows[2]);
        assert_eq!(water.cpd_id, "cpd00001");
        assert_eq!(water.max_flux, Some(100.0));
        assert_eq!(oxygen.max_flux, None);
        assert_eq!(glucose.cpd_id, "cpd00027");
    }
}