1use crate::common::{csv_err, io_err, DbError};
13use crate::stoich_parse::{parse_stoichiometry, StoichTerm};
14use gapsmith_core::{CpdId, Reversibility, RxnId, SeedStatus};
15use serde::{Deserialize, Serialize};
16use std::path::Path;
17
18#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct SeedRxnRow {
21 pub id: RxnId,
22 pub abbreviation: String,
23 pub name: String,
24 #[serde(default, skip_serializing_if = "String::is_empty")]
25 pub code: String,
26 pub stoichiometry: String,
28 #[serde(default)]
29 pub is_transport: u8,
30 #[serde(default, skip_serializing_if = "String::is_empty")]
31 pub equation: String,
32 #[serde(default, skip_serializing_if = "String::is_empty")]
33 pub definition: String,
34 #[serde(default, skip_serializing_if = "String::is_empty")]
36 pub reversibility: String,
37 #[serde(default, skip_serializing_if = "String::is_empty")]
39 pub direction: String,
40 #[serde(default, skip_serializing_if = "String::is_empty")]
41 pub abstract_reaction: String,
42 #[serde(default, skip_serializing_if = "String::is_empty")]
43 pub pathways: String,
44 #[serde(default, skip_serializing_if = "String::is_empty")]
45 pub aliases: String,
46 #[serde(default, skip_serializing_if = "String::is_empty")]
47 pub ec_numbers: String,
48 #[serde(default, skip_serializing_if = "String::is_empty")]
49 pub deltag: String,
50 #[serde(default, skip_serializing_if = "String::is_empty")]
51 pub deltagerr: String,
52 #[serde(default, skip_serializing_if = "String::is_empty")]
53 pub compound_ids: String,
54 #[serde(default, skip_serializing_if = "String::is_empty")]
55 pub status: String,
56 #[serde(default)]
57 pub is_obsolete: u8,
58 #[serde(default, skip_serializing_if = "String::is_empty")]
59 pub linked_reaction: String,
60 #[serde(default, skip_serializing_if = "String::is_empty")]
61 pub notes: String,
62 #[serde(default, skip_serializing_if = "String::is_empty")]
63 pub is_copy_of: String,
64 pub gapseq_status: SeedStatus,
66
67 #[serde(skip, default)]
72 pub stoich_hash: Option<String>,
73}
74
75impl SeedRxnRow {
76 pub fn parse_stoich(&self) -> Result<Vec<StoichTerm>, crate::stoich_parse::StoichParseError> {
77 parse_stoichiometry(&self.stoichiometry)
78 }
79
80 pub fn reversibility(&self) -> Option<Reversibility> {
81 self.reversibility.chars().next().and_then(Reversibility::from_code)
82 }
83
84 pub fn ec_list(&self) -> Vec<&str> {
86 self.ec_numbers
87 .split(',')
88 .map(str::trim)
89 .filter(|s| !s.is_empty())
90 .collect()
91 }
92
93 pub fn pathway_list(&self) -> Vec<&str> {
95 self.pathways
96 .split(',')
97 .map(str::trim)
98 .filter(|s| !s.is_empty())
99 .collect()
100 }
101}
102
103#[derive(Debug, Clone, Serialize, Deserialize)]
105pub struct SeedCpdRow {
106 pub id: CpdId,
107 #[serde(rename = "MNX_ID", default, skip_serializing_if = "String::is_empty")]
108 pub mnx_id: String,
109 #[serde(default, skip_serializing_if = "String::is_empty")]
110 pub abbreviation: String,
111 pub name: String,
112 #[serde(default, skip_serializing_if = "String::is_empty")]
113 pub formula: String,
114 #[serde(default, skip_serializing_if = "String::is_empty")]
115 pub mass: String,
116 #[serde(default, skip_serializing_if = "String::is_empty")]
117 pub source: String,
118 #[serde(default)]
119 pub charge: i32,
120 #[serde(default)]
121 pub is_core: u8,
122 #[serde(default)]
123 pub is_obsolete: u8,
124 #[serde(default, skip_serializing_if = "String::is_empty")]
125 pub linked_compound: String,
126 #[serde(default)]
127 pub is_cofactor: u8,
128 #[serde(default, skip_serializing_if = "String::is_empty")]
129 pub deltag: String,
130 #[serde(default, skip_serializing_if = "String::is_empty")]
131 pub deltagerr: String,
132 #[serde(default, skip_serializing_if = "String::is_empty")]
133 pub pka: String,
134 #[serde(default, skip_serializing_if = "String::is_empty")]
135 pub pkb: String,
136 #[serde(default, skip_serializing_if = "String::is_empty")]
137 pub abstract_compound: String,
138 #[serde(default, skip_serializing_if = "String::is_empty")]
139 pub comprised_of: String,
140 #[serde(default, skip_serializing_if = "String::is_empty")]
141 pub aliases: String,
142 #[serde(default, skip_serializing_if = "String::is_empty")]
143 pub smiles: String,
144 #[serde(rename = "InChIKey", default, skip_serializing_if = "String::is_empty")]
145 pub inchikey: String,
146 #[serde(rename = "hmdbID", default, skip_serializing_if = "String::is_empty")]
147 pub hmdb_id: String,
148 #[serde(rename = "reactomeID", default, skip_serializing_if = "String::is_empty")]
149 pub reactome_id: String,
150 #[serde(rename = "chebiID", default, skip_serializing_if = "String::is_empty")]
151 pub chebi_id: String,
152 #[serde(rename = "InChI", default, skip_serializing_if = "String::is_empty")]
153 pub inchi: String,
154 #[serde(rename = "keggID", default, skip_serializing_if = "String::is_empty")]
155 pub kegg_id: String,
156 #[serde(rename = "biggID", default, skip_serializing_if = "String::is_empty")]
157 pub bigg_id: String,
158 #[serde(rename = "biocycID", default, skip_serializing_if = "String::is_empty")]
159 pub biocyc_id: String,
160}
161
162pub fn load_seed_reactions(path: impl AsRef<Path>) -> Result<Vec<SeedRxnRow>, DbError> {
168 let path = path.as_ref();
169 let mut rdr = csv::ReaderBuilder::new()
170 .delimiter(b'\t')
171 .has_headers(true)
172 .quoting(false) .flexible(true)
174 .from_path(path)
175 .map_err(|e| csv_err(path, e))?;
176
177 let headers = rdr.headers().map_err(|e| csv_err(path, e))?.clone();
182 let col = |name: &str| -> Option<usize> {
183 headers.iter().position(|h| h.trim() == name)
184 };
185 let c_id = col("id").ok_or_else(|| DbError::Parse {
186 path: path.to_path_buf(),
187 line: 1,
188 msg: "missing `id` column".into(),
189 })?;
190 let c_abbr = col("abbreviation").unwrap_or(usize::MAX);
191 let c_name = col("name").ok_or_else(|| DbError::Parse {
192 path: path.to_path_buf(),
193 line: 1,
194 msg: "missing `name` column".into(),
195 })?;
196 let c_code = col("code").unwrap_or(usize::MAX);
197 let c_stoich = col("stoichiometry").ok_or_else(|| DbError::Parse {
198 path: path.to_path_buf(),
199 line: 1,
200 msg: "missing `stoichiometry` column".into(),
201 })?;
202 let c_is_transport = col("is_transport").unwrap_or(usize::MAX);
203 let c_equation = col("equation").unwrap_or(usize::MAX);
204 let c_definition = col("definition").unwrap_or(usize::MAX);
205 let c_rev = col("reversibility").unwrap_or(usize::MAX);
206 let c_dir = col("direction").unwrap_or(usize::MAX);
207 let c_abs = col("abstract_reaction").unwrap_or(usize::MAX);
208 let c_pwy = col("pathways").unwrap_or(usize::MAX);
209 let c_ali = col("aliases").unwrap_or(usize::MAX);
210 let c_ec = col("ec_numbers").unwrap_or(usize::MAX);
211 let c_deltag = col("deltag").unwrap_or(usize::MAX);
212 let c_deltagerr = col("deltagerr").unwrap_or(usize::MAX);
213 let c_cpd_ids = col("compound_ids").unwrap_or(usize::MAX);
214 let c_status = col("status").unwrap_or(usize::MAX);
215 let c_is_obs = col("is_obsolete").unwrap_or(usize::MAX);
216 let c_linked = col("linked_reaction").unwrap_or(usize::MAX);
217 let c_notes = col("notes").unwrap_or(usize::MAX);
218 let c_copy = col("is_copy_of").unwrap_or(usize::MAX);
219 let c_gapseq = col("gapseq.status").ok_or_else(|| DbError::Parse {
220 path: path.to_path_buf(),
221 line: 1,
222 msg: "missing `gapseq.status` column".into(),
223 })?;
224
225 let mut out = Vec::with_capacity(35_000);
226 for (i, rec) in rdr.records().enumerate() {
227 let rec = rec.map_err(|e| csv_err(path, e))?;
228 let get = |c: usize| -> String {
229 if c == usize::MAX { String::new() } else { rec.get(c).unwrap_or("").to_string() }
230 };
231 let get_u8 = |c: usize| -> u8 { get(c).trim().parse().unwrap_or(0) };
232 let gapseq_status = match get(c_gapseq).as_str() {
233 "approved" => SeedStatus::Approved,
234 "corrected" => SeedStatus::Corrected,
235 "not.assessed" | "not_assessed" => SeedStatus::NotAssessed,
236 "removed" => SeedStatus::Removed,
237 _ => SeedStatus::None,
238 };
239 out.push(SeedRxnRow {
240 id: RxnId::new(get(c_id).trim()),
241 abbreviation: get(c_abbr),
242 name: get(c_name),
243 code: get(c_code),
244 stoichiometry: get(c_stoich),
245 is_transport: get_u8(c_is_transport),
246 equation: get(c_equation),
247 definition: get(c_definition),
248 reversibility: get(c_rev),
249 direction: get(c_dir),
250 abstract_reaction: get(c_abs),
251 pathways: get(c_pwy),
252 aliases: get(c_ali),
253 ec_numbers: get(c_ec),
254 deltag: get(c_deltag),
255 deltagerr: get(c_deltagerr),
256 compound_ids: get(c_cpd_ids),
257 status: get(c_status),
258 is_obsolete: get_u8(c_is_obs),
259 linked_reaction: get(c_linked),
260 notes: get(c_notes),
261 is_copy_of: get(c_copy),
262 gapseq_status,
263 stoich_hash: None,
264 });
265 let _ = i;
266 }
267 for row in &mut out {
271 if !row.stoichiometry.is_empty() {
272 row.stoich_hash =
273 crate::stoich_hash::rxn_stoich_hash(&row.stoichiometry, &row.reversibility).ok();
274 }
275 }
276 tracing::info!(path = %path.display(), rows = out.len(), "loaded SEED reactions");
277 Ok(out)
278}
279
280pub fn load_seed_metabolites(path: impl AsRef<Path>) -> Result<Vec<SeedCpdRow>, DbError> {
282 let path = path.as_ref();
283 let f = std::fs::File::open(path).map_err(|e| io_err(path, e))?;
284 let mut rdr = csv::ReaderBuilder::new()
285 .delimiter(b'\t')
286 .has_headers(true)
287 .quoting(false)
288 .flexible(true)
289 .from_reader(f);
290
291 let mut out = Vec::with_capacity(28_000);
292 for rec in rdr.deserialize::<SeedCpdRow>() {
293 match rec {
294 Ok(r) => out.push(r),
295 Err(e) => return Err(csv_err(path, e)),
296 }
297 }
298 tracing::info!(path = %path.display(), rows = out.len(), "loaded SEED metabolites");
299 Ok(out)
300}
301
302#[cfg(test)]
303mod tests {
304 use super::*;
305 use std::io::Write;
306
307 #[test]
308 fn parses_minimal_reactions_tsv() {
309 let dir = tempfile::tempdir().unwrap();
310 let path = dir.path().join("r.tsv");
311 let mut f = std::fs::File::create(&path).unwrap();
312 writeln!(
313 f,
314 "id\tabbreviation\tname\tcode\tstoichiometry\tis_transport\tequation\tdefinition\treversibility\tdirection\tabstract_reaction\tpathways\taliases\tec_numbers\tdeltag\tdeltagerr\tcompound_ids\tstatus\tis_obsolete\tlinked_reaction\tnotes\tis_copy_of\tgapseq.status"
315 )
316 .unwrap();
317 writeln!(
318 f,
319 "rxn00001\tR00004\tsome name\t\t-1:cpd00001:0:0:\"H2O\";1:cpd00002:0:0:\"X\"\t0\t\t\t>\t=\t\t\t\t3.5.1.2\t\t\t\tOK\t0\t\t\t\tapproved"
320 )
321 .unwrap();
322 writeln!(
323 f,
324 "rxn00002\tR00005\tother\t\t-1:cpd00002:0:0:\"X\"\t1\t\t\t=\t=\t\t\t\t\t\t\t\tOK\t0\t\t\t\tremoved"
325 )
326 .unwrap();
327
328 let rows = load_seed_reactions(&path).unwrap();
329 assert_eq!(rows.len(), 2);
330 assert_eq!(rows[0].id.as_str(), "rxn00001");
331 assert_eq!(rows[0].gapseq_status, SeedStatus::Approved);
332 assert_eq!(rows[0].reversibility(), Some(Reversibility::Forward));
333 assert_eq!(rows[0].ec_list(), vec!["3.5.1.2"]);
334 let stoich = rows[0].parse_stoich().unwrap();
335 assert_eq!(stoich.len(), 2);
336 assert_eq!(stoich[0].cpd.as_str(), "cpd00001");
337
338 assert_eq!(rows[1].gapseq_status, SeedStatus::Removed);
339 assert_eq!(rows[1].reversibility(), Some(Reversibility::Reversible));
340 }
341
342 #[test]
343 fn parses_minimal_metabolites_tsv() {
344 let dir = tempfile::tempdir().unwrap();
345 let path = dir.path().join("m.tsv");
346 let mut f = std::fs::File::create(&path).unwrap();
347 writeln!(
348 f,
349 "id\tMNX_ID\tabbreviation\tname\tformula\tmass\tsource\tcharge\tis_core\tis_obsolete\tlinked_compound\tis_cofactor\tdeltag\tdeltagerr\tpka\tpkb\tabstract_compound\tcomprised_of\taliases\tsmiles\tInChIKey\thmdbID\treactomeID\tchebiID\tInChI\tkeggID\tbiggID\tbiocycID"
350 )
351 .unwrap();
352 writeln!(
353 f,
354 "cpd00001\tMNXM2\th2o\tH2O\tH2O\t18\tModelSEED\t0\t1\t0\tnull\t0\t-56.687\t0.5\t1:15.7\t1:-1.8\tnull\tnull\tnull\tO\tXLYOFNOQVPJJNP-UHFFFAOYSA-N\t\t\t\t\tC00001\th2o\tWATER"
355 )
356 .unwrap();
357
358 let rows = load_seed_metabolites(&path).unwrap();
359 assert_eq!(rows.len(), 1);
360 assert_eq!(rows[0].id.as_str(), "cpd00001");
361 assert_eq!(rows[0].mnx_id, "MNXM2");
362 assert_eq!(rows[0].name, "H2O");
363 assert_eq!(rows[0].charge, 0);
364 }
365}