assembly_theory/
loader.rs

1//! Parse molecules in the `.mol` file format.
2//!
3//! # Example
4//! ```
5//! # use std::{fs, path::PathBuf};
6//! use assembly_theory::loader::parse_molfile_str;
7//!
8//! # fn main() -> Result<(), std::io::Error> {
9//! let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
10//! let molfile = fs::read_to_string(path)?;
11//! let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
12//! # Ok(())
13//! # }
14//! ```
15
16use std::{error::Error, fmt::Display};
17
18use clap::error::Result;
19
20use crate::molecule::{Atom, Bond, Element::Hydrogen, MGraph, Molecule};
21
/// Errors returned by [`parse_molfile_str`] when `.mol` contents are malformed.
///
/// Variants with a `usize` payload carry the index of the offending line. The
/// parser obtains that index by enumerating the input's lines, so it is
/// zero-based.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ParserError {
    /// The atom count in the counts line is not an integer value.
    AtomCountNotInt(usize),
    /// The bond count in the counts line is not an integer value.
    BondCountNotInt(usize),
    /// The `.mol` file version in the counts line is not `V2000`.
    FileVersionIsNotV2000(usize),
    /// The element symbol in an atom line is not one of those recognized by
    /// [`Atom`]; the second field holds the symbol text that was rejected.
    BadElementSymbol(usize, String),
    /// An atom number in a bond line is not an integer value.
    BondNumberNotInt(usize),
    /// The bond type in a bond line is not an integer value.
    BondTypeNotInt(usize),
    /// The bond type in a bond line is an integer, but not one of those
    /// recognized by [`Bond`].
    BadBondType(usize),
    /// The `.mol` file has too few lines to reconstruct the molecule.
    NotEnoughLines,
    /// An unknown error that should be reported to the crate maintainers.
    ThisShouldNotHappen,
}
45
46impl Error for ParserError {}
47
48impl Display for ParserError {
49    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
50        match self {
51            Self::AtomCountNotInt(line) => {
52                write!(f, "Line {line}: Atom count is not an integer")
53            }
54            Self::BondCountNotInt(line) => {
55                write!(f, "Line {line}: Bond count is not an integer")
56            }
57            Self::FileVersionIsNotV2000(line) => {
58                write!(f, "Line {line}: File version is not V2000")
59            }
60            Self::BadElementSymbol(line, sym) => {
61                write!(f, "Line {line}: Bad element symbol '{sym}'")
62            }
63            Self::BondNumberNotInt(line) => {
64                write!(f, "Line {line}: Bond number is not an integer")
65            }
66            Self::BondTypeNotInt(line) => {
67                write!(f, "Line {line}: Bond type is not an integer")
68            }
69            Self::BadBondType(line) => {
70                write!(f, "Line {line}: Bond type is not 1, 2, or 3")
71            }
72            Self::NotEnoughLines => {
73                write!(f, "File does not have enough lines")
74            }
75            Self::ThisShouldNotHappen => {
76                write!(f, "This should not happen, report it as a bug")
77            }
78        }
79    }
80}
81
82/// Parse the contents of a `.mol` file as a [`Molecule`].
83///
84/// If the `.mol` file contents are malformed, a [`ParserError`] is thrown.
85///
86/// # Example
87/// ```
88/// # use std::{fs, path::PathBuf};
89/// use assembly_theory::loader::parse_molfile_str;
90///
91/// # fn main() -> Result<(), std::io::Error> {
92/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
93/// let molfile = fs::read_to_string(path)?;
94/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
95/// # Ok(())
96/// # }
97/// ```
98pub fn parse_molfile_str(input: &str) -> Result<Molecule, ParserError> {
99    let mut lines = input.lines().enumerate().skip(3); // Skip header block
100    let (ix, counts_line) = lines.next().ok_or(ParserError::NotEnoughLines)?;
101    let (n_atoms, n_bonds) = parse_counts_line(ix, counts_line)?;
102
103    let mut graph = MGraph::new_undirected();
104    let mut atom_indices = Vec::with_capacity(n_atoms); // original atom index -> Option<NodeIndex>
105
106    // Atom parsing with hydrogen exclusion
107    lines.by_ref().take(n_atoms).try_for_each(|(i, line)| {
108        let atom = parse_atom_line(i, line)?;
109        if atom.element() == Hydrogen {
110            atom_indices.push(None); // skip H
111        } else {
112            let idx = graph.add_node(atom);
113            atom_indices.push(Some(idx));
114        }
115        Ok(())
116    })?;
117
118    // Bond parsing with skipped H handling
119    lines.by_ref().take(n_bonds).try_for_each(|(i, line)| {
120        let (first, second, bond) = parse_bond_line(i, line)?;
121        let a = atom_indices.get(first - 1).copied().flatten();
122        let b = atom_indices.get(second - 1).copied().flatten();
123        if let (Some(ai), Some(bi)) = (a, b) {
124            graph.add_edge(ai, bi, bond);
125        }
126        Ok(())
127    })?;
128
129    Ok(Molecule::from_graph(graph))
130}
131
132fn parse_counts_line(line_ix: usize, counts_line: &str) -> Result<(usize, usize), ParserError> {
133    let n_atoms = counts_line[0..3]
134        .trim()
135        .parse()
136        .map_err(|_| ParserError::AtomCountNotInt(line_ix))?;
137    let n_bonds = counts_line[3..6]
138        .trim()
139        .parse()
140        .map_err(|_| ParserError::BondCountNotInt(line_ix))?;
141    let version_number = counts_line[33..39].trim().to_uppercase();
142    if version_number != "V2000" {
143        Err(ParserError::FileVersionIsNotV2000(line_ix))
144    } else {
145        Ok((n_atoms, n_bonds))
146    }
147}
148
149fn parse_atom_line(line_ix: usize, atom_line: &str) -> Result<Atom, ParserError> {
150    let elem_str = atom_line[31..34].trim();
151    let element = elem_str
152        .parse()
153        .map_err(|_| ParserError::BadElementSymbol(line_ix, elem_str.to_owned()))?;
154    let capacity = atom_line[44..47].trim().parse::<u32>().unwrap_or(0);
155    Ok(Atom::new(element, capacity))
156}
157
158fn parse_bond_line(line_ix: usize, bond_line: &str) -> Result<(usize, usize, Bond), ParserError> {
159    let first_atom = bond_line[0..3]
160        .trim()
161        .parse()
162        .map_err(|_| ParserError::BondNumberNotInt(line_ix))?;
163    let second_atom = bond_line[3..6]
164        .trim()
165        .parse()
166        .map_err(|_| ParserError::BondNumberNotInt(line_ix))?;
167    let bond = bond_line[6..9]
168        .trim()
169        .parse::<usize>()
170        .map_err(|_| ParserError::BondTypeNotInt(line_ix))?
171        .try_into()
172        .map_err(|_| ParserError::BadBondType(line_ix))?;
173    Ok((first_atom, second_atom, bond))
174}