assembly_theory/
loader.rs

//! Parse molecule files in the `.mol` file format.
//!
//! # Example
//! ```
//! # use std::fs;
//! # use std::path::PathBuf;
//! # use assembly_theory::{loader, molecule::Molecule};
//! # fn main() -> Result<(), std::io::Error> {
//! # let path = PathBuf::from(format!("./data/checks/benzene.mol"));
//! // Read a molecule data file as a string of lines
//! let molfile = fs::read_to_string(path)?;
//!
//! let molecule = loader::parse_molfile_str(&molfile).expect("Cannot parse molfile");
//! # Ok(())
//! # }
//! ```
17use crate::molecule::{Atom, Bond, MGraph, Molecule};
18use clap::error::Result;
19use pyo3::exceptions::PyOSError;
20use pyo3::PyErr;
21use std::error::Error;
22use std::fmt::Display;
23
/// Error type returned by the molecule data file parsing functions.
///
/// Each variant names one specific failure. Where it applies, the variant also carries the line
/// number of the molecule data file at which the error occurred.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ParserError {
    /// The atom count on the counts line is not an integer value.
    AtomCountNotInt(usize),
    /// The bond count on the counts line is not an integer value.
    BondCountNotInt(usize),
    /// The molecule data file is not in the `V2000` version of the format.
    FileVersionIsNotV2000(usize),
    /// The symbol on an atom line is not one of the existing [`crate::molecule::Atom`] symbols.
    /// The second field holds the rejected symbol text.
    BadElementSymbol(usize, String),
    /// A bond number on a bond line cannot be parsed as an integer value.
    BadBondNumber(usize),
    /// The bond type on a bond line cannot be parsed as an integer value.
    BondTypeNotInt(usize),
    /// The bond type on a bond line is an integer, but not one of the existing
    /// [`crate::molecule::Bond`] types.
    BondTypeOutOfBounds(usize),
    /// Unknown error; if it ever occurs, it should be reported to the maintainers of the crate.
    ThisShouldNotHappen,
    /// The molecule data file does not contain all the lines needed to reconstruct the molecule.
    NotEnoughLines,
}
51
52impl Error for ParserError {}
53
54// Needed for Python library
55impl From<ParserError> for PyErr {
56    fn from(err: ParserError) -> PyErr {
57        PyOSError::new_err(err.to_string())
58    }
59}
60
61/// Parse a `.sdf` molecule data file and return a [`crate::molecule::Molecule`] object. `To be
62/// implemented`
63pub fn parse_sdfile_str(_input: &str) -> Result<Molecule, ParserError> {
64    todo!("SDfile parser unimplemented!")
65}
66
67/// Parse a string containing the contents of a `.mol` molecule data file and return a
68/// [`crate::molecule::Molecule`] object.
69///
70/// If the file string is malformed, a [`self::ParserError`] is thrown.
71///
72/// # Example
73/// ```
74/// # use std::fs;
75/// # use std::path::PathBuf;
76/// # use assembly_theory::{loader, molecule::Molecule};
77/// # fn main() -> Result<(), std::io::Error> {
78/// # let path = PathBuf::from(format!("./data/checks/benzene.mol"));
79/// // Read a molecule data file as a string of lines
80/// let molfile = fs::read_to_string(path)?;
81///
82/// let molecule = loader::parse_molfile_str(&molfile).expect("Cannot parse molfile.");
83/// # Ok(())
84/// # }
85/// ```
86pub fn parse_molfile_str(input: &str) -> Result<Molecule, ParserError> {
87    let mut lines = input.lines().enumerate().skip(3); // Skip the header block, 3 lines
88    let (ix, counts_line) = lines.next().ok_or(ParserError::NotEnoughLines)?;
89    let (n_atoms, n_bonds) = parse_counts_line(ix, counts_line)?;
90
91    let mut graph = MGraph::new_undirected();
92    let mut atom_indices = Vec::new();
93
94    lines
95        .by_ref()
96        .take(n_atoms)
97        .try_fold(&mut graph, |g, (i, l)| {
98            parse_atom_line(i, l).map(|atom| {
99                atom_indices.push(g.add_node(atom));
100                g
101            })
102        })?;
103
104    lines
105        .by_ref()
106        .take(n_bonds)
107        .try_fold(&mut graph, |g, (i, l)| {
108            parse_bond_line(i, l).map(|(first, second, bond)| {
109                g.add_edge(atom_indices[first - 1], atom_indices[second - 1], bond);
110                g
111            })
112        })?;
113
114    Ok(Molecule::from_graph(graph))
115}
116
117fn parse_counts_line(line_ix: usize, counts_line: &str) -> Result<(usize, usize), ParserError> {
118    let n_atoms = counts_line[0..3]
119        .trim()
120        .parse()
121        .map_err(|_| ParserError::AtomCountNotInt(line_ix))?;
122    let n_bonds = counts_line[3..6]
123        .trim()
124        .parse()
125        .map_err(|_| ParserError::BondCountNotInt(line_ix))?;
126    let version_number = counts_line[33..39].trim().to_uppercase();
127    if version_number != "V2000" {
128        Err(ParserError::FileVersionIsNotV2000(line_ix))
129    } else {
130        Ok((n_atoms, n_bonds))
131    }
132}
133
134fn parse_atom_line(line_ix: usize, atom_line: &str) -> Result<Atom, ParserError> {
135    let elem_str = atom_line[31..34].trim();
136    let element = elem_str
137        .parse()
138        .map_err(|_| ParserError::BadElementSymbol(line_ix, elem_str.to_owned()))?;
139    let capacity = atom_line[44..47].trim().parse::<u32>().unwrap_or(0);
140    Ok(Atom::new(element, capacity))
141}
142
143fn parse_bond_line(line_ix: usize, bond_line: &str) -> Result<(usize, usize, Bond), ParserError> {
144    let first_atom = bond_line[0..3]
145        .trim()
146        .parse()
147        .map_err(|_| ParserError::BadBondNumber(line_ix))?;
148    let second_atom = bond_line[3..6]
149        .trim()
150        .parse()
151        .map_err(|_| ParserError::BadBondNumber(line_ix))?;
152    let bond = bond_line[6..9]
153        .trim()
154        .parse::<usize>()
155        .map_err(|_| ParserError::BondTypeNotInt(line_ix))?
156        .try_into()
157        .map_err(|_| ParserError::BondTypeOutOfBounds(line_ix))?;
158    Ok((first_atom, second_atom, bond))
159}
160
161impl Display for ParserError {
162    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
163        match self {
164            Self::AtomCountNotInt(line) => {
165                write!(f, "Line {line}: Atom count is not an integer")
166            }
167            Self::BondCountNotInt(line) => {
168                write!(f, "Line {line}: Bond count is not an integer")
169            }
170            Self::FileVersionIsNotV2000(line) => {
171                write!(f, "Line {line}: File version is not V2000")
172            }
173            Self::BondTypeNotInt(line) => {
174                write!(f, "Line {line}: Bond type is not an integer")
175            }
176            Self::BondTypeOutOfBounds(line) => {
177                write!(f, "Line {line}: Bond type is not 1, 2, or 3")
178            }
179            Self::BadElementSymbol(line, sym) => {
180                write!(f, "Line {line}: Bad element symbol {sym}")
181            }
182            Self::BadBondNumber(line) => {
183                write!(f, "Line {line}: Bad bond number")
184            }
185            Self::NotEnoughLines => {
186                write!(f, "File does not have enough lines")
187            }
188            Self::ThisShouldNotHappen => {
189                write!(f, "This should not happen, report it as a bug")
190            }
191        }
192    }
193}