Skip to main content

cosmolkit_core/io/
xyz.rs

1//! RDKit-aligned XYZ reader.
2//!
3//! XYZ files contain atom identities and Cartesian coordinates only. This
4//! reader intentionally does not infer connectivity or bond orders.
5
6use std::num::ParseFloatError;
7
8use crate::{AtomSpec, Element, Molecule, MoleculeBuildError, MoleculeBuilder};
9
10#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
11pub enum XyzReadError {
12    #[error("empty XYZ block")]
13    EmptyBlock,
14    #[error(
15        "unable to recognize the number of atoms: cannot convert '{value}' to unsigned int on line 0"
16    )]
17    AtomCount { value: String },
18    #[error("EOF hit while reading atoms")]
19    UnexpectedEof,
20    #[error("missing coordinates on line {line}")]
21    MissingCoordinates { line: usize },
22    #[error("cannot convert '{value}' to double on line {line}")]
23    Coordinate {
24        value: String,
25        line: usize,
26        #[source]
27        source: ParseFloatError,
28    },
29    #[error("{message}")]
30    AtomSymbol { message: String },
31    #[error(transparent)]
32    Build(#[from] MoleculeBuildError),
33}
34
35fn parse_float_error() -> ParseFloatError {
36    "x".parse::<f64>().unwrap_err()
37}
38
39fn rdkit_to_unsigned(value: &str) -> Result<usize, XyzReadError> {
40    // BEGIN RDKIT CPP FUNCTION FileParserUtils::toUnsigned(input, true)
41    // RDKit✔️✔️: const char *txt = input.data();
42    // RDKit✔️✔️: for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) {
43    // RDKit✔️✔️:   if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') ||
44    // RDKit✔️✔️:       *txt == '+') {
45    // RDKit✔️✔️:     ++txt;
46    // RDKit✔️✔️:   } else {
47    // RDKit✔️✔️:     throw boost::bad_lexical_cast();
48    // RDKit✔️✔️:   }
49    // RDKit✔️✔️: }
50    let checked = value.split_once('\0').map_or(value, |(prefix, _)| prefix);
51    if !checked
52        .bytes()
53        .all(|byte| byte.is_ascii_digit() || byte == b' ' || byte == b'+')
54    {
55        return Err(XyzReadError::AtomCount {
56            value: value.to_string(),
57        });
58    }
59
60    // RDKit✔️✔️: txt = input.data();
61    // RDKit✔️✔️: unsigned int sz = input.size();
62    // RDKit✔️✔️: if (acceptSpaces) {
63    // RDKit✔️✔️:   while (*txt == ' ') {
64    // RDKit✔️✔️:     ++txt;
65    // RDKit✔️✔️:     --sz;
66    // RDKit✔️✔️:     if (sz < 1U) {
67    // RDKit✔️✔️:       return 0;
68    // RDKit✔️✔️:     }
69    // RDKit✔️✔️:   }
70    // RDKit✔️✔️: }
71    let trimmed_start = checked.trim_start_matches(' ');
72    if checked.len() != 0 && trimmed_start.is_empty() {
73        return Ok(0);
74    }
75
76    // RDKit✔️✔️: unsigned int res = 0;
77    // RDKit✔️✔️: std::from_chars(txt, txt + sz, res);
78    let digit_prefix = trimmed_start.bytes().take_while(u8::is_ascii_digit).count();
79    if digit_prefix == 0 {
80        return Ok(0);
81    }
82    trimmed_start[..digit_prefix]
83        .parse::<usize>()
84        .map_err(|_| XyzReadError::AtomCount {
85            value: value.to_string(),
86        })
87    // END RDKIT CPP FUNCTION
88}
89
90fn rdkit_to_double_no_spaces(value: &str, line: usize) -> Result<f64, XyzReadError> {
91    // BEGIN RDKIT CPP FUNCTION FileParserUtils::toDouble(input, false)
92    // RDKit✔️✔️: const char *txt = input.data();
93    // RDKit✔️✔️: for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) {
94    // RDKit✔️✔️:   if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') ||
95    // RDKit✔️✔️:       *txt == '+' || *txt == '-' || *txt == ',' || *txt == '.') {
96    // RDKit✔️✔️:     ++txt;
97    // RDKit✔️✔️:   } else {
98    // RDKit✔️✔️:     throw boost::bad_lexical_cast();
99    // RDKit✔️✔️:   }
100    // RDKit✔️✔️: }
101    let checked = value.split_once('\0').map_or(value, |(prefix, _)| prefix);
102    if !checked.bytes().all(|byte| {
103        byte.is_ascii_digit() || byte == b'+' || byte == b'-' || byte == b',' || byte == b'.'
104    }) {
105        return Err(XyzReadError::Coordinate {
106            value: value.to_string(),
107            line,
108            source: value.parse::<f64>().err().unwrap_or_else(parse_float_error),
109        });
110    }
111
112    // RDKit✔️✔️: double res = atof(input.data());
113    let numeric_prefix = checked
114        .split_once(',')
115        .map_or(checked, |(prefix, _)| prefix);
116    numeric_prefix
117        .parse::<f64>()
118        .map_err(|source| XyzReadError::Coordinate {
119            value: value.to_string(),
120            line,
121            source,
122        })
123    // END RDKIT CPP FUNCTION
124}
125
126#[must_use]
127fn normalize_xyz_symbol(raw: &str) -> String {
128    let mut chars = raw.chars().collect::<Vec<_>>();
129    if chars.len() == 2 && chars[1].is_ascii_uppercase() {
130        chars[1] = chars[1].to_ascii_lowercase();
131    }
132    chars.into_iter().collect()
133}
134
135fn element_from_rdkit_symbol(symbol: &str) -> Result<Element, XyzReadError> {
136    for atomic_number in 0..=118u8 {
137        if crate::chemistry::valence::rdkit_element_symbol(atomic_number).ok() == Some(symbol) {
138            return Element::from_atomic_number(atomic_number).ok_or_else(|| {
139                XyzReadError::AtomSymbol {
140                    message: format!("Bad atomic number for element symbol {symbol}"),
141                }
142            });
143        }
144    }
145    Err(XyzReadError::AtomSymbol {
146        message: format!("Element '{symbol}' not found"),
147    })
148}
149
150fn parse_xyz_atom_line(atom_line: &str, line: usize) -> Result<(Element, [f64; 3]), XyzReadError> {
151    // BEGIN RDKIT CPP FUNCTION ParseXYZFileAtomLine
152    // RDKit✔️✔️: std::string whitespace{" \t"};
153    // RDKit✔️✔️: size_t delims[8];
154    // RDKit✔️✔️: size_t prev = 0;
155    // RDKit✔️✔️: for (unsigned int i = 0; i < 7; i++) {
156    // RDKit✔️✔️:   if (i % 2 == 0) {
157    // RDKit✔️✔️:     delims[i] = atomLine.find_first_not_of(whitespace, prev);
158    // RDKit✔️✔️:   } else {
159    // RDKit✔️✔️:     delims[i] = atomLine.find_first_of(whitespace, prev);
160    // RDKit✔️✔️:   }
161    // RDKit✔️✔️:   if (delims[i] == std::string::npos) {
162    // RDKit✔️✔️:     std::ostringstream errout;
163    // RDKit✔️✔️:     errout << "Missing coordinates on line " << line << std::endl;
164    // RDKit✔️✔️:     throw FileParseException(errout.str());
165    // RDKit✔️✔️:   }
166    // RDKit✔️✔️:   prev = delims[i];
167    // RDKit✔️✔️: }
168    // RDKit✔️✔️: delims[7] = atomLine.find_last_not_of(whitespace) + 1;
169    let mut delims = [0usize; 8];
170    let mut prev = 0usize;
171    for (i, delim) in delims.iter_mut().take(7).enumerate() {
172        *delim = if i % 2 == 0 {
173            atom_line[prev..]
174                .find(|ch: char| ch != ' ' && ch != '\t')
175                .map(|idx| prev + idx)
176        } else {
177            atom_line[prev..].find([' ', '\t']).map(|idx| prev + idx)
178        }
179        .ok_or(XyzReadError::MissingCoordinates { line })?;
180        prev = *delim;
181    }
182    delims[7] = atom_line
183        .rfind(|ch: char| ch != ' ' && ch != '\t')
184        .map_or(0, |idx| idx + 1);
185
186    // RDKit✔️✔️: pos.x = FileParserUtils::toDouble(
187    // RDKit✔️✔️:     atomLine.substr(delims[2], delims[3] - delims[2]), false);
188    // RDKit✔️✔️: pos.y = FileParserUtils::toDouble(
189    // RDKit✔️✔️:     atomLine.substr(delims[4], delims[5] - delims[4]), false);
190    // RDKit✔️✔️: pos.z = FileParserUtils::toDouble(
191    // RDKit✔️✔️:     atomLine.substr(delims[6], delims[7] - delims[6]), false);
192    let pos = [
193        rdkit_to_double_no_spaces(&atom_line[delims[2]..delims[3]], line)?,
194        rdkit_to_double_no_spaces(&atom_line[delims[4]..delims[5]], line)?,
195        rdkit_to_double_no_spaces(&atom_line[delims[6]..delims[7]], line)?,
196    ];
197
198    // RDKit✔️✔️: std::string symb{atomLine.substr(delims[0], delims[1] - delims[0])};
199    // RDKit✔️✔️: if (symb.size() == 2 && symb[1] >= 'A' && symb[1] <= 'Z') {
200    // RDKit✔️✔️:   symb[1] = static_cast<char>(tolower(symb[1]));
201    // RDKit✔️✔️: }
202    // RDKit✔️✔️: atom = new Atom(PeriodicTable::getTable()->getAtomicNumber(symb));
203    let symbol = normalize_xyz_symbol(&atom_line[delims[0]..delims[1]]);
204    let element = element_from_rdkit_symbol(&symbol)?;
205    Ok((element, pos))
206    // END RDKIT CPP FUNCTION
207}
208
209fn parse_extra_line(extra_line: &str) -> Result<(), XyzReadError> {
210    // BEGIN RDKIT CPP FUNCTION ParseExtraLine
211    // RDKit✔️✔️: std::string whitespace{" \t"};
212    // RDKit✔️✔️: if (extraLine.find_first_not_of(whitespace) != std::string::npos) {
213    // RDKit✔️✔️:   std::ostringstream errout;
214    // RDKit✔️✔️:   errout << "More lines than expected" << std::endl;
215    // RDKit✔️✔️:   throw FileParseException(errout.str());
216    // RDKit✔️✔️: }
217    if extra_line.trim_matches([' ', '\t']).is_empty() {
218        Ok(())
219    } else {
220        Err(XyzReadError::AtomSymbol {
221            message: "More lines than expected".to_string(),
222        })
223    }
224    // END RDKIT CPP FUNCTION
225}
226
227pub fn read_xyz_from_str(block: &str) -> Result<Molecule, XyzReadError> {
228    // BEGIN RDKIT CPP FUNCTION MolFromXYZDataStream / MolFromXYZBlock
229    // RDKit✔️✔️: std::unique_ptr<RWMol> MolFromXYZBlock(const std::string &xyzBlock) {
230    // RDKit✔️✔️:   std::istringstream xyz(xyzBlock);
231    // RDKit✔️✔️:   xyz.peek();
232    // RDKit✔️✔️:   if (!xyz.eof()) {
233    // RDKit✔️✔️:     return MolFromXYZDataStream(xyz);
234    // RDKit✔️✔️:   } else {
235    // RDKit✔️✔️:     return nullptr;
236    // RDKit✔️✔️:   }
237    if block.is_empty() {
238        return Err(XyzReadError::EmptyBlock);
239    }
240
241    let mut lines = block.lines();
242    // RDKit✔️✔️: std::string num{getLine(inStream)};
243    // RDKit✔️✔️: numAtoms = FileParserUtils::toUnsigned(num);
244    let atom_count_line = lines.next().ok_or(XyzReadError::EmptyBlock)?;
245    let num_atoms = rdkit_to_unsigned(atom_count_line)?;
246
247    // RDKit✔️✔️: std::string comment{getLine(inStream)};
248    let comment = lines.next().unwrap_or_default();
249
250    // RDKit✔️✔️: auto mol = std::make_unique<RWMol>();
251    // RDKit✔️✔️: if (numAtoms) {
252    // RDKit✔️✔️:   Conformer *conf = new Conformer(numAtoms);
253    // RDKit✔️✔️:   if (!comment.empty()) {
254    // RDKit✔️✔️:     mol->setProp("_FileComments", comment);
255    // RDKit✔️✔️:   }
256    let mut builder =
257        MoleculeBuilder::new().with_topology_trust(crate::TopologyTrust::CoordinateOnly);
258    let mut coords = Vec::with_capacity(num_atoms);
259    if num_atoms > 0 && !comment.is_empty() {
260        builder = builder.with_property("_FileComments", comment);
261    }
262
263    // RDKit✔️✔️: for (unsigned int i = 0; i < numAtoms; i++) {
264    // RDKit✔️✔️:   if (inStream.eof()) {
265    // RDKit✔️✔️:     throw FileParseException("EOF hit while reading atoms");
266    // RDKit✔️✔️:   }
267    // RDKit✔️✔️:   RDGeom::Point3D pos;
268    // RDKit✔️✔️:   std::string atomLine{getLine(inStream)};
269    // RDKit✔️✔️:   Atom *atom = ParseXYZFileAtomLine(atomLine, pos, i + 2);
270    // RDKit✔️✔️:   unsigned int idx = mol->addAtom(atom, false, true);
271    // RDKit✔️✔️:   conf->setAtomPos(idx, pos);
272    // RDKit✔️✔️: }
273    for i in 0..num_atoms {
274        let atom_line = lines.next().ok_or(XyzReadError::UnexpectedEof)?;
275        let (element, coord) = parse_xyz_atom_line(atom_line, i + 2)?;
276        builder.add_atom(AtomSpec::new(element));
277        coords.push(coord);
278    }
279    if num_atoms > 0 {
280        builder.add_3d_conformer(coords)?;
281    }
282
283    // RDKit✔️✔️: while (!inStream.eof()) {
284    // RDKit✔️✔️:   std::string extraLine{getLine(inStream)};
285    // RDKit✔️✔️:   ParseExtraLine(extraLine);
286    // RDKit✔️✔️: }
287    for extra_line in lines {
288        parse_extra_line(extra_line)?;
289    }
290
291    builder.build().map_err(Into::into)
292    // END RDKIT CPP FUNCTION
293}
294
295#[cfg(test)]
296mod tests {
297    use super::read_xyz_from_str;
298
299    #[test]
300    fn xyz_reader_parses_atoms_coordinates_comment_and_no_bonds_like_rdkit() {
301        let mol = read_xyz_from_str(
302            "5\nmethane\nC 0.0 0.0 0.0\nH -0.635 -0.635 0.635\nH -0.635 0.635 -0.635\nH 0.635 -0.635 -0.635\nH 0.635 0.635 0.635\n",
303        )
304        .expect("xyz parse");
305
306        assert_eq!(mol.num_atoms(), 5);
307        assert_eq!(mol.num_bonds(), 0);
308        assert_eq!(mol.atomic_numbers(), vec![6, 1, 1, 1, 1]);
309        assert_eq!(mol.properties().prop("_FileComments"), Some("methane"));
310        assert_eq!(mol.conformers_3d().len(), 1);
311        assert_eq!(
312            mol.conformers_3d()[0].coordinates()[1],
313            [-0.635, -0.635, 0.635]
314        );
315    }
316
317    #[test]
318    fn xyz_reader_accepts_uppercase_second_symbol_letter_like_rdkit() {
319        let mol = read_xyz_from_str("1\n\nCL 1 2 3\n").expect("xyz parse");
320
321        assert_eq!(mol.atomic_numbers(), vec![17]);
322        assert_eq!(mol.conformers_3d()[0].coordinates()[0], [1.0, 2.0, 3.0]);
323    }
324
325    #[test]
326    fn xyz_reader_rejects_extra_atom_line_fields_like_rdkit() {
327        let err = read_xyz_from_str("1\n\nC 1 2 3 extra\n").expect_err("xyz parse should fail");
328
329        assert!(err.to_string().contains("cannot convert '3 extra'"));
330    }
331
332    #[test]
333    fn xyz_reader_rejects_scientific_notation_like_rdkit_to_double_false() {
334        let err = read_xyz_from_str("1\n\nC 1e0 2 3\n").expect_err("xyz parse should fail");
335
336        assert!(err.to_string().contains("cannot convert '1e0'"));
337    }
338
339    #[test]
340    fn xyz_reader_atom_count_accepts_only_rdkit_unsigned_space_rules() {
341        let mol = read_xyz_from_str("   \ncomment\n").expect("space-only count is zero");
342        assert_eq!(mol.num_atoms(), 0);
343
344        let mol = read_xyz_from_str("1 2\n\nC 1 2 3\n").expect("from_chars reads prefix");
345        assert_eq!(mol.num_atoms(), 1);
346
347        let err = read_xyz_from_str("\t1\n\nC 1 2 3\n").expect_err("tab is not accepted");
348        assert!(
349            err.to_string()
350                .contains("unable to recognize the number of atoms")
351        );
352    }
353}