1use std::num::ParseFloatError;
7
8use crate::{AtomSpec, Element, Molecule, MoleculeBuildError, MoleculeBuilder};
9
10#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
11pub enum XyzReadError {
12 #[error("empty XYZ block")]
13 EmptyBlock,
14 #[error(
15 "unable to recognize the number of atoms: cannot convert '{value}' to unsigned int on line 0"
16 )]
17 AtomCount { value: String },
18 #[error("EOF hit while reading atoms")]
19 UnexpectedEof,
20 #[error("missing coordinates on line {line}")]
21 MissingCoordinates { line: usize },
22 #[error("cannot convert '{value}' to double on line {line}")]
23 Coordinate {
24 value: String,
25 line: usize,
26 #[source]
27 source: ParseFloatError,
28 },
29 #[error("{message}")]
30 AtomSymbol { message: String },
31 #[error(transparent)]
32 Build(#[from] MoleculeBuildError),
33}
34
35fn parse_float_error() -> ParseFloatError {
36 "x".parse::<f64>().unwrap_err()
37}
38
39fn rdkit_to_unsigned(value: &str) -> Result<usize, XyzReadError> {
40 let checked = value.split_once('\0').map_or(value, |(prefix, _)| prefix);
51 if !checked
52 .bytes()
53 .all(|byte| byte.is_ascii_digit() || byte == b' ' || byte == b'+')
54 {
55 return Err(XyzReadError::AtomCount {
56 value: value.to_string(),
57 });
58 }
59
60 let trimmed_start = checked.trim_start_matches(' ');
72 if checked.len() != 0 && trimmed_start.is_empty() {
73 return Ok(0);
74 }
75
76 let digit_prefix = trimmed_start.bytes().take_while(u8::is_ascii_digit).count();
79 if digit_prefix == 0 {
80 return Ok(0);
81 }
82 trimmed_start[..digit_prefix]
83 .parse::<usize>()
84 .map_err(|_| XyzReadError::AtomCount {
85 value: value.to_string(),
86 })
87 }
89
90fn rdkit_to_double_no_spaces(value: &str, line: usize) -> Result<f64, XyzReadError> {
91 let checked = value.split_once('\0').map_or(value, |(prefix, _)| prefix);
102 if !checked.bytes().all(|byte| {
103 byte.is_ascii_digit() || byte == b'+' || byte == b'-' || byte == b',' || byte == b'.'
104 }) {
105 return Err(XyzReadError::Coordinate {
106 value: value.to_string(),
107 line,
108 source: value.parse::<f64>().err().unwrap_or_else(parse_float_error),
109 });
110 }
111
112 let numeric_prefix = checked
114 .split_once(',')
115 .map_or(checked, |(prefix, _)| prefix);
116 numeric_prefix
117 .parse::<f64>()
118 .map_err(|source| XyzReadError::Coordinate {
119 value: value.to_string(),
120 line,
121 source,
122 })
123 }
125
126#[must_use]
127fn normalize_xyz_symbol(raw: &str) -> String {
128 let mut chars = raw.chars().collect::<Vec<_>>();
129 if chars.len() == 2 && chars[1].is_ascii_uppercase() {
130 chars[1] = chars[1].to_ascii_lowercase();
131 }
132 chars.into_iter().collect()
133}
134
135fn element_from_rdkit_symbol(symbol: &str) -> Result<Element, XyzReadError> {
136 for atomic_number in 0..=118u8 {
137 if crate::chemistry::valence::rdkit_element_symbol(atomic_number).ok() == Some(symbol) {
138 return Element::from_atomic_number(atomic_number).ok_or_else(|| {
139 XyzReadError::AtomSymbol {
140 message: format!("Bad atomic number for element symbol {symbol}"),
141 }
142 });
143 }
144 }
145 Err(XyzReadError::AtomSymbol {
146 message: format!("Element '{symbol}' not found"),
147 })
148}
149
150fn parse_xyz_atom_line(atom_line: &str, line: usize) -> Result<(Element, [f64; 3]), XyzReadError> {
151 let mut delims = [0usize; 8];
170 let mut prev = 0usize;
171 for (i, delim) in delims.iter_mut().take(7).enumerate() {
172 *delim = if i % 2 == 0 {
173 atom_line[prev..]
174 .find(|ch: char| ch != ' ' && ch != '\t')
175 .map(|idx| prev + idx)
176 } else {
177 atom_line[prev..].find([' ', '\t']).map(|idx| prev + idx)
178 }
179 .ok_or(XyzReadError::MissingCoordinates { line })?;
180 prev = *delim;
181 }
182 delims[7] = atom_line
183 .rfind(|ch: char| ch != ' ' && ch != '\t')
184 .map_or(0, |idx| idx + 1);
185
186 let pos = [
193 rdkit_to_double_no_spaces(&atom_line[delims[2]..delims[3]], line)?,
194 rdkit_to_double_no_spaces(&atom_line[delims[4]..delims[5]], line)?,
195 rdkit_to_double_no_spaces(&atom_line[delims[6]..delims[7]], line)?,
196 ];
197
198 let symbol = normalize_xyz_symbol(&atom_line[delims[0]..delims[1]]);
204 let element = element_from_rdkit_symbol(&symbol)?;
205 Ok((element, pos))
206 }
208
209fn parse_extra_line(extra_line: &str) -> Result<(), XyzReadError> {
210 if extra_line.trim_matches([' ', '\t']).is_empty() {
218 Ok(())
219 } else {
220 Err(XyzReadError::AtomSymbol {
221 message: "More lines than expected".to_string(),
222 })
223 }
224 }
226
227pub fn read_xyz_from_str(block: &str) -> Result<Molecule, XyzReadError> {
228 if block.is_empty() {
238 return Err(XyzReadError::EmptyBlock);
239 }
240
241 let mut lines = block.lines();
242 let atom_count_line = lines.next().ok_or(XyzReadError::EmptyBlock)?;
245 let num_atoms = rdkit_to_unsigned(atom_count_line)?;
246
247 let comment = lines.next().unwrap_or_default();
249
250 let mut builder =
257 MoleculeBuilder::new().with_topology_trust(crate::TopologyTrust::CoordinateOnly);
258 let mut coords = Vec::with_capacity(num_atoms);
259 if num_atoms > 0 && !comment.is_empty() {
260 builder = builder.with_property("_FileComments", comment);
261 }
262
263 for i in 0..num_atoms {
274 let atom_line = lines.next().ok_or(XyzReadError::UnexpectedEof)?;
275 let (element, coord) = parse_xyz_atom_line(atom_line, i + 2)?;
276 builder.add_atom(AtomSpec::new(element));
277 coords.push(coord);
278 }
279 if num_atoms > 0 {
280 builder.add_3d_conformer(coords)?;
281 }
282
283 for extra_line in lines {
288 parse_extra_line(extra_line)?;
289 }
290
291 builder.build().map_err(Into::into)
292 }
294
295#[cfg(test)]
296mod tests {
297 use super::read_xyz_from_str;
298
299 #[test]
300 fn xyz_reader_parses_atoms_coordinates_comment_and_no_bonds_like_rdkit() {
301 let mol = read_xyz_from_str(
302 "5\nmethane\nC 0.0 0.0 0.0\nH -0.635 -0.635 0.635\nH -0.635 0.635 -0.635\nH 0.635 -0.635 -0.635\nH 0.635 0.635 0.635\n",
303 )
304 .expect("xyz parse");
305
306 assert_eq!(mol.num_atoms(), 5);
307 assert_eq!(mol.num_bonds(), 0);
308 assert_eq!(mol.atomic_numbers(), vec![6, 1, 1, 1, 1]);
309 assert_eq!(mol.properties().prop("_FileComments"), Some("methane"));
310 assert_eq!(mol.conformers_3d().len(), 1);
311 assert_eq!(
312 mol.conformers_3d()[0].coordinates()[1],
313 [-0.635, -0.635, 0.635]
314 );
315 }
316
317 #[test]
318 fn xyz_reader_accepts_uppercase_second_symbol_letter_like_rdkit() {
319 let mol = read_xyz_from_str("1\n\nCL 1 2 3\n").expect("xyz parse");
320
321 assert_eq!(mol.atomic_numbers(), vec![17]);
322 assert_eq!(mol.conformers_3d()[0].coordinates()[0], [1.0, 2.0, 3.0]);
323 }
324
325 #[test]
326 fn xyz_reader_rejects_extra_atom_line_fields_like_rdkit() {
327 let err = read_xyz_from_str("1\n\nC 1 2 3 extra\n").expect_err("xyz parse should fail");
328
329 assert!(err.to_string().contains("cannot convert '3 extra'"));
330 }
331
332 #[test]
333 fn xyz_reader_rejects_scientific_notation_like_rdkit_to_double_false() {
334 let err = read_xyz_from_str("1\n\nC 1e0 2 3\n").expect_err("xyz parse should fail");
335
336 assert!(err.to_string().contains("cannot convert '1e0'"));
337 }
338
339 #[test]
340 fn xyz_reader_atom_count_accepts_only_rdkit_unsigned_space_rules() {
341 let mol = read_xyz_from_str(" \ncomment\n").expect("space-only count is zero");
342 assert_eq!(mol.num_atoms(), 0);
343
344 let mol = read_xyz_from_str("1 2\n\nC 1 2 3\n").expect("from_chars reads prefix");
345 assert_eq!(mol.num_atoms(), 1);
346
347 let err = read_xyz_from_str("\t1\n\nC 1 2 3\n").expect_err("tab is not accepted");
348 assert!(
349 err.to_string()
350 .contains("unable to recognize the number of atoms")
351 );
352 }
353}