use std::collections::HashMap;
use std::io::BufRead;
use crate::atom::Atom;
use crate::error::{Result, SdfError};
use crate::molecule::Molecule;
/// Maps an atomic number to its element symbol.
///
/// Covers the whole periodic table, hydrogen (1) through oganesson (118).
/// Returns `None` for `0` and for any value above 118.
fn atomic_number_to_symbol(num: u8) -> Option<&'static str> {
    // Indexed by `atomic number - 1`; ten symbols per row to keep the table auditable.
    const SYMBOLS: [&str; 118] = [
        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
        "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca",
        "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
        "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr",
        "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
        "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
        "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
        "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
        "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
        "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
        "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
        "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og",
    ];
    match num {
        // The range guard keeps the subtraction and the index in bounds.
        1..=118 => Some(SYMBOLS[usize::from(num) - 1]),
        _ => None,
    }
}
/// Puts an element symbol into canonical capitalisation.
///
/// Surrounding whitespace is trimmed, the first character is upper-cased and
/// every remaining character is lower-cased (`"CA"` becomes `"Ca"`).
/// Input that is empty after trimming yields an empty string.
fn normalize_element(element: &str) -> String {
    let symbol = element.trim();
    let mut letters = symbol.chars();
    match letters.next() {
        None => String::new(),
        Some(initial) => {
            let mut normalized: String = initial.to_uppercase().collect();
            let tail: String = letters.collect();
            normalized.push_str(&tail.to_lowercase());
            normalized
        }
    }
}
/// Line-oriented parser state for XYZ input read from `R`.
pub struct XyzParser<R> {
    /// Source the lines are pulled from.
    reader: R,
    /// Text of the most recently loaded line.
    current_line: String,
    /// Number of lines pulled from `reader` so far; used in parse error reports.
    line_number: usize,
    /// When set, `current_line` has been pushed back and the next read hands it
    /// out again instead of consuming new input.
    peeked: bool,
}
impl<R: BufRead> XyzParser<R> {
    /// Creates a parser that reads XYZ records from `reader`.
    pub fn new(reader: R) -> Self {
        Self {
            reader,
            line_number: 0,
            current_line: String::new(),
            peeked: false,
        }
    }

    /// Loads the next line into `current_line`, without its line terminator.
    ///
    /// If a line was pushed back by `skip_blank_lines`, that same line is handed
    /// out again and nothing is read from the underlying reader.
    ///
    /// Returns `Ok(true)` when a line is available and `Ok(false)` once the
    /// reader yields no more bytes.
    ///
    /// # Errors
    /// Propagates any error reported by the underlying reader.
    fn read_line(&mut self) -> Result<bool> {
        if self.peeked {
            self.peeked = false;
            return Ok(!self.current_line.is_empty());
        }
        self.current_line.clear();
        let bytes_read = self.reader.read_line(&mut self.current_line)?;
        if bytes_read == 0 {
            return Ok(false);
        }
        self.line_number += 1;
        // Strip a trailing "\n" or "\r\n"; a lone "\r" without "\n" is left in place.
        if self.current_line.ends_with('\n') {
            self.current_line.pop();
            if self.current_line.ends_with('\r') {
                self.current_line.pop();
            }
        }
        Ok(true)
    }

    /// Consumes lines until one contains something other than whitespace.
    ///
    /// The non-blank line is pushed back (via `peeked`) so the next `read_line`
    /// call returns it. Returns `Ok(false)` if the input ends first.
    fn skip_blank_lines(&mut self) -> Result<bool> {
        loop {
            if !self.read_line()? {
                return Ok(false);
            }
            if !self.current_line.trim().is_empty() {
                self.peeked = true;
                return Ok(true);
            }
        }
    }

    /// Interprets the current line as the atom count that opens an XYZ record.
    ///
    /// # Errors
    /// Returns `SdfError::InvalidCountsLine` when the trimmed line is not a
    /// valid `usize`.
    fn parse_atom_count_line(&self) -> Result<usize> {
        let trimmed = self.current_line.trim();
        trimmed
            .parse::<usize>()
            .map_err(|_| SdfError::InvalidCountsLine(format!("Invalid atom count: {}", trimmed)))
    }

    /// Parses the current line as `element x y z`; further columns are ignored.
    ///
    /// The element column may be a symbol (any capitalisation) or an atomic
    /// number. `index` is stored on the returned atom unchanged.
    ///
    /// # Errors
    /// * `SdfError::Parse` when fewer than four columns are present or the
    ///   element column is a number that names no known element.
    /// * `SdfError::InvalidCoordinate` when a coordinate is not a valid `f64`.
    fn parse_atom_line(&self, index: usize) -> Result<Atom> {
        let mut fields = self.current_line.split_whitespace();
        let (Some(element_str), Some(x_str), Some(y_str), Some(z_str)) =
            (fields.next(), fields.next(), fields.next(), fields.next())
        else {
            return Err(SdfError::Parse {
                line: self.line_number,
                message: format!("Atom line too short: {}", self.current_line),
            });
        };

        let element = if let Ok(atomic_num) = element_str.parse::<u8>() {
            atomic_number_to_symbol(atomic_num)
                .map(|s| s.to_string())
                .ok_or_else(|| SdfError::Parse {
                    line: self.line_number,
                    message: format!("Unknown atomic number: {}", atomic_num),
                })?
        } else if element_str.bytes().all(|b| b.is_ascii_digit()) {
            // All digits but too large for `u8` (e.g. "300"): this is an invalid
            // atomic number, not an element symbol, so reject it rather than
            // storing the digits as the element name.
            return Err(SdfError::Parse {
                line: self.line_number,
                message: format!("Unknown atomic number: {}", element_str),
            });
        } else {
            normalize_element(element_str)
        };

        let parse_coordinate = |raw: &str| {
            raw.parse::<f64>()
                .map_err(|_| SdfError::InvalidCoordinate(raw.to_string()))
        };
        let x = parse_coordinate(x_str)?;
        let y = parse_coordinate(y_str)?;
        let z = parse_coordinate(z_str)?;

        Ok(Atom {
            index,
            element,
            x,
            y,
            z,
            formal_charge: 0,
            mass_difference: 0,
            stereo_parity: None,
            hydrogen_count: None,
            valence: None,
            v3000_id: None,
            atom_atom_mapping: None,
            rgroup_label: None,
            radical: None,
        })
    }

    /// Parses the next XYZ record: atom count line, comment line, then one line
    /// per atom.
    ///
    /// Blank lines before the record are skipped. The trimmed comment line
    /// becomes the molecule name; the molecule has no bonds or properties.
    ///
    /// Returns `Ok(None)` when the input contains no further record.
    ///
    /// # Errors
    /// * `SdfError::InvalidCountsLine` when the count line is not a number.
    /// * `SdfError::MissingSection` when the input ends before the comment line.
    /// * `SdfError::AtomCountMismatch` when the input ends before the declared
    ///   number of atom lines has been read.
    /// * Any error from `parse_atom_line` or the underlying reader.
    pub fn parse_molecule(&mut self) -> Result<Option<Molecule>> {
        // Upper bound on the capacity reserved up front. The atom count comes
        // straight from the file, so a corrupt or hostile value must not be able
        // to trigger a capacity-overflow panic or an allocation failure; larger
        // molecules simply grow the vector as atoms are read.
        const MAX_PREALLOCATED_ATOMS: usize = 4096;

        if !self.skip_blank_lines()? {
            return Ok(None);
        }
        if !self.read_line()? {
            return Ok(None);
        }
        let num_atoms = self.parse_atom_count_line()?;
        if !self.read_line()? {
            return Err(SdfError::MissingSection("XYZ comment line".to_string()));
        }
        let name = self.current_line.trim().to_string();

        let mut atoms = Vec::with_capacity(num_atoms.min(MAX_PREALLOCATED_ATOMS));
        for i in 0..num_atoms {
            if !self.read_line()? {
                return Err(SdfError::AtomCountMismatch {
                    expected: num_atoms,
                    found: i,
                });
            }
            atoms.push(self.parse_atom_line(i)?);
        }

        Ok(Some(Molecule {
            name,
            program_line: None,
            comment: None,
            atoms,
            bonds: Vec::new(),
            properties: HashMap::new(),
            format_version: crate::molecule::SdfFormat::V2000,
            stereogroups: Vec::new(),
            sgroups: Vec::new(),
            collections: Vec::new(),
        }))
    }
}
/// Iterator adapter that pulls molecules one at a time from an `XyzParser`.
pub struct XyzIterator<R> {
    /// Once set, `next` returns `None` without touching the parser again.
    finished: bool,
    /// Parser that owns the underlying reader.
    parser: XyzParser<R>,
}
impl<R: BufRead> XyzIterator<R> {
pub fn new(reader: R) -> Self {
Self {
parser: XyzParser::new(reader),
finished: false,
}
}
}
impl<R: BufRead> Iterator for XyzIterator<R> {
    type Item = Result<Molecule>;

    /// Yields the next molecule, or the parse error that stopped iteration.
    ///
    /// After the input is exhausted or an error has been yielded, every later
    /// call returns `None`.
    fn next(&mut self) -> Option<Self::Item> {
        if self.finished {
            return None;
        }
        // Ok(Some(m)) -> Some(Ok(m)), Ok(None) -> None, Err(e) -> Some(Err(e)).
        let outcome = self.parser.parse_molecule().transpose();
        // Anything other than a successfully parsed molecule ends the iteration.
        self.finished = !matches!(outcome, Some(Ok(_)));
        outcome
    }
}
/// Parses the first molecule found in XYZ-formatted `content`.
///
/// # Errors
/// Returns `SdfError::EmptyFile` when `content` holds no record, or whatever
/// error the parser reports for a malformed record.
pub fn parse_xyz_string(content: &str) -> Result<Molecule> {
    let source = std::io::BufReader::new(std::io::Cursor::new(content));
    let mut parser = XyzParser::new(source);
    match parser.parse_molecule()? {
        Some(molecule) => Ok(molecule),
        None => Err(SdfError::EmptyFile),
    }
}
/// Parses every molecule in XYZ-formatted `content`, in order of appearance.
///
/// # Errors
/// Returns the first parse error encountered; molecules parsed before it are
/// discarded.
pub fn parse_xyz_string_multi(content: &str) -> Result<Vec<Molecule>> {
    let source = std::io::BufReader::new(std::io::Cursor::new(content));
    XyzIterator::new(source).collect()
}
/// Reads the first molecule from the XYZ file at `path`.
///
/// With the `gzip` feature enabled the file is opened through
/// `compression::open_maybe_gz`. Without it, a path whose extension is `gz`
/// (compared ASCII case-insensitively) is rejected before the file is opened.
///
/// # Errors
/// * `SdfError::GzipNotEnabled` for a `.gz` path when `gzip` is disabled.
/// * `SdfError::EmptyFile` when the file contains no record.
/// * Any error from opening the file or parsing the record.
pub fn parse_xyz_file<P: AsRef<std::path::Path>>(path: P) -> Result<Molecule> {
    #[cfg(feature = "gzip")]
    let source = super::compression::open_maybe_gz(&path)?;
    #[cfg(not(feature = "gzip"))]
    let source = {
        let has_gz_extension = path
            .as_ref()
            .extension()
            .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"));
        if has_gz_extension {
            return Err(SdfError::GzipNotEnabled);
        }
        std::io::BufReader::new(std::fs::File::open(path)?)
    };
    let mut parser = XyzParser::new(source);
    match parser.parse_molecule()? {
        Some(molecule) => Ok(molecule),
        None => Err(SdfError::EmptyFile),
    }
}
/// Reads every molecule from the XYZ file at `path`.
///
/// With the `gzip` feature enabled the file is opened through
/// `compression::open_maybe_gz`. Without it, a path whose extension is `gz`
/// (compared ASCII case-insensitively) is rejected before the file is opened.
///
/// # Errors
/// * `SdfError::GzipNotEnabled` for a `.gz` path when `gzip` is disabled.
/// * Any error from opening the file, or the first parse error encountered.
pub fn parse_xyz_file_multi<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<Molecule>> {
    #[cfg(feature = "gzip")]
    let source = super::compression::open_maybe_gz(&path)?;
    #[cfg(not(feature = "gzip"))]
    let source = {
        let has_gz_extension = path
            .as_ref()
            .extension()
            .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"));
        if has_gz_extension {
            return Err(SdfError::GzipNotEnabled);
        }
        std::io::BufReader::new(std::fs::File::open(path)?)
    };
    XyzIterator::new(source).collect()
}
/// Opens the XYZ file at `path` through `compression::open_maybe_gz` and
/// returns an iterator over its molecules.
///
/// # Errors
/// Propagates any error from `open_maybe_gz`.
#[cfg(feature = "gzip")]
pub fn iter_xyz_file<P: AsRef<std::path::Path>>(
    path: P,
) -> Result<XyzIterator<super::compression::MaybeGzReader>> {
    Ok(XyzIterator::new(super::compression::open_maybe_gz(&path)?))
}
/// Opens the XYZ file at `path` and returns an iterator over its molecules.
///
/// This build has no gzip support, so a path whose extension is `gz`
/// (compared ASCII case-insensitively) is rejected before the file is opened.
///
/// # Errors
/// * `SdfError::GzipNotEnabled` for a `.gz` path.
/// * Any error from opening the file.
#[cfg(not(feature = "gzip"))]
pub fn iter_xyz_file<P: AsRef<std::path::Path>>(
    path: P,
) -> Result<XyzIterator<std::io::BufReader<std::fs::File>>> {
    let has_gz_extension = path
        .as_ref()
        .extension()
        .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"));
    if has_gz_extension {
        return Err(SdfError::GzipNotEnabled);
    }
    let source = std::io::BufReader::new(std::fs::File::open(path)?);
    Ok(XyzIterator::new(source))
}
// Unit tests for the XYZ parser: single and multi-record parsing, element
// normalisation, and the error reported for each kind of malformed input.
#[cfg(test)]
mod tests {
use super::*;
// Fixture: three-atom water record (count line, comment line, atom lines).
const WATER_XYZ: &str = r#"3
water molecule
O 0.000000 0.000000 0.117300
H 0.756950 0.000000 -0.469200
H -0.756950 0.000000 -0.469200
"#;
// Fixture: five-atom methane record.
const METHANE_XYZ: &str = r#"5
methane
C 0.000000 0.000000 0.000000
H 0.628900 0.628900 0.628900
H -0.628900 -0.628900 0.628900
H -0.628900 0.628900 -0.628900
H 0.628900 -0.628900 -0.628900
"#;
// A single record yields the comment line as name, the declared atoms with
// their coordinates, and no bonds.
#[test]
fn test_parse_water() {
let mol = parse_xyz_string(WATER_XYZ).unwrap();
assert_eq!(mol.name, "water molecule");
assert_eq!(mol.atom_count(), 3);
assert_eq!(mol.bond_count(), 0); assert_eq!(mol.formula(), "H2O");
assert_eq!(mol.atoms[0].element, "O");
assert!((mol.atoms[0].x - 0.0).abs() < 1e-6);
assert!((mol.atoms[0].y - 0.0).abs() < 1e-6);
assert!((mol.atoms[0].z - 0.1173).abs() < 1e-6);
assert_eq!(mol.atoms[1].element, "H");
assert!((mol.atoms[1].x - 0.75695).abs() < 1e-6);
}
// Name, atom count and formula of the methane fixture.
#[test]
fn test_parse_methane() {
let mol = parse_xyz_string(METHANE_XYZ).unwrap();
assert_eq!(mol.name, "methane");
assert_eq!(mol.atom_count(), 5);
assert_eq!(mol.formula(), "CH4");
}
// Two records concatenated back to back are both returned, in order.
#[test]
fn test_parse_multi() {
let multi = format!("{}{}", WATER_XYZ, METHANE_XYZ);
let mols = parse_xyz_string_multi(&multi).unwrap();
assert_eq!(mols.len(), 2);
assert_eq!(mols[0].name, "water molecule");
assert_eq!(mols[1].name, "methane");
}
// Atomic numbers in the element column are translated to symbols.
#[test]
fn test_parse_atomic_numbers() {
let xyz = r#"3
atomic number test
8 0.0 0.0 0.0
1 1.0 0.0 0.0
1 -1.0 0.0 0.0
"#;
let mol = parse_xyz_string(xyz).unwrap();
assert_eq!(mol.atoms[0].element, "O");
assert_eq!(mol.atoms[1].element, "H");
assert_eq!(mol.atoms[2].element, "H");
}
// Element symbols are normalised to an upper-case first letter followed by
// lower-case letters, whatever case the input used.
#[test]
fn test_normalize_element_case() {
let xyz = r#"3
case test
o 0.0 0.0 0.0
h 1.0 0.0 0.0
CA -1.0 0.0 0.0
"#;
let mol = parse_xyz_string(xyz).unwrap();
assert_eq!(mol.atoms[0].element, "O");
assert_eq!(mol.atoms[1].element, "H");
assert_eq!(mol.atoms[2].element, "Ca");
}
// Columns after x, y, z on an atom line do not affect parsing.
#[test]
fn test_extra_columns_ignored() {
let xyz = r#"2
extra columns test
C 0.0 0.0 0.0 0.5 extra_data
H 1.0 0.0 0.0 -0.2
"#;
let mol = parse_xyz_string(xyz).unwrap();
assert_eq!(mol.atom_count(), 2);
assert_eq!(mol.atoms[0].element, "C");
assert_eq!(mol.atoms[1].element, "H");
}
// Blank lines separating records are skipped.
#[test]
fn test_blank_lines_between_molecules() {
let xyz = format!("{}\n\n{}", WATER_XYZ, METHANE_XYZ);
let mols = parse_xyz_string_multi(&xyz).unwrap();
assert_eq!(mols.len(), 2);
}
// Empty input is reported as `SdfError::EmptyFile` by the single-record API.
#[test]
fn test_empty_file() {
let result = parse_xyz_string("");
assert!(result.is_err());
assert!(matches!(result.unwrap_err(), SdfError::EmptyFile));
}
// A non-numeric count line is reported as `SdfError::InvalidCountsLine`.
#[test]
fn test_invalid_atom_count() {
let xyz = r#"abc
test
C 0.0 0.0 0.0
"#;
let result = parse_xyz_string(xyz);
assert!(result.is_err());
assert!(matches!(
result.unwrap_err(),
SdfError::InvalidCountsLine(_)
));
}
// Input ending before the declared atom count is reached reports how many
// atoms were expected and how many were actually read.
#[test]
fn test_fewer_atoms_than_declared() {
let xyz = r#"5
missing atoms
C 0.0 0.0 0.0
H 1.0 0.0 0.0
"#;
let result = parse_xyz_string(xyz);
assert!(result.is_err());
assert!(matches!(
result.unwrap_err(),
SdfError::AtomCountMismatch {
expected: 5,
found: 2
}
));
}
// A coordinate that is not a number is reported as `SdfError::InvalidCoordinate`.
#[test]
fn test_invalid_coordinate() {
let xyz = r#"1
bad coords
C abc 0.0 0.0
"#;
let result = parse_xyz_string(xyz);
assert!(result.is_err());
assert!(matches!(
result.unwrap_err(),
SdfError::InvalidCoordinate(_)
));
}
// Driving `XyzIterator` directly yields every record in input order.
#[test]
fn test_iterator() {
let multi = format!("{}{}{}", WATER_XYZ, METHANE_XYZ, WATER_XYZ);
let cursor = std::io::Cursor::new(multi);
let reader = std::io::BufReader::new(cursor);
let iter = XyzIterator::new(reader);
let mols: Vec<_> = iter.map(|r| r.unwrap()).collect();
assert_eq!(mols.len(), 3);
assert_eq!(mols[0].name, "water molecule");
assert_eq!(mols[1].name, "methane");
assert_eq!(mols[2].name, "water molecule");
}
}