use std::collections::HashMap;
use std::io::BufRead;
use crate::atom::Atom;
use crate::bond::{Bond, BondOrder, BondStereo};
use crate::error::{Result, SdfError};
use crate::molecule::Molecule;
fn mol2_bond_type_to_order(bond_type: &str) -> BondOrder {
match bond_type {
"1" | "1n" => BondOrder::Single,
"2" => BondOrder::Double,
"3" => BondOrder::Triple,
"ar" | "am" => BondOrder::Aromatic,
"du" => BondOrder::Single, "un" => BondOrder::Single, "nc" => BondOrder::Single, _ => BondOrder::Single, }
}
pub struct Mol2Parser<R> {
reader: R,
line_number: usize,
current_line: String,
peeked: bool,
}
impl<R: BufRead> Mol2Parser<R> {
pub fn new(reader: R) -> Self {
Self {
reader,
line_number: 0,
current_line: String::new(),
peeked: false,
}
}
fn read_line(&mut self) -> Result<bool> {
if self.peeked {
self.peeked = false;
return Ok(!self.current_line.is_empty());
}
self.current_line.clear();
let bytes_read = self.reader.read_line(&mut self.current_line)?;
if bytes_read > 0 {
self.line_number += 1;
if self.current_line.ends_with('\n') {
self.current_line.pop();
if self.current_line.ends_with('\r') {
self.current_line.pop();
}
}
Ok(true)
} else {
Ok(false)
}
}
fn peek_line(&mut self) -> Result<Option<&str>> {
if !self.peeked {
if !self.read_line()? {
return Ok(None);
}
self.peeked = true;
}
Ok(Some(&self.current_line))
}
fn skip_to_section(&mut self) -> Result<Option<String>> {
loop {
if !self.read_line()? {
return Ok(None);
}
if self.current_line.starts_with("@<TRIPOS>") {
let section = self.current_line[9..].trim().to_uppercase();
return Ok(Some(section));
}
}
}
pub fn parse_molecule(&mut self) -> Result<Option<Molecule>> {
loop {
match self.skip_to_section()? {
Some(section) if section == "MOLECULE" => break,
Some(_) => continue,
None => return Ok(None),
}
}
if !self.read_line()? {
return Err(SdfError::MissingSection("MOLECULE name".to_string()));
}
let name = self.current_line.trim().to_string();
if !self.read_line()? {
return Err(SdfError::MissingSection("MOLECULE counts".to_string()));
}
let counts: Vec<&str> = self.current_line.split_whitespace().collect();
if counts.len() < 2 {
return Err(SdfError::InvalidCountsLine(self.current_line.clone()));
}
let num_atoms: usize = counts[0].parse().map_err(|_| {
SdfError::InvalidCountsLine(format!("Invalid atom count: {}", counts[0]))
})?;
let num_bonds: usize = counts[1].parse().map_err(|_| {
SdfError::InvalidCountsLine(format!("Invalid bond count: {}", counts[1]))
})?;
if !self.read_line()? {
return Err(SdfError::MissingSection("MOLECULE type".to_string()));
}
if !self.read_line()? {
return Err(SdfError::MissingSection("MOLECULE charge type".to_string()));
}
let mut atoms = Vec::with_capacity(num_atoms);
let mut bonds = Vec::with_capacity(num_bonds);
let properties = HashMap::new();
loop {
match self.peek_line()? {
None => break,
Some(line) if line.starts_with("@<TRIPOS>MOLECULE") => break,
Some(line) if line.starts_with("@<TRIPOS>") => {
let section = line[9..].trim().to_uppercase();
self.read_line()?;
match section.as_str() {
"ATOM" => {
atoms = self.parse_atom_section(num_atoms)?;
}
"BOND" => {
bonds = self.parse_bond_section(num_bonds, atoms.len())?;
}
"SUBSTRUCTURE" | "COMMENT" | "HEADTAIL" | "SET" | "DICT" | "EXTENSION" => {
self.skip_section()?;
}
_ => {
self.skip_section()?;
}
}
}
_ => {
self.read_line()?; }
}
}
if atoms.len() != num_atoms {
return Err(SdfError::AtomCountMismatch {
expected: num_atoms,
found: atoms.len(),
});
}
Ok(Some(Molecule {
name,
program_line: None,
comment: None,
atoms,
bonds,
properties,
format_version: crate::molecule::SdfFormat::V2000,
stereogroups: Vec::new(),
sgroups: Vec::new(),
collections: Vec::new(),
}))
}
fn parse_atom_section(&mut self, expected_count: usize) -> Result<Vec<Atom>> {
let mut atoms = Vec::with_capacity(expected_count);
for i in 0..expected_count {
if !self.read_line()? {
return Err(SdfError::AtomCountMismatch {
expected: expected_count,
found: i,
});
}
if self.current_line.starts_with("@<TRIPOS>") {
self.peeked = true;
return Err(SdfError::AtomCountMismatch {
expected: expected_count,
found: i,
});
}
let atom = self.parse_atom_line(i)?;
atoms.push(atom);
}
Ok(atoms)
}
fn parse_atom_line(&self, index: usize) -> Result<Atom> {
let parts: Vec<&str> = self.current_line.split_whitespace().collect();
if parts.len() < 6 {
return Err(SdfError::Parse {
line: self.line_number,
message: format!("Atom line too short: {}", self.current_line),
});
}
let x: f64 = parts[2]
.parse()
.map_err(|_| SdfError::InvalidCoordinate(parts[2].to_string()))?;
let y: f64 = parts[3]
.parse()
.map_err(|_| SdfError::InvalidCoordinate(parts[3].to_string()))?;
let z: f64 = parts[4]
.parse()
.map_err(|_| SdfError::InvalidCoordinate(parts[4].to_string()))?;
let atom_type = parts[5];
let element = atom_type.split('.').next().unwrap_or(atom_type).to_string();
let partial_charge: f64 = if parts.len() > 8 {
parts[8].parse().unwrap_or(0.0)
} else {
0.0
};
let formal_charge = partial_charge.round() as i8;
Ok(Atom {
index,
element,
x,
y,
z,
formal_charge,
mass_difference: 0,
stereo_parity: None,
hydrogen_count: None,
valence: None,
v3000_id: None,
atom_atom_mapping: None,
rgroup_label: None,
radical: None,
})
}
fn parse_bond_section(
&mut self,
expected_count: usize,
atom_count: usize,
) -> Result<Vec<Bond>> {
let mut bonds = Vec::with_capacity(expected_count);
for i in 0..expected_count {
if !self.read_line()? {
return Err(SdfError::BondCountMismatch {
expected: expected_count,
found: i,
});
}
if self.current_line.starts_with("@<TRIPOS>") {
self.peeked = true;
return Err(SdfError::BondCountMismatch {
expected: expected_count,
found: i,
});
}
let bond = self.parse_bond_line(atom_count)?;
bonds.push(bond);
}
Ok(bonds)
}
fn parse_bond_line(&self, atom_count: usize) -> Result<Bond> {
let parts: Vec<&str> = self.current_line.split_whitespace().collect();
if parts.len() < 4 {
return Err(SdfError::Parse {
line: self.line_number,
message: format!("Bond line too short: {}", self.current_line),
});
}
let atom1: usize = parts[1]
.parse::<usize>()
.map_err(|_| SdfError::Parse {
line: self.line_number,
message: "Invalid atom1 index".to_string(),
})?
.checked_sub(1)
.ok_or(SdfError::InvalidAtomIndex {
index: 0,
atom_count,
})?;
let atom2: usize = parts[2]
.parse::<usize>()
.map_err(|_| SdfError::Parse {
line: self.line_number,
message: "Invalid atom2 index".to_string(),
})?
.checked_sub(1)
.ok_or(SdfError::InvalidAtomIndex {
index: 0,
atom_count,
})?;
if atom1 >= atom_count {
return Err(SdfError::InvalidAtomIndex {
index: atom1 + 1,
atom_count,
});
}
if atom2 >= atom_count {
return Err(SdfError::InvalidAtomIndex {
index: atom2 + 1,
atom_count,
});
}
let bond_type = parts[3].to_lowercase();
let order = mol2_bond_type_to_order(&bond_type);
Ok(Bond {
atom1,
atom2,
order,
stereo: BondStereo::None,
topology: None,
v3000_id: None,
reacting_center: None,
})
}
fn skip_section(&mut self) -> Result<()> {
loop {
match self.peek_line()? {
None => return Ok(()),
Some(line) if line.starts_with("@<TRIPOS>") => return Ok(()),
_ => {
self.read_line()?;
}
}
}
}
}
pub struct Mol2Iterator<R> {
parser: Mol2Parser<R>,
finished: bool,
}
impl<R: BufRead> Mol2Iterator<R> {
pub fn new(reader: R) -> Self {
Self {
parser: Mol2Parser::new(reader),
finished: false,
}
}
}
impl<R: BufRead> Iterator for Mol2Iterator<R> {
type Item = Result<Molecule>;
fn next(&mut self) -> Option<Self::Item> {
if self.finished {
return None;
}
match self.parser.parse_molecule() {
Ok(Some(mol)) => Some(Ok(mol)),
Ok(None) => {
self.finished = true;
None
}
Err(e) => {
self.finished = true;
Some(Err(e))
}
}
}
}
pub fn parse_mol2_string(content: &str) -> Result<Molecule> {
let cursor = std::io::Cursor::new(content);
let reader = std::io::BufReader::new(cursor);
let mut parser = Mol2Parser::new(reader);
parser.parse_molecule()?.ok_or(SdfError::EmptyFile)
}
pub fn parse_mol2_string_multi(content: &str) -> Result<Vec<Molecule>> {
let cursor = std::io::Cursor::new(content);
let reader = std::io::BufReader::new(cursor);
let iter = Mol2Iterator::new(reader);
iter.collect()
}
pub fn parse_mol2_file<P: AsRef<std::path::Path>>(path: P) -> Result<Molecule> {
#[cfg(feature = "gzip")]
{
let reader = super::compression::open_maybe_gz(&path)?;
let mut parser = Mol2Parser::new(reader);
parser.parse_molecule()?.ok_or(SdfError::EmptyFile)
}
#[cfg(not(feature = "gzip"))]
{
if path
.as_ref()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
{
return Err(SdfError::GzipNotEnabled);
}
let file = std::fs::File::open(path)?;
let reader = std::io::BufReader::new(file);
let mut parser = Mol2Parser::new(reader);
parser.parse_molecule()?.ok_or(SdfError::EmptyFile)
}
}
pub fn parse_mol2_file_multi<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<Molecule>> {
#[cfg(feature = "gzip")]
{
let reader = super::compression::open_maybe_gz(&path)?;
let iter = Mol2Iterator::new(reader);
iter.collect()
}
#[cfg(not(feature = "gzip"))]
{
if path
.as_ref()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
{
return Err(SdfError::GzipNotEnabled);
}
let file = std::fs::File::open(path)?;
let reader = std::io::BufReader::new(file);
let iter = Mol2Iterator::new(reader);
iter.collect()
}
}
#[cfg(feature = "gzip")]
pub fn iter_mol2_file<P: AsRef<std::path::Path>>(
path: P,
) -> Result<Mol2Iterator<super::compression::MaybeGzReader>> {
let reader = super::compression::open_maybe_gz(&path)?;
Ok(Mol2Iterator::new(reader))
}
#[cfg(not(feature = "gzip"))]
pub fn iter_mol2_file<P: AsRef<std::path::Path>>(
path: P,
) -> Result<Mol2Iterator<std::io::BufReader<std::fs::File>>> {
if path
.as_ref()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
{
return Err(SdfError::GzipNotEnabled);
}
let file = std::fs::File::open(path)?;
let reader = std::io::BufReader::new(file);
Ok(Mol2Iterator::new(reader))
}
#[cfg(test)]
mod tests {
use super::*;
const SIMPLE_MOL2: &str = r#"@<TRIPOS>MOLECULE
methane
5 4 0 0 0
SMALL
NO_CHARGES
@<TRIPOS>ATOM
1 C1 0.0000 0.0000 0.0000 C.3 1 MOL 0.0000
2 H1 0.6289 0.6289 0.6289 H 1 MOL 0.0000
3 H2 -0.6289 -0.6289 0.6289 H 1 MOL 0.0000
4 H3 -0.6289 0.6289 -0.6289 H 1 MOL 0.0000
5 H4 0.6289 -0.6289 -0.6289 H 1 MOL 0.0000
@<TRIPOS>BOND
1 1 2 1
2 1 3 1
3 1 4 1
4 1 5 1
"#;
#[test]
fn test_parse_simple_mol2() {
let mol = parse_mol2_string(SIMPLE_MOL2).unwrap();
assert_eq!(mol.name, "methane");
assert_eq!(mol.atom_count(), 5);
assert_eq!(mol.bond_count(), 4);
assert_eq!(mol.formula(), "CH4");
}
#[test]
fn test_parse_mol2_atoms() {
let mol = parse_mol2_string(SIMPLE_MOL2).unwrap();
assert_eq!(mol.atoms[0].element, "C");
assert_eq!(mol.atoms[0].x, 0.0);
assert_eq!(mol.atoms[1].element, "H");
}
#[test]
fn test_parse_mol2_bonds() {
let mol = parse_mol2_string(SIMPLE_MOL2).unwrap();
for bond in &mol.bonds {
assert_eq!(bond.order, BondOrder::Single);
}
assert_eq!(mol.bonds[0].atom1, 0);
assert_eq!(mol.bonds[0].atom2, 1);
}
#[test]
fn test_parse_mol2_aromatic() {
let benzene_mol2 = r#"@<TRIPOS>MOLECULE
benzene
6 6 0 0 0
SMALL
NO_CHARGES
@<TRIPOS>ATOM
1 C1 1.2124 0.7000 0.0000 C.ar 1 MOL 0.0000
2 C2 1.2124 -0.7000 0.0000 C.ar 1 MOL 0.0000
3 C3 0.0000 -1.4000 0.0000 C.ar 1 MOL 0.0000
4 C4 -1.2124 -0.7000 0.0000 C.ar 1 MOL 0.0000
5 C5 -1.2124 0.7000 0.0000 C.ar 1 MOL 0.0000
6 C6 0.0000 1.4000 0.0000 C.ar 1 MOL 0.0000
@<TRIPOS>BOND
1 1 2 ar
2 2 3 ar
3 3 4 ar
4 4 5 ar
5 5 6 ar
6 6 1 ar
"#;
let mol = parse_mol2_string(benzene_mol2).unwrap();
assert_eq!(mol.name, "benzene");
assert_eq!(mol.atom_count(), 6);
assert_eq!(mol.bond_count(), 6);
for bond in &mol.bonds {
assert_eq!(bond.order, BondOrder::Aromatic);
}
}
#[test]
fn test_parse_mol2_with_charges() {
let charged_mol2 = r#"@<TRIPOS>MOLECULE
ammonium
5 4 0 0 0
SMALL
USER_CHARGES
@<TRIPOS>ATOM
1 N1 0.0000 0.0000 0.0000 N.4 1 MOL 1.0000
2 H1 0.6289 0.6289 0.6289 H 1 MOL 0.0000
3 H2 -0.6289 -0.6289 0.6289 H 1 MOL 0.0000
4 H3 -0.6289 0.6289 -0.6289 H 1 MOL 0.0000
5 H4 0.6289 -0.6289 -0.6289 H 1 MOL 0.0000
@<TRIPOS>BOND
1 1 2 1
2 1 3 1
3 1 4 1
4 1 5 1
"#;
let mol = parse_mol2_string(charged_mol2).unwrap();
assert_eq!(mol.name, "ammonium");
assert_eq!(mol.atoms[0].formal_charge, 1);
assert_eq!(mol.total_charge(), 1);
}
#[test]
fn test_parse_multi_mol2() {
let multi = format!("{}{}", SIMPLE_MOL2, SIMPLE_MOL2);
let mols = parse_mol2_string_multi(&multi).unwrap();
assert_eq!(mols.len(), 2);
assert_eq!(mols[0].name, "methane");
assert_eq!(mols[1].name, "methane");
}
}