dreid-forge 0.4.1

A pure Rust library and CLI that automates DREIDING force field parameterization by orchestrating structure repair, topology perception, and charge calculation for both biological and chemical systems.
Documentation
use crate::io::{Format, error::Error, util};
use crate::model::{
    atom::Atom,
    system::{Bond, System},
};
use std::io::BufRead;

pub fn read<R: BufRead>(reader: R) -> Result<System, Error> {
    let lines = collect_first_block(reader)?;
    if lines.len() < 4 {
        return Err(Error::parse(
            Format::Sdf,
            1,
            "SDF block must contain at least a header and counts line",
        ));
    }

    let counts_line_no = lines[3].0;
    let counts_line = &lines[3].1;
    if counts_line.contains("V3000") {
        return Err(Error::parse(
            Format::Sdf,
            counts_line_no,
            "V3000 is not supported",
        ));
    }

    let (atom_count, bond_count) = parse_counts(counts_line, counts_line_no)?;
    let atom_start = 4;
    let bond_start = atom_start + atom_count;

    if lines.len() < bond_start + bond_count {
        return Err(Error::parse(
            Format::Sdf,
            lines.last().map(|(ln, _)| *ln).unwrap_or(counts_line_no),
            "SDF block ended before atoms/bonds were fully specified",
        ));
    }

    let atoms = parse_atoms(&lines[atom_start..atom_start + atom_count])?;
    let bonds = parse_bonds(&lines[bond_start..bond_start + bond_count], atom_count)?;

    Ok(System {
        atoms,
        bonds,
        box_vectors: None,
        bio_metadata: None,
    })
}

fn collect_first_block<R: BufRead>(reader: R) -> Result<Vec<(usize, String)>, Error> {
    let mut lines = Vec::new();
    for (i, line) in reader.lines().enumerate() {
        let content = line.map_err(|e| Error::Io { source: e })?;
        let ln = i + 1;
        if content.trim() == "$$$$" && !lines.is_empty() {
            break;
        }
        lines.push((ln, content));
    }
    Ok(lines)
}

fn parse_counts(line: &str, line_no: usize) -> Result<(usize, usize), Error> {
    let tokens: Vec<_> = line.split_whitespace().collect();
    if tokens.len() < 2 {
        return Err(Error::parse(
            Format::Sdf,
            line_no,
            "counts line must contain atom and bond counts",
        ));
    }
    let atoms = tokens[0]
        .parse::<usize>()
        .map_err(|_| Error::parse(Format::Sdf, line_no, "invalid atom count"))?;
    let bonds = tokens[1]
        .parse::<usize>()
        .map_err(|_| Error::parse(Format::Sdf, line_no, "invalid bond count"))?;
    Ok((atoms, bonds))
}

fn parse_atoms(lines: &[(usize, String)]) -> Result<Vec<Atom>, Error> {
    let mut atoms = Vec::with_capacity(lines.len());
    for (ln, raw) in lines {
        let fixed = (|| -> Result<([f64; 3], String), Error> {
            let padded = format!("{raw:<40}");
            let x = padded
                .get(0..10)
                .unwrap_or("")
                .trim()
                .parse::<f64>()
                .map_err(|_| Error::parse(Format::Sdf, *ln, "invalid x coordinate in atom line"))?;
            let y = padded
                .get(10..20)
                .unwrap_or("")
                .trim()
                .parse::<f64>()
                .map_err(|_| Error::parse(Format::Sdf, *ln, "invalid y coordinate in atom line"))?;
            let z = padded
                .get(20..30)
                .unwrap_or("")
                .trim()
                .parse::<f64>()
                .map_err(|_| Error::parse(Format::Sdf, *ln, "invalid z coordinate in atom line"))?;
            Ok(([x, y, z], padded.get(31..34).unwrap_or("").to_string()))
        })();

        let (coords, element_token) = match fixed {
            Ok(val) => val,
            Err(_) => {
                let tokens: Vec<_> = raw.split_whitespace().collect();
                if tokens.len() < 4 {
                    return Err(Error::parse(Format::Sdf, *ln, "invalid atom line"));
                }
                let x = tokens[0].parse::<f64>().map_err(|_| {
                    Error::parse(Format::Sdf, *ln, "invalid x coordinate in atom line")
                })?;
                let y = tokens[1].parse::<f64>().map_err(|_| {
                    Error::parse(Format::Sdf, *ln, "invalid y coordinate in atom line")
                })?;
                let z = tokens[2].parse::<f64>().map_err(|_| {
                    Error::parse(Format::Sdf, *ln, "invalid z coordinate in atom line")
                })?;
                ([x, y, z], tokens[3].to_string())
            }
        };

        let element = util::guess_element_symbol(element_token.trim())
            .ok_or_else(|| Error::parse(Format::Sdf, *ln, "unable to infer element symbol"))?;
        atoms.push(Atom::new(element, coords));
    }
    Ok(atoms)
}

fn parse_bonds(lines: &[(usize, String)], atom_count: usize) -> Result<Vec<Bond>, Error> {
    let mut bonds = Vec::with_capacity(lines.len());
    for (ln, raw) in lines {
        let tokens: Vec<_> = raw.split_whitespace().collect();
        if tokens.len() < 3 {
            return Err(Error::parse(Format::Sdf, *ln, "invalid bond line"));
        }

        let a1 = tokens[0]
            .parse::<usize>()
            .map_err(|_| Error::parse(Format::Sdf, *ln, "invalid first atom index"))?;
        let a2 = tokens[1]
            .parse::<usize>()
            .map_err(|_| Error::parse(Format::Sdf, *ln, "invalid second atom index"))?;
        let order_val = tokens[2]
            .parse::<i32>()
            .map_err(|_| Error::parse(Format::Sdf, *ln, "invalid bond order value"))?;

        let order = util::bond_order_from_ctfile(order_val)
            .ok_or_else(|| Error::parse(Format::Sdf, *ln, "unsupported bond order in bond line"))?;

        if a1 == 0 || a2 == 0 || a1 > atom_count || a2 > atom_count {
            return Err(Error::parse(
                Format::Sdf,
                *ln,
                "bond references atom outside declared range",
            ));
        }

        bonds.push(Bond::new(a1 - 1, a2 - 1, order));
    }
    Ok(bonds)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::{atom::Atom, types::Element};
    use std::io::Cursor;

    fn sample_sdf() -> String {
        r#"Methanol
        Program
        Comment
        4  3  0  0  0  0  0  0  0999 V2000
            0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
            1.0900    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
        -0.5400    0.9350    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
        -0.5400   -0.9350    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
        1  2  1  0  0  0  0
        1  3  1  0  0  0  0
        1  4  1  0  0  0  0
        M  END
        $$$$
        "#
        .to_string()
    }

    #[test]
    fn parses_first_block_only() {
        let mut multi = sample_sdf();
        multi.push_str(&sample_sdf());
        let system = read(Cursor::new(multi)).expect("parse sdf");
        assert_eq!(system.atom_count(), 4);
        assert_eq!(system.bond_count(), 3);
        assert_eq!(system.atoms[1], Atom::new(Element::O, [1.09, 0.0, 0.0]));
    }

    #[test]
    fn errors_on_v3000() {
        let mut sdf = sample_sdf();
        sdf = sdf.replace("V2000", "V3000");
        let err = read(Cursor::new(sdf)).expect_err("V3000 should be rejected");
        match err {
            Error::Parse { format, .. } => assert_eq!(format, Format::Sdf),
            other => panic!("unexpected error: {other:?}"),
        }
    }
}