dreid-forge 0.4.1

A pure Rust library and CLI that automates DREIDING force field parameterization by orchestrating structure repair, topology perception, and charge calculation for both biological and chemical systems.
Documentation
use crate::io::{Format, error::Error, util};
use crate::model::{
    atom::Atom,
    system::{Bond, System},
    types::BondOrder,
};
use std::collections::HashMap;
use std::io::BufRead;

pub fn read<R: BufRead>(reader: R) -> Result<System, Error> {
    let lines = collect_lines(reader)?;

    let mol_idx = find_section(&lines, "@<TRIPOS>MOLECULE")
        .ok_or_else(|| Error::parse(Format::Mol2, 1, "missing @<TRIPOS>MOLECULE section"))?;

    let mut cursor = mol_idx + 1;
    let _name = next_data_line(&lines, &mut cursor).unwrap_or_else(|| (0, String::from("MOL2")));

    let (count_line_no, count_line) = next_data_line(&lines, &mut cursor)
        .ok_or_else(|| Error::parse(Format::Mol2, cursor + 1, "missing counts line"))?;
    let (atom_count, bond_count) = parse_counts(&count_line, count_line_no)?;

    let atom_section = find_section(&lines, "@<TRIPOS>ATOM")
        .ok_or_else(|| Error::parse(Format::Mol2, cursor + 1, "missing @<TRIPOS>ATOM section"))?;
    let bond_section = find_section(&lines, "@<TRIPOS>BOND").unwrap_or(lines.len());

    let (atoms, id_map) = parse_atoms(&lines, atom_section + 1, bond_section, atom_count)?;
    let bonds = parse_bonds(&lines, bond_section + 1, atom_count, bond_count, &id_map)?;

    Ok(System {
        atoms,
        bonds,
        box_vectors: None,
        bio_metadata: None,
    })
}

fn collect_lines<R: BufRead>(reader: R) -> Result<Vec<(usize, String)>, Error> {
    reader
        .lines()
        .enumerate()
        .map(|(i, line)| {
            line.map(|v| (i + 1, v))
                .map_err(|e| Error::Io { source: e })
        })
        .collect()
}

fn find_section(lines: &[(usize, String)], name: &str) -> Option<usize> {
    lines
        .iter()
        .position(|(_, line)| line.trim().eq_ignore_ascii_case(name))
}

fn next_data_line(lines: &[(usize, String)], cursor: &mut usize) -> Option<(usize, String)> {
    while *cursor < lines.len() {
        let (ln, content) = &lines[*cursor];
        *cursor += 1;
        let trimmed = content.trim();
        if trimmed.is_empty() || trimmed.starts_with('#') {
            continue;
        }
        return Some((*ln, content.clone()));
    }
    None
}

fn parse_counts(line: &str, line_no: usize) -> Result<(usize, usize), Error> {
    let parts: Vec<_> = line.split_whitespace().collect();
    if parts.len() < 2 {
        return Err(Error::parse(
            Format::Mol2,
            line_no,
            "counts line must have at least atom and bond counts",
        ));
    }
    let atoms = parts[0]
        .parse::<usize>()
        .map_err(|_| Error::parse(Format::Mol2, line_no, "invalid atom count in counts line"))?;
    let bonds = parts[1]
        .parse::<usize>()
        .map_err(|_| Error::parse(Format::Mol2, line_no, "invalid bond count in counts line"))?;
    Ok((atoms, bonds))
}

fn parse_atoms(
    lines: &[(usize, String)],
    start: usize,
    end: usize,
    expected: usize,
) -> Result<(Vec<Atom>, HashMap<usize, usize>), Error> {
    let mut atoms = Vec::with_capacity(expected);
    let mut id_map = HashMap::new();

    for idx in 0..expected {
        let line_idx = start + idx;
        if line_idx >= end {
            return Err(Error::parse(
                Format::Mol2,
                lines.last().map(|(ln, _)| *ln).unwrap_or(0),
                "ATOM section ended before expected atom count",
            ));
        }
        let (ln, raw) = &lines[line_idx];
        let parts: Vec<_> = raw.split_whitespace().collect();
        if parts.len() < 6 {
            return Err(Error::parse(Format::Mol2, *ln, "invalid ATOM line"));
        }

        let atom_id = parts[0]
            .parse::<usize>()
            .map_err(|_| Error::parse(Format::Mol2, *ln, "invalid atom id in ATOM line"))?;
        let x = parts[2]
            .parse::<f64>()
            .map_err(|_| Error::parse(Format::Mol2, *ln, "invalid x coordinate in ATOM line"))?;
        let y = parts[3]
            .parse::<f64>()
            .map_err(|_| Error::parse(Format::Mol2, *ln, "invalid y coordinate in ATOM line"))?;
        let z = parts[4]
            .parse::<f64>()
            .map_err(|_| Error::parse(Format::Mol2, *ln, "invalid z coordinate in ATOM line"))?;

        let element = util::guess_element_symbol(parts[5])
            .or_else(|| util::guess_element_symbol(parts[1]))
            .ok_or_else(|| Error::parse(Format::Mol2, *ln, "unable to infer element"))?;

        id_map.insert(atom_id, atoms.len());
        atoms.push(Atom::new(element, [x, y, z]));
    }

    Ok((atoms, id_map))
}

fn parse_bonds(
    lines: &[(usize, String)],
    start: usize,
    atom_count: usize,
    expected: usize,
    id_map: &HashMap<usize, usize>,
) -> Result<Vec<Bond>, Error> {
    let mut bonds = Vec::with_capacity(expected);

    for idx in 0..expected {
        let line_idx = start + idx;
        if line_idx >= lines.len() {
            return Err(Error::parse(
                Format::Mol2,
                lines.last().map(|(ln, _)| *ln).unwrap_or(0),
                "BOND section ended before expected bond count",
            ));
        }
        let (ln, raw) = &lines[line_idx];
        let parts: Vec<_> = raw.split_whitespace().collect();
        if parts.len() < 4 {
            return Err(Error::parse(Format::Mol2, *ln, "invalid BOND line"));
        }

        let a1 = parts[1]
            .parse::<usize>()
            .map_err(|_| Error::parse(Format::Mol2, *ln, "invalid first atom id in BOND line"))?;
        let a2 = parts[2]
            .parse::<usize>()
            .map_err(|_| Error::parse(Format::Mol2, *ln, "invalid second atom id in BOND line"))?;

        let order: BondOrder = util::bond_order_from_mol2(parts[3])
            .ok_or_else(|| Error::parse(Format::Mol2, *ln, "unsupported bond type in BOND line"))?;

        let i = *id_map
            .get(&a1)
            .ok_or_else(|| Error::parse(Format::Mol2, *ln, "bond references unknown atom id"))?;
        let j = *id_map
            .get(&a2)
            .ok_or_else(|| Error::parse(Format::Mol2, *ln, "bond references unknown atom id"))?;

        if i >= atom_count || j >= atom_count {
            return Err(Error::parse(
                Format::Mol2,
                *ln,
                "bond references atom beyond declared count",
            ));
        }

        bonds.push(Bond::new(i, j, order));
    }

    Ok(bonds)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::{atom::Atom, types::Element};
    use std::io::Cursor;

    fn sample_mol2() -> String {
        r#"@<TRIPOS>MOLECULE
        benzene
        12 12 0 0 0
        SMALL
        NO_CHARGES
        ****

        @<TRIPOS>ATOM
                1 C1         -0.7600    1.1691   -0.0005 C.ar    1  BENZENE       0.000
                2 C2          0.6329    1.2447   -0.0012 C.ar    1  BENZENE       0.000
                3 C3          1.3947    0.0765    0.0004 C.ar    1  BENZENE       0.000
                4 C4          0.7641   -1.1677    0.0027 C.ar    1  BENZENE       0.000
                5 C5         -0.6288   -1.2432    0.0001 C.ar    1  BENZENE       0.000
                6 C6         -1.3907   -0.0751   -0.0015 C.ar    1  BENZENE       0.000
                7 H7         -1.3536    2.0792    0.0005 H       1  BENZENE       0.000
                8 H8          1.1243    2.2140   -0.0028 H       1  BENZENE       0.000
                9 H9          2.4799    0.1355   -0.0000 H       1  BENZENE       0.000
               10 H10         1.3576   -2.0778    0.0063 H       1  BENZENE       0.000
               11 H11        -1.1202   -2.2126   -0.0005 H       1  BENZENE       0.000
               12 H12        -2.4759   -0.1340   -0.0035 H       1  BENZENE       0.000
        @<TRIPOS>BOND
                1     1     2   ar
                2     2     3   ar
                3     3     4   ar
                4     4     5   ar
                5     5     6   ar
                6     1     6   ar
                7     1     7    1
                8     2     8    1
                9     3     9    1
               10     4    10    1
               11     5    11    1
               12     6    12    1
        "#
        .to_string()
    }

    #[test]
    fn parses_atoms_and_bonds() {
        let system = read(Cursor::new(sample_mol2())).expect("parse mol2");
        assert_eq!(system.atom_count(), 12);
        assert_eq!(system.bond_count(), 12);
        assert_eq!(
            system.atoms[0],
            Atom::new(Element::C, [-0.7600, 1.1691, -0.0005])
        );
        assert_eq!(system.bonds[0].i, 0);
        assert_eq!(system.bonds[0].j, 1);
        assert_eq!(system.bonds[0].order, BondOrder::Aromatic);
        assert_eq!(system.bonds[6].order, BondOrder::Single);
    }

    #[test]
    fn errors_on_unknown_bond_type() {
        let mut bad = sample_mol2();
        bad = bad.replace("ar", "xx");
        let err = read(Cursor::new(bad)).expect_err("bond type should fail");
        match err {
            Error::Parse { format, .. } => assert_eq!(format, Format::Mol2),
            other => panic!("unexpected error: {other:?}"),
        }
    }
}