chemx-ext 0.3.0

External-program interfaces (xtb, CREST) and the RDKit-free fallback conformer generator for chemx.
Documentation
//! Subprocess interface to the external `crest` binary (conformer/rotamer
//! ensemble sampling).
//!
//! Same pattern as [`crate::xtb`]: detect the binary on `PATH` or via
//! `CHEMX_CREST_PATH` ([`find_crest`]); when absent, entry points return
//! [`ExtError::BinaryNotFound`]. The argument builder and the
//! `crest_conformers.xyz` parser ([`parse_crest_ensemble`]) are pure functions
//! tested against a vendored fixture (see `PROVENANCE.md`).

use std::path::{Path, PathBuf};

use chemx_core::Molecule;

use crate::ExtError;
use crate::ensemble::{Conformer, Ensemble};
use crate::xtb::XtbMethod;
use crate::xyz::write_xyz;

/// Override environment variable for the crest executable path.
pub const CREST_PATH_ENV: &str = "CHEMX_CREST_PATH";

/// Specification of a CREST run.
#[derive(Debug, Clone)]
pub struct CrestInput {
    /// Method (CREST drives xtb; GFN2-xTB or GFN-FF).
    pub method: XtbMethod,
    /// Net molecular charge (`--chrg`).
    pub charge: i32,
    /// Number of unpaired electrons (`--uhf`), multiplicity − 1.
    pub n_unpaired: u32,
    /// Optional ALPB implicit solvent (`--alpb <name>`).
    pub alpb: Option<String>,
    /// Optional thread count (`-T <n>`).
    pub threads: Option<usize>,
}

impl CrestInput {
    /// Build from a molecule.
    pub fn from_molecule(method: XtbMethod, molecule: &Molecule, alpb: Option<String>) -> Self {
        Self {
            method,
            charge: molecule.charge,
            n_unpaired: molecule.multiplicity.saturating_sub(1),
            alpb,
            threads: None,
        }
    }
}

/// Assemble the crest argument vector: `crest <xyz> --gfn2|--gfnff --chrg C
/// --uhf N [--alpb S] [-T n]`.
pub fn crest_args(input: &CrestInput, xyz_file: &str) -> Vec<String> {
    let mut args = vec![xyz_file.to_string()];
    match input.method {
        XtbMethod::Gfn2Xtb => args.push("--gfn2".into()),
        XtbMethod::GfnFf => args.push("--gfnff".into()),
    }
    args.push("--chrg".into());
    args.push(input.charge.to_string());
    args.push("--uhf".into());
    args.push(input.n_unpaired.to_string());
    if let Some(s) = &input.alpb {
        args.push("--alpb".into());
        args.push(s.clone());
    }
    if let Some(t) = input.threads {
        args.push("-T".into());
        args.push(t.to_string());
    }
    args
}

/// Locate the crest binary (`CHEMX_CREST_PATH` or `PATH`).
pub fn find_crest() -> Result<PathBuf, ExtError> {
    crate::xtb::find_binary(
        "crest",
        CREST_PATH_ENV,
        "https://github.com/crest-lab/crest",
        "crest",
    )
}

/// Run CREST on `molecule` in `workdir` and parse `crest_conformers.xyz` into an
/// [`Ensemble`]. Requires the binary; returns [`ExtError::BinaryNotFound`] if
/// absent. The only process-spawning function (covered by a binary-gated
/// `#[ignore]`d test).
pub fn run(molecule: &Molecule, input: &CrestInput, workdir: &Path) -> Result<Ensemble, ExtError> {
    let exe = find_crest()?;
    std::fs::create_dir_all(workdir).map_err(|e| ExtError::io("creating crest workdir", e))?;
    let xyz_name = "chemx_crest_input.xyz";
    std::fs::write(workdir.join(xyz_name), write_xyz(molecule, "chemx")?)
        .map_err(|e| ExtError::io("writing crest input xyz", e))?;

    let args = crest_args(input, xyz_name);
    let output = std::process::Command::new(&exe)
        .args(&args)
        .current_dir(workdir)
        .output()
        .map_err(|e| ExtError::io(format!("spawning {}", exe.display()), e))?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(ExtError::SubprocessFailed {
            program: "crest",
            status: output.status.to_string(),
            stderr_tail: stderr
                .chars()
                .rev()
                .take(800)
                .collect::<String>()
                .chars()
                .rev()
                .collect(),
        });
    }

    let ens_path = workdir.join("crest_conformers.xyz");
    let text = std::fs::read_to_string(&ens_path).map_err(|_| ExtError::MissingOutput {
        program: "crest",
        path: ens_path,
    })?;
    parse_crest_ensemble(&text, molecule.charge, molecule.multiplicity)
}

/// Parse a `crest_conformers.xyz` (or `crest_rotamers.xyz`/`crest_best.xyz`)
/// multi-frame XYZ into an [`Ensemble`]. Each frame's comment line is the total
/// energy in **hartree**; CREST writes them ascending. The supplied
/// charge/multiplicity are attached to each frame.
pub fn parse_crest_ensemble(
    text: &str,
    charge: i32,
    multiplicity: u32,
) -> Result<Ensemble, ExtError> {
    let frames = crate::xyz::parse_multi_xyz(text)?;
    let mut conformers = Vec::with_capacity(frames.len());
    for (mol, comment) in frames {
        let energy = comment
            .split_whitespace()
            .next()
            .and_then(|t| t.replace(['D', 'd'], "E").parse::<f64>().ok())
            .ok_or_else(|| ExtError::Parse {
                what: "crest_conformers.xyz",
                message: format!("comment line has no leading energy: {comment:?}"),
            })?;
        conformers.push(Conformer {
            molecule: mol.with_charge(charge).with_multiplicity(multiplicity),
            energy,
        });
    }
    Ok(Ensemble::new(conformers))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `crest_conformers.xyz` fixture (CREST 2.12; butane-like, 3 conformers,
    /// energies in hartree ascending). See PROVENANCE.md.
    const ENSEMBLE_FIXTURE: &str = "\
4
       -19.94120000
C     -1.50000000    -0.20000000     0.00000000
C     -0.50000000     0.50000000     0.00000000
C      0.50000000    -0.50000000     0.00000000
C      1.50000000     0.20000000     0.00000000
4
       -19.94050000
C     -1.50000000    -0.20000000     0.00000000
C     -0.50000000     0.50000000     0.00000000
C      0.50000000    -0.50000000     0.00000000
C      1.40000000     0.20000000     0.80000000
4
       -19.94050000
C     -1.50000000    -0.20000000     0.00000000
C     -0.50000000     0.50000000     0.00000000
C      0.50000000    -0.50000000     0.00000000
C      1.40000000     0.20000000    -0.80000000
";

    #[test]
    fn args_gfn2_alpb_threads() {
        let input = CrestInput {
            method: XtbMethod::Gfn2Xtb,
            charge: 0,
            n_unpaired: 0,
            alpb: Some("water".into()),
            threads: Some(4),
        };
        let args = crest_args(&input, "in.xyz");
        assert_eq!(
            args,
            vec![
                "in.xyz", "--gfn2", "--chrg", "0", "--uhf", "0", "--alpb", "water", "-T", "4"
            ]
        );
    }

    #[test]
    fn parse_ensemble_fixture() {
        let ens = parse_crest_ensemble(ENSEMBLE_FIXTURE, 0, 1).unwrap();
        assert_eq!(ens.len(), 3);
        // Sorted ascending: lowest first.
        assert!((ens.min_energy().unwrap() - (-19.9412)).abs() < 1e-8);
        let rel = ens.relative_energies();
        assert!(rel[0].abs() < 1e-12);
        assert!(rel[1] > 0.0);
        // Two degenerate gauche forms (equal energy) → equal Boltzmann weight.
        let w = ens.boltzmann_weights(298.15);
        assert!((w[1] - w[2]).abs() < 1e-12);
        // The lowest (anti) conformer dominates.
        assert!(w[0] > w[1]);
    }

    #[test]
    fn parse_ensemble_bad_comment_errors() {
        let bad = "2\nnot-a-number\nH 0 0 0\nH 0 0 0.74\n";
        assert!(parse_crest_ensemble(bad, 0, 1).is_err());
    }

    /// Real-subprocess test, gated on binary detection AND `#[ignore]`: runs an
    /// actual CREST search on a tiny system only when crest is installed; skips
    /// cleanly otherwise.
    #[test]
    #[ignore = "requires the external crest binary; run with --include-ignored"]
    fn real_crest_ensemble() {
        if find_crest().is_err() {
            eprintln!("crest not found — skipping real-subprocess test");
            return;
        }
        let mol = Molecule::from_xyz("4\nbutane-frag\nC 0 0 0\nC 1.5 0 0\nC 3.0 0 0\nC 4.5 0 0\n")
            .unwrap();
        let input = CrestInput::from_molecule(XtbMethod::Gfn2Xtb, &mol, None);
        let workdir = std::env::temp_dir().join("chemx_crest_realtest");
        let ens = run(&mol, &input, &workdir).expect("crest run");
        assert!(!ens.is_empty());
    }

    #[test]
    fn find_crest_absent_named_error() {
        unsafe {
            std::env::set_var(CREST_PATH_ENV, "/nonexistent/crest/xyzzy");
        }
        match find_crest() {
            Err(ExtError::BinaryNotFound { program, .. }) => assert_eq!(program, "crest"),
            Err(other) => panic!("unexpected: {other}"),
            Ok(_) => {}
        }
        unsafe {
            std::env::remove_var(CREST_PATH_ENV);
        }
    }
}