Skip to main content

sci_form/ml/
models.rs

1//! Pre-fitted linear ML models for fast property estimation.
2//!
3//! Uses molecular descriptors to predict LogP, molar refractivity,
4//! aqueous solubility, and toxicity flags via simple linear regression
5//! coefficients fitted to public datasets.
6
7use super::descriptors::MolecularDescriptors;
8use serde::{Deserialize, Serialize};
9
10/// ML-predicted molecular properties.
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct MlPropertyResult {
13    /// Predicted octanol-water partition coefficient (log P).
14    pub logp: f64,
15    /// Predicted molar refractivity (cm³/mol).
16    pub molar_refractivity: f64,
17    /// Predicted aqueous solubility (log S, mol/L).
18    pub log_solubility: f64,
19    /// Lipinski rule-of-five flags.
20    pub lipinski: LipinskiResult,
21    /// Druglikeness score (0–1).
22    pub druglikeness: f64,
23}
24
25/// Lipinski's rule of five analysis.
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct LipinskiResult {
28    /// Molecular weight ≤ 500.
29    pub mw_ok: bool,
30    /// LogP ≤ 5.
31    pub logp_ok: bool,
32    /// H-bond donors ≤ 5.
33    pub hbd_ok: bool,
34    /// H-bond acceptors ≤ 10.
35    pub hba_ok: bool,
36    /// Number of violations (0–4).
37    pub violations: u8,
38    /// Passes Ro5 (≤ 1 violation).
39    pub passes: bool,
40}
41
42/// Predict LogP using Wildman-Crippen inspired atom-type contributions.
43///
44/// This is a simplified additive model:
45/// LogP ≈ Σ a_i * (atom_contribution) + corrections
46fn predict_logp(desc: &MolecularDescriptors) -> f64 {
47    // Wildman-Crippen style: base contribution per heavy atom + corrections
48    let base = 0.120 * desc.n_heavy_atoms as f64;
49    let h_correction = -0.230 * desc.n_hbd as f64; // polar H reduces logP
50    let ring_correction = 0.150 * desc.n_rings as f64;
51    let aromatic_correction = 0.080 * desc.n_aromatic as f64;
52    let polar_correction = -0.310 * desc.n_hba as f64;
53    let sp3_correction = -0.180 * desc.fsp3;
54    let mw_term = 0.005 * (desc.molecular_weight - 100.0);
55
56    base + h_correction
57        + ring_correction
58        + aromatic_correction
59        + polar_correction
60        + sp3_correction
61        + mw_term
62}
63
64/// Predict molar refractivity (CMR) from atom contributions.
65///
66/// Simple additive model based on Ghose-Crippen atom contributions.
67fn predict_molar_refractivity(desc: &MolecularDescriptors) -> f64 {
68    // Approximate: MR scales with polarizability
69    // Miller's formula: MR ≈ 2.536 * sum_polarizability + small corrections
70    let base = 2.536 * desc.sum_polarizability;
71    let ring_correction = 1.20 * desc.n_rings as f64;
72    let aromatic = 0.80 * desc.n_aromatic as f64;
73    base + ring_correction + aromatic
74}
75
76/// Predict aqueous solubility (log S) using ESOL-inspired model (Delaney 2004).
77///
78/// log S ≈ 0.16 - 0.63 * logP - 0.0062 * MW + 0.066 * nRotB - 0.74 * nAromAtoms/nHeavy
79fn predict_solubility(desc: &MolecularDescriptors, logp: f64) -> f64 {
80    let frac_aromatic = if desc.n_heavy_atoms > 0 {
81        desc.n_aromatic as f64 / desc.n_heavy_atoms as f64
82    } else {
83        0.0
84    };
85    0.16 - 0.63 * logp - 0.0062 * desc.molecular_weight + 0.066 * desc.n_rotatable_bonds as f64
86        - 0.74 * frac_aromatic
87}
88
89/// Compute druglikeness score (0–1) from molecular descriptors.
90fn druglikeness_score(desc: &MolecularDescriptors, logp: f64) -> f64 {
91    let mut score = 1.0;
92
93    // MW penalty
94    if desc.molecular_weight > 500.0 {
95        score -= 0.2 * ((desc.molecular_weight - 500.0) / 200.0).min(1.0);
96    }
97    // LogP penalty
98    if logp > 5.0 {
99        score -= 0.2 * ((logp - 5.0) / 3.0).min(1.0);
100    } else if logp < -2.0 {
101        score -= 0.15;
102    }
103    // HBD penalty
104    if desc.n_hbd > 5 {
105        score -= 0.15;
106    }
107    // HBA penalty
108    if desc.n_hba > 10 {
109        score -= 0.15;
110    }
111    // Rotatable bond penalty
112    if desc.n_rotatable_bonds > 10 {
113        score -= 0.1 * ((desc.n_rotatable_bonds as f64 - 10.0) / 5.0).min(1.0);
114    }
115    // Reward for fsp3 (lead-likeness)
116    score += 0.05 * desc.fsp3;
117
118    score.clamp(0.0, 1.0)
119}
120
121/// Predict molecular properties from descriptors.
122///
123/// Returns a `MlPropertyResult` with LogP, MR, solubility, Lipinski flags,
124/// and a druglikeness score. These are fast, approximate predictions
125/// suitable for screening workflows.
126pub fn predict_properties(desc: &MolecularDescriptors) -> MlPropertyResult {
127    let logp = predict_logp(desc);
128    let mr = predict_molar_refractivity(desc);
129    let log_s = predict_solubility(desc, logp);
130
131    let mw_ok = desc.molecular_weight <= 500.0;
132    let logp_ok = logp <= 5.0;
133    let hbd_ok = desc.n_hbd <= 5;
134    let hba_ok = desc.n_hba <= 10;
135    let violations = [!mw_ok, !logp_ok, !hbd_ok, !hba_ok]
136        .iter()
137        .filter(|&&v| v)
138        .count() as u8;
139
140    let lipinski = LipinskiResult {
141        mw_ok,
142        logp_ok,
143        hbd_ok,
144        hba_ok,
145        violations,
146        passes: violations <= 1,
147    };
148
149    let druglikeness = druglikeness_score(desc, logp);
150
151    MlPropertyResult {
152        logp,
153        molar_refractivity: mr,
154        log_solubility: log_s,
155        lipinski,
156        druglikeness,
157    }
158}
159
160#[cfg(test)]
161mod tests {
162    use super::*;
163    use crate::ml::descriptors::compute_descriptors;
164
165    #[test]
166    fn test_predict_water() {
167        let elements = [8u8, 1, 1];
168        let bonds = [(0, 1, 1u8), (0, 2, 1)];
169        let desc = compute_descriptors(&elements, &bonds, &[], &[]);
170        let result = predict_properties(&desc);
171        // Water should be very hydrophilic (low logP)
172        assert!(
173            result.logp < 1.0,
174            "Water logP should be low: {}",
175            result.logp
176        );
177        assert!(result.lipinski.passes, "Water should pass Lipinski");
178    }
179
180    #[test]
181    fn test_lipinski_violations() {
182        // Giant fake molecule that violates everything
183        let desc = MolecularDescriptors {
184            molecular_weight: 800.0,
185            n_heavy_atoms: 60,
186            n_hydrogens: 20,
187            n_bonds: 80,
188            n_rotatable_bonds: 15,
189            n_hbd: 8,
190            n_hba: 15,
191            fsp3: 0.1,
192            total_abs_charge: 5.0,
193            max_charge: 0.5,
194            min_charge: -0.5,
195            wiener_index: 5000.0,
196            n_rings: 5,
197            n_aromatic: 12,
198            balaban_j: 2.0,
199            sum_electronegativity: 150.0,
200            sum_polarizability: 80.0,
201        };
202        let result = predict_properties(&desc);
203        assert!(
204            result.lipinski.violations >= 2,
205            "Should have multiple violations"
206        );
207        assert!(!result.lipinski.passes, "Should fail Lipinski");
208    }
209
210    #[test]
211    fn test_druglikeness_range() {
212        let elements = [6u8, 6, 8, 1, 1, 1, 1, 1, 1];
213        let bonds = [
214            (0, 1, 1u8),
215            (1, 2, 1),
216            (0, 3, 1),
217            (0, 4, 1),
218            (0, 5, 1),
219            (1, 6, 1),
220            (1, 7, 1),
221            (2, 8, 1),
222        ];
223        let desc = compute_descriptors(&elements, &bonds, &[], &[]);
224        let result = predict_properties(&desc);
225        assert!(result.druglikeness >= 0.0 && result.druglikeness <= 1.0);
226    }
227}