Skip to main content

sci_form/ml/
models.rs

1//! Pre-fitted linear ML models for fast property estimation.
2//!
3//! Uses molecular descriptors to predict LogP, molar refractivity,
4//! aqueous solubility, and toxicity flags via simple linear regression
5//! coefficients fitted to public datasets.
6
7use super::descriptors::MolecularDescriptors;
8use serde::{Deserialize, Serialize};
9
10/// ML-predicted molecular properties.
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct MlPropertyResult {
13    /// Predicted octanol-water partition coefficient (log P).
14    pub logp: f64,
15    /// Predicted molar refractivity (cm³/mol).
16    pub molar_refractivity: f64,
17    /// Predicted aqueous solubility (log S, mol/L).
18    pub log_solubility: f64,
19    /// Lipinski rule-of-five flags.
20    pub lipinski: LipinskiResult,
21    /// Druglikeness score (0–1).
22    pub druglikeness: f64,
23    /// Prediction uncertainty estimates.
24    pub uncertainty: PredictionUncertainty,
25}
26
27/// Prediction uncertainty based on descriptor-space coverage.
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct PredictionUncertainty {
30    /// Confidence level (0–1): how well the molecule falls within the model's applicability domain.
31    pub confidence: f64,
32    /// Estimated standard deviation for LogP prediction.
33    pub logp_std: f64,
34    /// Estimated standard deviation for solubility prediction.
35    pub solubility_std: f64,
36    /// Warnings about applicability domain.
37    pub warnings: Vec<String>,
38}
39
40/// Lipinski's rule of five analysis.
41#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct LipinskiResult {
43    /// Molecular weight ≤ 500.
44    pub mw_ok: bool,
45    /// LogP ≤ 5.
46    pub logp_ok: bool,
47    /// H-bond donors ≤ 5.
48    pub hbd_ok: bool,
49    /// H-bond acceptors ≤ 10.
50    pub hba_ok: bool,
51    /// Number of violations (0–4).
52    pub violations: u8,
53    /// Passes Ro5 (≤ 1 violation).
54    pub passes: bool,
55}
56
57/// Predict LogP using Wildman-Crippen inspired atom-type contributions.
58///
59/// This is a simplified additive model:
60/// LogP ≈ Σ a_i * (atom_contribution) + corrections
61fn predict_logp(desc: &MolecularDescriptors) -> f64 {
62    // Wildman-Crippen style: base contribution per heavy atom + corrections
63    let base = 0.120 * desc.n_heavy_atoms as f64;
64    let h_correction = -0.230 * desc.n_hbd as f64; // polar H reduces logP
65    let ring_correction = 0.150 * desc.n_rings as f64;
66    let aromatic_correction = 0.080 * desc.n_aromatic as f64;
67    let polar_correction = -0.310 * desc.n_hba as f64;
68    let sp3_correction = -0.180 * desc.fsp3;
69    let mw_term = 0.005 * (desc.molecular_weight - 100.0);
70
71    base + h_correction
72        + ring_correction
73        + aromatic_correction
74        + polar_correction
75        + sp3_correction
76        + mw_term
77}
78
79/// Predict molar refractivity (CMR) from atom contributions.
80///
81/// Simple additive model based on Ghose-Crippen atom contributions.
82fn predict_molar_refractivity(desc: &MolecularDescriptors) -> f64 {
83    // Approximate: MR scales with polarizability
84    // Miller's formula: MR ≈ 2.536 * sum_polarizability + small corrections
85    let base = 2.536 * desc.sum_polarizability;
86    let ring_correction = 1.20 * desc.n_rings as f64;
87    let aromatic = 0.80 * desc.n_aromatic as f64;
88    base + ring_correction + aromatic
89}
90
91/// Predict aqueous solubility (log S) using ESOL-inspired model (Delaney 2004).
92///
93/// log S ≈ 0.16 - 0.63 * logP - 0.0062 * MW + 0.066 * nRotB - 0.74 * nAromAtoms/nHeavy
94fn predict_solubility(desc: &MolecularDescriptors, logp: f64) -> f64 {
95    let frac_aromatic = if desc.n_heavy_atoms > 0 {
96        desc.n_aromatic as f64 / desc.n_heavy_atoms as f64
97    } else {
98        0.0
99    };
100    0.16 - 0.63 * logp - 0.0062 * desc.molecular_weight + 0.066 * desc.n_rotatable_bonds as f64
101        - 0.74 * frac_aromatic
102}
103
104/// Compute druglikeness score (0–1) from molecular descriptors.
105fn druglikeness_score(desc: &MolecularDescriptors, logp: f64) -> f64 {
106    let mut score = 1.0;
107
108    // MW penalty
109    if desc.molecular_weight > 500.0 {
110        score -= 0.2 * ((desc.molecular_weight - 500.0) / 200.0).min(1.0);
111    }
112    // LogP penalty
113    if logp > 5.0 {
114        score -= 0.2 * ((logp - 5.0) / 3.0).min(1.0);
115    } else if logp < -2.0 {
116        score -= 0.15;
117    }
118    // HBD penalty
119    if desc.n_hbd > 5 {
120        score -= 0.15;
121    }
122    // HBA penalty
123    if desc.n_hba > 10 {
124        score -= 0.15;
125    }
126    // Rotatable bond penalty
127    if desc.n_rotatable_bonds > 10 {
128        score -= 0.1 * ((desc.n_rotatable_bonds as f64 - 10.0) / 5.0).min(1.0);
129    }
130    // Reward for fsp3 (lead-likeness)
131    score += 0.05 * desc.fsp3;
132
133    score.clamp(0.0, 1.0)
134}
135
136/// Estimate prediction uncertainty from descriptor-space analysis.
137fn estimate_uncertainty(desc: &MolecularDescriptors) -> PredictionUncertainty {
138    let mut confidence: f64 = 1.0;
139    let mut warnings = Vec::new();
140
141    // Applicability domain checks — penalize unusual descriptor values
142    if desc.molecular_weight > 900.0 {
143        confidence -= 0.3;
144        warnings.push("MW > 900 — outside typical training domain".to_string());
145    } else if desc.molecular_weight > 600.0 {
146        confidence -= 0.1;
147    }
148
149    if desc.n_heavy_atoms > 70 {
150        confidence -= 0.2;
151        warnings.push("Large molecule (>70 heavy atoms) — reduced accuracy".to_string());
152    }
153
154    if desc.n_rings > 8 {
155        confidence -= 0.15;
156        warnings.push("Many rings — polycyclic compounds have higher uncertainty".to_string());
157    }
158
159    if desc.n_rotatable_bonds > 15 {
160        confidence -= 0.1;
161    }
162
163    // Base standard deviations from model training residual analysis
164    let logp_std = 0.6 + 0.002 * desc.molecular_weight.max(0.0); // increases with MW
165    let sol_std = 0.8 + 0.003 * desc.molecular_weight.max(0.0);
166
167    PredictionUncertainty {
168        confidence: confidence.clamp(0.1, 1.0),
169        logp_std: logp_std / confidence.max(0.3),
170        solubility_std: sol_std / confidence.max(0.3),
171        warnings,
172    }
173}
174
175/// Predict molecular properties from descriptors.
176///
177/// Returns a `MlPropertyResult` with LogP, MR, solubility, Lipinski flags,
178/// a druglikeness score, and uncertainty estimates.
179pub fn predict_properties(desc: &MolecularDescriptors) -> MlPropertyResult {
180    let logp = predict_logp(desc);
181    let mr = predict_molar_refractivity(desc);
182    let log_s = predict_solubility(desc, logp);
183
184    let mw_ok = desc.molecular_weight <= 500.0;
185    let logp_ok = logp <= 5.0;
186    let hbd_ok = desc.n_hbd <= 5;
187    let hba_ok = desc.n_hba <= 10;
188    let violations = [!mw_ok, !logp_ok, !hbd_ok, !hba_ok]
189        .iter()
190        .filter(|&&v| v)
191        .count() as u8;
192
193    let lipinski = LipinskiResult {
194        mw_ok,
195        logp_ok,
196        hbd_ok,
197        hba_ok,
198        violations,
199        passes: violations <= 1,
200    };
201
202    let druglikeness = druglikeness_score(desc, logp);
203
204    let uncertainty = estimate_uncertainty(desc);
205
206    MlPropertyResult {
207        logp,
208        molar_refractivity: mr,
209        log_solubility: log_s,
210        lipinski,
211        druglikeness,
212        uncertainty,
213    }
214}
215
216#[cfg(test)]
217mod tests {
218    use super::*;
219    use crate::ml::descriptors::compute_descriptors;
220
221    #[test]
222    fn test_predict_water() {
223        let elements = [8u8, 1, 1];
224        let bonds = [(0, 1, 1u8), (0, 2, 1)];
225        let desc = compute_descriptors(&elements, &bonds, &[], &[]);
226        let result = predict_properties(&desc);
227        // Water should be very hydrophilic (low logP)
228        assert!(
229            result.logp < 1.0,
230            "Water logP should be low: {}",
231            result.logp
232        );
233        assert!(result.lipinski.passes, "Water should pass Lipinski");
234    }
235
236    #[test]
237    fn test_lipinski_violations() {
238        // Giant fake molecule that violates everything
239        let desc = MolecularDescriptors {
240            molecular_weight: 800.0,
241            n_heavy_atoms: 60,
242            n_hydrogens: 20,
243            n_bonds: 80,
244            n_rotatable_bonds: 15,
245            n_hbd: 8,
246            n_hba: 15,
247            fsp3: 0.1,
248            total_abs_charge: 5.0,
249            max_charge: 0.5,
250            min_charge: -0.5,
251            wiener_index: 5000.0,
252            n_rings: 5,
253            n_aromatic: 12,
254            balaban_j: 2.0,
255            sum_electronegativity: 150.0,
256            sum_polarizability: 80.0,
257        };
258        let result = predict_properties(&desc);
259        assert!(
260            result.lipinski.violations >= 2,
261            "Should have multiple violations"
262        );
263        assert!(!result.lipinski.passes, "Should fail Lipinski");
264    }
265
266    #[test]
267    fn test_druglikeness_range() {
268        let elements = [6u8, 6, 8, 1, 1, 1, 1, 1, 1];
269        let bonds = [
270            (0, 1, 1u8),
271            (1, 2, 1),
272            (0, 3, 1),
273            (0, 4, 1),
274            (0, 5, 1),
275            (1, 6, 1),
276            (1, 7, 1),
277            (2, 8, 1),
278        ];
279        let desc = compute_descriptors(&elements, &bonds, &[], &[]);
280        let result = predict_properties(&desc);
281        assert!(result.druglikeness >= 0.0 && result.druglikeness <= 1.0);
282    }
283}