sci-form 0.14.3

High-performance 3D molecular conformer generation using ETKDG distance geometry
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
//! Ensemble ML models with non-linear predictions and uncertainty estimates.
//!
//! Extends the base linear models with:
//! - Decision-tree-like cascading rules for LogP, solubility, pKa
//! - TPSA (Topological Polar Surface Area) descriptor
//! - Veber rule analysis for oral bioavailability
//! - BBB permeability prediction
//! - Consensus scoring with prediction confidence

use super::descriptors::MolecularDescriptors;
use serde::{Deserialize, Serialize};

/// Extended ML property predictions with uncertainty and additional properties.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnsembleResult {
    /// Predicted LogP (consensus of 3 models).
    pub logp: f64,
    /// Standard deviation across LogP models (uncertainty).
    pub logp_std: f64,
    /// Predicted aqueous solubility (log S, mol/L).
    pub log_solubility: f64,
    /// Predicted TPSA (Ų).
    pub tpsa: f64,
    /// Predicted pKa for the most acidic group.
    pub pka_acidic: Option<f64>,
    /// Predicted pKa for the most basic group.
    pub pka_basic: Option<f64>,
    /// Veber oral bioavailability rules.
    pub veber: VeberResult,
    /// BBB permeability prediction.
    pub bbb_permeable: bool,
    /// BBB permeability score (0–1).
    pub bbb_score: f64,
    /// Overall prediction confidence (0–1).
    pub confidence: f64,
}

/// Veber's rules for oral bioavailability.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VeberResult {
    /// TPSA ≤ 140 Ų.
    pub tpsa_ok: bool,
    /// Number of rotatable bonds ≤ 10.
    pub rotb_ok: bool,
    /// Passes Veber rules (both criteria met).
    pub passes: bool,
}

/// Contribution of each atom type to TPSA (Ų).
/// Ertl, P., Rohde, B., Selzer, P. (2000) — simplified fragment contributions.
fn tpsa_contribution(z: u8, n_heavy_neighbors: usize, n_h_neighbors: usize) -> f64 {
    match z {
        // Nitrogen contributions
        7 => match (n_heavy_neighbors, n_h_neighbors) {
            (1, 2) => 26.02, // -NH₂
            (1, 1) => 26.02, // =NH
            (2, 1) => 12.36, // >NH
            (2, 0) => 12.36, // =N-
            (3, 0) => 3.24,  // >N-
            (1, 0) => 23.79, // ≡N
            _ => 12.0,
        },
        // Oxygen contributions
        8 => match (n_heavy_neighbors, n_h_neighbors) {
            (0, 2) => 20.23, // H₂O
            (1, 1) => 20.23, // -OH
            (1, 0) => 17.07, // =O
            (2, 0) => 9.23,  // -O-
            _ => 15.0,
        },
        // Sulfur: small contributions
        16 => match (n_heavy_neighbors, n_h_neighbors) {
            (1, 1) => 38.80, // -SH
            (2, 0) => 25.30, // -S-
            (1, 0) => 32.00, // =S
            _ => 28.0,
        },
        // Phosphorus
        15 => 34.14,
        _ => 0.0,
    }
}

/// Compute TPSA from elements and bond connectivity.
///
/// Uses Ertl fragment-based TPSA calculation.
pub fn compute_tpsa(elements: &[u8], bonds: &[(usize, usize, u8)]) -> f64 {
    let n = elements.len();
    let mut adj: Vec<Vec<usize>> = vec![vec![]; n];
    for &(i, j, _) in bonds {
        if i < n && j < n {
            adj[i].push(j);
            adj[j].push(i);
        }
    }

    let mut tpsa = 0.0;
    for i in 0..n {
        if !matches!(elements[i], 7 | 8 | 15 | 16) {
            continue;
        }
        let n_heavy = adj[i].iter().filter(|&&j| elements[j] != 1).count();
        let n_h = adj[i].iter().filter(|&&j| elements[j] == 1).count();
        tpsa += tpsa_contribution(elements[i], n_heavy, n_h);
    }
    tpsa
}

// ─── Ensemble LogP models ────────────────────────────────────────────────────

/// Model 1: Wildman-Crippen additive (same as base model).
fn logp_model_1(desc: &MolecularDescriptors) -> f64 {
    let base = 0.120 * desc.n_heavy_atoms as f64;
    let h_corr = -0.230 * desc.n_hbd as f64;
    let ring_corr = 0.150 * desc.n_rings as f64;
    let arom_corr = 0.080 * desc.n_aromatic as f64;
    let polar_corr = -0.310 * desc.n_hba as f64;
    let sp3_corr = -0.180 * desc.fsp3;
    let mw_term = 0.005 * (desc.molecular_weight - 100.0);
    base + h_corr + ring_corr + arom_corr + polar_corr + sp3_corr + mw_term
}

/// Model 2: ALOGP-like fragment-based with non-linear corrections.
fn logp_model_2(desc: &MolecularDescriptors, tpsa: f64) -> f64 {
    // Use TPSA as a proxy for overall polarity
    let polarity_term = -0.015 * tpsa;
    let size_term = 0.008 * desc.molecular_weight;
    let hbond_term = -0.45 * (desc.n_hbd as f64 + 0.5 * desc.n_hba as f64);
    let lipophilic = 0.22 * (desc.n_heavy_atoms as f64 - desc.n_hba as f64);

    // Non-linear correction: penalty for very high polarity
    let nl_correction = if tpsa > 80.0 {
        -0.003 * (tpsa - 80.0).powi(2) / 100.0
    } else {
        0.0
    };

    size_term + polarity_term + hbond_term + lipophilic + nl_correction - 1.5
}

/// Model 3: Topological descriptor based.
fn logp_model_3(desc: &MolecularDescriptors) -> f64 {
    let chi_approx = if desc.n_bonds > 0 {
        (desc.n_bonds as f64).sqrt() / (desc.n_heavy_atoms as f64).max(1.0)
    } else {
        0.0
    };

    let wiener_term = if desc.wiener_index > 0.0 {
        0.25 * desc.wiener_index.ln()
    } else {
        0.0
    };

    let polar_penalty = -0.35 * (desc.n_hbd + desc.n_hba) as f64;
    let arom_bonus = 0.12 * desc.n_aromatic as f64;

    chi_approx + wiener_term + polar_penalty + arom_bonus - 0.8
}

// ─── pKa prediction ──────────────────────────────────────────────────────────

/// Predict acidic pKa based on functional groups.
/// Returns None if no acidic group detected.
fn predict_pka_acidic(desc: &MolecularDescriptors, tpsa: f64) -> Option<f64> {
    if desc.n_hbd == 0 {
        return None;
    }

    // Base pKa value depends on donor type
    // Approximate: carboxylic acid ~4.5, phenol ~10, aliphatic OH ~16, NH ~35
    let base_pka = if desc.n_hba >= 2 && desc.n_hbd >= 1 {
        // Likely has carboxylic acid (O-H + C=O pattern)
        4.5
    } else {
        // Generic O-H or N-H
        14.0
    };

    // Corrections
    let ew_correction =
        -0.3 * (desc.sum_electronegativity / desc.n_heavy_atoms.max(1) as f64 - 2.5);
    let arom_correction = if desc.n_aromatic > 0 { -1.5 } else { 0.0 };
    let tpsa_correction = -0.02 * (tpsa - 60.0);

    Some((base_pka + ew_correction + arom_correction + tpsa_correction).clamp(0.0, 25.0))
}

/// Predict basic pKa. Returns None if no basic group detected.
fn predict_pka_basic(desc: &MolecularDescriptors) -> Option<f64> {
    // Check for nitrogen atoms that could be basic
    let has_nitrogen =
        desc.n_hba > 0 && desc.sum_electronegativity / desc.n_heavy_atoms.max(1) as f64 > 2.6;
    if !has_nitrogen {
        return None;
    }

    // Amine base pKa: primary ~10.6, aromatic ~5
    let base_pka = if desc.n_aromatic > 0 {
        5.2 // pyridine-like
    } else {
        10.6 // aliphatic amine
    };

    let sp3_correction = 0.5 * desc.fsp3;
    Some((base_pka + sp3_correction).clamp(0.0, 14.0))
}

// ─── BBB permeability ────────────────────────────────────────────────────────

/// Predict blood-brain barrier permeability.
///
/// Lipinski-like heuristic: MW < 450, TPSA < 90, LogP ∈ (1, 5), HBD ≤ 3.
fn predict_bbb(desc: &MolecularDescriptors, logp: f64, tpsa: f64) -> (bool, f64) {
    let mut score = 1.0;

    if desc.molecular_weight > 450.0 {
        score -= 0.3 * ((desc.molecular_weight - 450.0) / 100.0).min(1.0);
    }
    if tpsa > 90.0 {
        score -= 0.35 * ((tpsa - 90.0) / 50.0).min(1.0);
    }
    if logp < 1.0 {
        score -= 0.2 * (1.0 - logp).min(1.0);
    }
    if logp > 5.0 {
        score -= 0.2 * ((logp - 5.0) / 2.0).min(1.0);
    }
    if desc.n_hbd > 3 {
        score -= 0.15 * (desc.n_hbd as f64 - 3.0).min(2.0) / 2.0;
    }

    let score = score.clamp(0.0, 1.0);
    (score > 0.5, score)
}

/// Predict molecular properties using an ensemble of models.
///
/// Combines three LogP models via consensus, adds TPSA, pKa estimates,
/// BBB permeability, and Veber bioavailability rules.
pub fn predict_ensemble(
    desc: &MolecularDescriptors,
    elements: &[u8],
    bonds: &[(usize, usize, u8)],
) -> EnsembleResult {
    let tpsa = compute_tpsa(elements, bonds);

    // Ensemble LogP: average of 3 models
    let lp1 = logp_model_1(desc);
    let lp2 = logp_model_2(desc, tpsa);
    let lp3 = logp_model_3(desc);
    let logp = (lp1 + lp2 + lp3) / 3.0;

    // Standard deviation as uncertainty proxy
    let logp_std = {
        let mean = logp;
        let var = ((lp1 - mean).powi(2) + (lp2 - mean).powi(2) + (lp3 - mean).powi(2)) / 3.0;
        var.sqrt()
    };

    // Solubility (ESOL-like, using consensus LogP)
    let frac_aromatic = if desc.n_heavy_atoms > 0 {
        desc.n_aromatic as f64 / desc.n_heavy_atoms as f64
    } else {
        0.0
    };
    let log_solubility = 0.16 - 0.63 * logp - 0.0062 * desc.molecular_weight
        + 0.066 * desc.n_rotatable_bonds as f64
        - 0.74 * frac_aromatic;

    // pKa predictions
    let pka_acidic = predict_pka_acidic(desc, tpsa);
    let pka_basic = predict_pka_basic(desc);

    // Veber rules
    let tpsa_ok = tpsa <= 140.0;
    let rotb_ok = desc.n_rotatable_bonds <= 10;
    let veber = VeberResult {
        tpsa_ok,
        rotb_ok,
        passes: tpsa_ok && rotb_ok,
    };

    // BBB
    let (bbb_permeable, bbb_score) = predict_bbb(desc, logp, tpsa);

    // Confidence: based on model agreement and descriptor coverage
    let model_agreement = 1.0 - (logp_std / 2.0).min(1.0);
    let size_confidence = if desc.n_heavy_atoms >= 3 && desc.n_heavy_atoms <= 50 {
        1.0
    } else {
        0.5
    };
    let confidence = (model_agreement * 0.7 + size_confidence * 0.3).clamp(0.0, 1.0);

    EnsembleResult {
        logp,
        logp_std,
        log_solubility,
        tpsa,
        pka_acidic,
        pka_basic,
        veber,
        bbb_permeable,
        bbb_score,
        confidence,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::ml::descriptors::compute_descriptors;

    #[test]
    fn test_tpsa_water() {
        // Water: O with 2 H neighbors
        let elements = [8u8, 1, 1];
        let bonds = [(0usize, 1usize, 1u8), (0, 2, 1)];
        let tpsa = compute_tpsa(&elements, &bonds);
        assert!(tpsa > 15.0 && tpsa < 25.0, "Water TPSA: {tpsa}");
    }

    #[test]
    fn test_tpsa_benzene() {
        // Benzene: no polar atoms → TPSA = 0
        let elements = [6u8, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1];
        let bonds: Vec<(usize, usize, u8)> = vec![
            (0, 1, 2),
            (1, 2, 1),
            (2, 3, 2),
            (3, 4, 1),
            (4, 5, 2),
            (5, 0, 1),
            (0, 6, 1),
            (1, 7, 1),
            (2, 8, 1),
            (3, 9, 1),
            (4, 10, 1),
            (5, 11, 1),
        ];
        let tpsa = compute_tpsa(&elements, &bonds);
        assert!(
            (tpsa - 0.0).abs() < 1e-6,
            "Benzene TPSA should be 0: {tpsa}"
        );
    }

    #[test]
    fn test_ensemble_ethanol() {
        let elements = [6u8, 6, 8, 1, 1, 1, 1, 1, 1];
        let bonds: Vec<(usize, usize, u8)> = vec![
            (0, 1, 1),
            (1, 2, 1),
            (0, 3, 1),
            (0, 4, 1),
            (0, 5, 1),
            (1, 6, 1),
            (1, 7, 1),
            (2, 8, 1),
        ];
        let desc = compute_descriptors(&elements, &bonds, &[], &[]);
        let result = predict_ensemble(&desc, &elements, &bonds);

        assert!(result.tpsa > 15.0, "Ethanol has polar O-H: {}", result.tpsa);
        assert!(result.logp < 2.0, "Ethanol is hydrophilic: {}", result.logp);
        assert!(result.logp_std >= 0.0, "Uncertainty must be non-negative");
        assert!(result.confidence > 0.0 && result.confidence <= 1.0);
        assert!(result.veber.passes);
    }

    #[test]
    fn test_ensemble_logp_consistency() {
        // All three models should give broadly similar results for simple molecules
        let elements = [6u8, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1];
        let bonds: Vec<(usize, usize, u8)> = vec![
            (0, 1, 1),
            (1, 2, 1),
            (0, 3, 1),
            (0, 4, 1),
            (0, 5, 1),
            (1, 6, 1),
            (1, 7, 1),
            (2, 8, 1),
            (2, 9, 1),
            (2, 10, 1),
        ];
        let desc = compute_descriptors(&elements, &bonds, &[], &[]);
        let result = predict_ensemble(&desc, &elements, &bonds);

        // Models should agree within ~2 units for small alkanes
        assert!(
            result.logp_std < 2.0,
            "Models should broadly agree: std={}",
            result.logp_std
        );
    }

    #[test]
    fn test_veber_large_molecule() {
        let desc = MolecularDescriptors {
            molecular_weight: 600.0,
            n_heavy_atoms: 45,
            n_hydrogens: 20,
            n_bonds: 60,
            n_rotatable_bonds: 15,
            n_hbd: 5,
            n_hba: 10,
            fsp3: 0.3,
            total_abs_charge: 3.0,
            max_charge: 0.4,
            min_charge: -0.4,
            wiener_index: 3000.0,
            n_rings: 4,
            n_aromatic: 8,
            balaban_j: 1.8,
            sum_electronegativity: 120.0,
            sum_polarizability: 65.0,
        };
        let elements = [6u8; 45];
        let bonds: Vec<(usize, usize, u8)> = (0..44).map(|i| (i, i + 1, 1u8)).collect();
        let result = predict_ensemble(&desc, &elements, &bonds);

        assert!(
            !result.veber.rotb_ok,
            "Too many rotatable bonds: {}",
            desc.n_rotatable_bonds
        );
    }

    #[test]
    fn test_bbb_small_lipophilic() {
        // Small, lipophilic molecule — should be BBB permeable
        let elements = [6u8, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1];
        let bonds: Vec<(usize, usize, u8)> = vec![
            (0, 1, 1),
            (1, 2, 1),
            (0, 3, 1),
            (0, 4, 1),
            (0, 5, 1),
            (1, 6, 1),
            (1, 7, 1),
            (2, 8, 1),
            (2, 9, 1),
            (2, 10, 1),
        ];
        let desc = compute_descriptors(&elements, &bonds, &[], &[]);
        let result = predict_ensemble(&desc, &elements, &bonds);
        assert!(
            result.bbb_score > 0.0,
            "Small lipophilic molecule should have positive BBB score"
        );
    }

    #[test]
    fn test_pka_with_acid() {
        // Molecule with O-H and C=O (carboxylic acid pattern)
        let desc = MolecularDescriptors {
            molecular_weight: 60.0,
            n_heavy_atoms: 3,
            n_hydrogens: 4,
            n_bonds: 6,
            n_rotatable_bonds: 0,
            n_hbd: 1,
            n_hba: 2,
            fsp3: 0.0,
            total_abs_charge: 0.5,
            max_charge: 0.2,
            min_charge: -0.3,
            wiener_index: 4.0,
            n_rings: 0,
            n_aromatic: 0,
            balaban_j: 1.0,
            sum_electronegativity: 8.0,
            sum_polarizability: 3.0,
        };
        let elements = [6u8, 8, 8, 1, 1, 1, 1];
        let bonds: Vec<(usize, usize, u8)> = vec![
            (0, 1, 2),
            (0, 2, 1),
            (2, 3, 1),
            (0, 4, 1),
            (0, 5, 1),
            (0, 6, 1),
        ];
        let result = predict_ensemble(&desc, &elements, &bonds);
        assert!(
            result.pka_acidic.is_some(),
            "Carboxylic acid should have pKa"
        );
    }
}