Skip to main content

chematic_fp/
ecfp.rs

1//! ECFP (Extended Connectivity Fingerprints) based on the Morgan algorithm.
2//!
3//! Uses FNV-1a 64-bit hashing for reproducibility and WASM-compatibility.
4
5use chematic_core::{AtomIdx, BondOrder, Molecule, implicit_hcount};
6use chematic_perception::find_sssr;
7
8use crate::bitvec::BitVec2048;
9
10const FNV_OFFSET: u64 = 14695981039346656037;
11const FNV_PRIME: u64 = 1099511628211;
12
13/// Compute the FNV-1a 64-bit hash of `bytes`.
14pub(crate) fn fnv1a(bytes: &[u8]) -> u64 {
15    let mut h = FNV_OFFSET;
16    for &b in bytes {
17        h ^= b as u64;
18        h = h.wrapping_mul(FNV_PRIME);
19    }
20    h
21}
22
23/// Hash one atom's neighbourhood at iteration `r` of the Morgan expansion.
24///
25/// Byte layout: `[r as u8, self_id (8 bytes), (bond_type (1) ++ nb_id (8))*]`
26/// Neighbours are sorted before hashing to make the result order-independent.
27fn expand_atom_id(mol: &Molecule, i: usize, r: u32, ids: &[u64]) -> u64 {
28    let idx = AtomIdx(i as u32);
29    let mut neighbor_info: Vec<(u8, u64)> = mol
30        .neighbors(idx)
31        .map(|(nb_idx, bond_idx)| (bond_type_int(mol.bond(bond_idx).order), ids[nb_idx.0 as usize]))
32        .collect();
33    neighbor_info.sort_unstable();
34
35    let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
36    bytes.push(r as u8);
37    bytes.extend_from_slice(&ids[i].to_le_bytes());
38    for (btype, nb_id) in &neighbor_info {
39        bytes.push(*btype);
40        bytes.extend_from_slice(&nb_id.to_le_bytes());
41    }
42    fnv1a(&bytes)
43}
44
45/// Compute the FNV-1a atom identifier for iteration 0 of the Morgan algorithm.
46///
47/// The six-byte invariant covers: atomic number, degree, implicit H count, formal
48/// charge (clamped to byte range), ring membership, and aromaticity.  When
49/// `use_chirality` is true an extra chirality byte is appended; this preserves
50/// bit-compatibility with the default (`use_chirality=false`) fingerprints.
51pub(crate) fn initial_atom_id(
52    mol: &Molecule,
53    idx: AtomIdx,
54    ring_set: &chematic_perception::RingSet,
55    use_chirality: bool,
56) -> u64 {
57    let atom = mol.atom(idx);
58    let charge_adjusted = (atom.charge as i16 + 8).clamp(0, 255) as u8;
59    let base_bytes = [
60        atom.element.atomic_number(),
61        mol.neighbors(idx).count().min(255) as u8,
62        implicit_hcount(mol, idx),
63        charge_adjusted,
64        ring_set.contains_atom(idx) as u8,
65        atom.aromatic as u8,
66    ];
67    if use_chirality {
68        use chematic_core::Chirality;
69        let chirality_byte = match atom.chirality {
70            Chirality::None => 0u8,
71            Chirality::CounterClockwise => 1u8,
72            Chirality::Clockwise => 2u8,
73        };
74        let mut chiral_bytes = base_bytes.to_vec();
75        chiral_bytes.push(chirality_byte);
76        fnv1a(&chiral_bytes)
77    } else {
78        fnv1a(&base_bytes)
79    }
80}
81
82/// Configuration for ECFP computation.
83#[derive(Debug, Clone)]
84pub struct EcfpConfig {
85    /// Number of iterations (radius). ECFP4 = 2, ECFP6 = 3.
86    pub radius: u32,
87    /// Output bitvector size (default 2048).
88    pub nbits: usize,
89    /// When `true`, include tetrahedral chirality in the initial atom hash so
90    /// that R and S enantiomers produce different fingerprints.
91    ///
92    /// Defaults to `false` (chirality ignored, matching RDKit's `useChirality=False`).
93    pub use_chirality: bool,
94    /// When `true`, each hash sets two bit positions (using single and double-folded hash)
95    /// to reduce bitvector collisions. This reduces collision probability but changes
96    /// fingerprint values — **not backwards-compatible** with stored fingerprints.
97    ///
98    /// Defaults to `false` (single-bit folding, current behavior preserved).
99    pub use_double_fold: bool,
100}
101
102impl Default for EcfpConfig {
103    fn default() -> Self {
104        Self {
105            radius: 2,
106            nbits: 2048,
107            use_chirality: false,
108            use_double_fold: false,
109        }
110    }
111}
112
113/// Map a `BondOrder` to the integer code used in the ECFP hash.
114///
115/// - Single / Up / Down → 1
116/// - Double            → 2
117/// - Triple            → 3
118/// - Aromatic          → 4
119/// - Quadruple         → 5  (not in standard ECFP; assigned a distinct value)
120#[inline]
121pub(crate) fn bond_type_int(order: BondOrder) -> u8 {
122    match order {
123        BondOrder::Single | BondOrder::Up | BondOrder::Down | BondOrder::Dative => 1,
124        BondOrder::Double => 2,
125        BondOrder::Triple => 3,
126        BondOrder::Aromatic => 4,
127        BondOrder::Quadruple => 5,
128        BondOrder::Zero => 0,
129        BondOrder::QueryAny => 6,
130        BondOrder::QuerySingleOrDouble => 7,
131        BondOrder::QuerySingleOrAromatic => 8,
132        BondOrder::QueryDoubleOrAromatic => 9,
133    }
134}
135
136/// Compute an ECFP fingerprint for `mol` using the given configuration.
137///
138/// # Algorithm overview
139/// 1. Compute initial atom identifiers from atomic properties.
140/// 2. Iteratively expand each identifier by incorporating neighbour identifiers
141///    (with their bond types) for `config.radius` rounds.
142/// 3. After each iteration (including iteration 0), map every identifier to a
143///    bit in the output bitvector.
144/// Maximum supported radius for `ecfp`.  Matches the cap in `morgan_fp_counts`.
145/// Beyond this, `r as u8` would silently truncate, producing hash collisions.
146pub const MAX_ECFP_RADIUS: u32 = 20;
147
148pub fn ecfp(mol: &Molecule, config: &EcfpConfig) -> BitVec2048 {
149    let n = mol.atom_count();
150    let nbits = config.nbits;
151    // Cap radius to prevent `r as u8` truncation at r > 255 (hash collision bug).
152    let config = &EcfpConfig {
153        radius: config.radius.min(MAX_ECFP_RADIUS),
154        ..*config
155    };
156    let mut fp = BitVec2048::new();
157
158    if n == 0 {
159        return fp;
160    }
161
162    let ring_set = find_sssr(mol);
163
164    // Step 1: initial atom identifiers (iteration 0).
165    let mut ids: Vec<u64> = Vec::with_capacity(n);
166    for i in 0..n {
167        let idx = AtomIdx(i as u32);
168        let id = initial_atom_id(mol, idx, &ring_set, config.use_chirality);
169        fp.set((id % nbits as u64) as usize);
170        if config.use_double_fold {
171            fp.set(((id >> 11) % nbits as u64) as usize);
172        }
173        ids.push(id);
174    }
175
176    // Step 2: iterative expansion.
177    let mut new_ids: Vec<u64> = vec![0u64; n];
178    for r in 1..=config.radius {
179        for (i, slot) in new_ids.iter_mut().enumerate() {
180            let new_id = expand_atom_id(mol, i, r, &ids);
181            *slot = new_id;
182            fp.set((new_id % nbits as u64) as usize);
183            if config.use_double_fold {
184                fp.set(((new_id >> 11) % nbits as u64) as usize);
185            }
186        }
187        core::mem::swap(&mut ids, &mut new_ids);
188    }
189
190    fp
191}
192
193/// Count-based Morgan fingerprint: returns a map of `hash → count` for all
194/// atom environments up to `radius` iterations.
195///
196/// Each (atom, iteration) pair contributes its hash to the map.  Unlike the
197/// default RDKit behavior, redundant (duplicate) environments are **not**
198/// suppressed — every atom contributes at every iteration level.
199///
200/// This corresponds to `GetMorganFingerprint(mol, radius,
201/// useFeatures=False, includeRedundantEnvironments=True)` in RDKit.
202pub fn morgan_fp_counts(mol: &Molecule, radius: u32) -> std::collections::HashMap<u64, u32> {
203    use std::collections::HashMap;
204
205    const MAX_RADIUS: u32 = 20;
206    let radius = radius.min(MAX_RADIUS);
207
208    let n = mol.atom_count();
209    let mut counts: HashMap<u64, u32> = HashMap::new();
210
211    if n == 0 {
212        return counts;
213    }
214
215    let ring_set = find_sssr(mol);
216
217    // Radius-0: initial atom identifiers.
218    let mut ids: Vec<u64> = (0..n)
219        .map(|i| initial_atom_id(mol, AtomIdx(i as u32), &ring_set, false))
220        .collect();
221
222    for &id in &ids {
223        *counts.entry(id).or_insert(0) += 1;
224    }
225
226    // Radius 1..=radius: iterative expansion (same hash scheme as ecfp).
227    let mut new_ids = vec![0u64; n];
228    for r in 1..=radius {
229        for (i, slot) in new_ids.iter_mut().enumerate() {
230            let new_id = expand_atom_id(mol, i, r, &ids);
231            *slot = new_id;
232            *counts.entry(new_id).or_insert(0) += 1;
233        }
234        core::mem::swap(&mut ids, &mut new_ids);
235    }
236
237    counts
238}
239
240/// ECFP4 fingerprint (radius = 2, 2048 bits).
241pub fn ecfp4(mol: &Molecule) -> BitVec2048 {
242    ecfp(mol, &EcfpConfig::default())
243}
244
245/// ECFP6 fingerprint (radius = 3, 2048 bits).
246pub fn ecfp6(mol: &Molecule) -> BitVec2048 {
247    ecfp(mol, &EcfpConfig { radius: 3, ..EcfpConfig::default() })
248}
249
250/// Tanimoto similarity between two molecules using ECFP4.
251pub fn tanimoto_ecfp4(a: &Molecule, b: &Molecule) -> f64 {
252    ecfp4(a).tanimoto(&ecfp4(b))
253}
254
255#[cfg(test)]
256mod tests {
257    use super::*;
258    use chematic_smiles::parse;
259
260    fn benzene() -> Molecule {
261        parse("c1ccccc1").unwrap()
262    }
263
264    fn ethane() -> Molecule {
265        parse("CC").unwrap()
266    }
267
268    fn toluene() -> Molecule {
269        parse("Cc1ccccc1").unwrap()
270    }
271
272    fn aspirin() -> Molecule {
273        // Acetylsalicylic acid
274        parse("CC(=O)Oc1ccccc1C(=O)O").unwrap()
275    }
276
277    fn methane() -> Molecule {
278        parse("C").unwrap()
279    }
280
281    fn water() -> Molecule {
282        parse("O").unwrap()
283    }
284
285    #[test]
286    fn benzene_ecfp4_nonzero() {
287        let fp = ecfp4(&benzene());
288        assert!(fp.popcount() > 0, "benzene ECFP4 must be non-zero");
289    }
290
291    #[test]
292    fn benzene_ecfp4_deterministic() {
293        let fp1 = ecfp4(&benzene());
294        let fp2 = ecfp4(&benzene());
295        assert_eq!(fp1, fp2, "ECFP4 must be deterministic");
296    }
297
298    #[test]
299    fn ethane_vs_benzene_tanimoto_lt1() {
300        let t = tanimoto_ecfp4(&ethane(), &benzene());
301        assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
302    }
303
304    #[test]
305    fn benzene_vs_benzene_tanimoto_eq1() {
306        let t = tanimoto_ecfp4(&benzene(), &benzene());
307        assert_eq!(t, 1.0, "identical molecules must have tanimoto == 1.0");
308    }
309
310    #[test]
311    fn benzene_vs_toluene_tanimoto_between() {
312        let t = tanimoto_ecfp4(&benzene(), &toluene());
313        assert!(t > 0.0, "benzene and toluene share bits (tanimoto={t})");
314        assert!(
315            t < 1.0,
316            "benzene and toluene are not identical (tanimoto={t})"
317        );
318    }
319
320    #[test]
321    fn aspirin_ecfp4_many_bits() {
322        let fp = ecfp4(&aspirin());
323        assert!(
324            fp.popcount() > 5,
325            "aspirin ECFP4 must have more than 5 bits set (got {})",
326            fp.popcount()
327        );
328    }
329
330    #[test]
331    fn ecfp6_vs_ecfp4_benzene_differ() {
332        let fp4 = ecfp4(&benzene());
333        let fp6 = ecfp6(&benzene());
334        // Larger radius explores more environment — the bit counts should differ
335        // because radius-3 adds new hash values not present at radius-2.
336        assert_ne!(
337            fp4.popcount(),
338            fp6.popcount(),
339            "ECFP6 and ECFP4 should produce different bit counts for benzene"
340        );
341    }
342
343    #[test]
344    fn methane_ecfp4_nonzero() {
345        let fp = ecfp4(&methane());
346        assert!(fp.popcount() > 0, "methane ECFP4 must be non-zero");
347    }
348
349    #[test]
350    fn water_ecfp4_nonzero() {
351        let fp = ecfp4(&water());
352        assert!(fp.popcount() > 0, "water ECFP4 must be non-zero");
353    }
354
355    #[test]
356    fn tanimoto_ecfp4_benzene_self_is_one() {
357        let t = tanimoto_ecfp4(&benzene(), &benzene());
358        assert_eq!(t, 1.0, "tanimoto_ecfp4 of identical molecules must be 1.0");
359    }
360
361    #[test]
362    fn tanimoto_ecfp4_methane_vs_benzene_lt_half() {
363        let t = tanimoto_ecfp4(&methane(), &benzene());
364        assert!(
365            t < 0.5,
366            "methane and benzene should be very dissimilar (tanimoto={t})"
367        );
368    }
369
370    // ── morgan_fp_counts ─────────────────────────────────────────────────────
371
372    #[test]
373    fn morgan_counts_radius0_atom_count() {
374        // At radius 0, one hash per atom.
375        let m = benzene();
376        let counts = morgan_fp_counts(&m, 0);
377        let total: u32 = counts.values().sum();
378        assert_eq!(
379            total,
380            m.atom_count() as u32,
381            "radius-0 total count == atom_count"
382        );
383    }
384
385    #[test]
386    fn morgan_counts_radius2_total_grows() {
387        // Each additional radius adds one hash per atom → total = n * (radius+1).
388        let m = methane();
389        let n = m.atom_count() as u32;
390        let r = 2u32;
391        let counts = morgan_fp_counts(&m, r);
392        let total: u32 = counts.values().sum();
393        assert_eq!(
394            total,
395            n * (r + 1),
396            "methane total = atom_count * (radius+1)"
397        );
398    }
399
400    #[test]
401    fn morgan_counts_benzene_symmetry() {
402        // All 6 benzene C atoms are equivalent → radius-0 yields 1 unique hash.
403        let m = benzene();
404        let counts = morgan_fp_counts(&m, 0);
405        assert_eq!(
406            counts.len(),
407            1,
408            "benzene has one unique radius-0 environment"
409        );
410        assert_eq!(
411            *counts.values().next().unwrap(),
412            6,
413            "that environment appears 6 times"
414        );
415    }
416
417    #[test]
418    fn morgan_counts_empty_mol_is_empty() {
419        use chematic_core::MoleculeBuilder;
420        let m = MoleculeBuilder::new().build();
421        let counts = morgan_fp_counts(&m, 2);
422        assert!(counts.is_empty(), "empty molecule yields empty count map");
423    }
424
425    #[test]
426    fn morgan_counts_deterministic() {
427        let m = aspirin();
428        let c1 = morgan_fp_counts(&m, 2);
429        let c2 = morgan_fp_counts(&m, 2);
430        assert_eq!(c1, c2, "morgan_fp_counts must be deterministic");
431    }
432
433    #[test]
434    fn morgan_counts_consistent_with_ecfp_bits() {
435        // Every hash in the count map should be reachable from the ecfp bit set
436        // (after folding to 2048 bits).  This checks the same hash scheme.
437        let m = toluene();
438        let fp = ecfp(
439            &m,
440            &EcfpConfig {
441                radius: 2,
442                nbits: 2048,
443                use_chirality: false,
444                use_double_fold: false,
445            },
446        );
447        let counts = morgan_fp_counts(&m, 2);
448        for &hash in counts.keys() {
449            let bit = (hash % 2048) as usize;
450            assert!(
451                fp.get(bit),
452                "bit {bit} from count map not set in ECFP bitvec"
453            );
454        }
455    }
456
457    // -- Chirality tests ------------------------------------------------------
458
459    #[test]
460    fn ecfp4_ignores_chirality_by_default() {
461        // L-alanine and D-alanine should produce the same ECFP4 when
462        // use_chirality=false (default), since chirality is not in the hash.
463        let l_ala = parse("N[C@@H](C)C(=O)O").unwrap();
464        let d_ala = parse("N[C@H](C)C(=O)O").unwrap();
465        let fp_l = ecfp4(&l_ala);
466        let fp_d = ecfp4(&d_ala);
467        assert_eq!(
468            fp_l, fp_d,
469            "L/D-alanine ECFP4 should be identical when use_chirality=false"
470        );
471    }
472
473    #[test]
474    fn ecfp4_distinguishes_enantiomers_with_chirality() {
475        // With use_chirality=true, L-alanine and D-alanine must have different FPs.
476        let l_ala = parse("N[C@@H](C)C(=O)O").unwrap();
477        let d_ala = parse("N[C@H](C)C(=O)O").unwrap();
478        let config = EcfpConfig {
479            radius: 2,
480            nbits: 2048,
481            use_chirality: true,
482            use_double_fold: false,
483        };
484        let fp_l = ecfp(&l_ala, &config);
485        let fp_d = ecfp(&d_ala, &config);
486        assert_ne!(
487            fp_l, fp_d,
488            "L/D-alanine ECFP4 must differ when use_chirality=true"
489        );
490        // Tanimoto < 1.0 confirms they are not identical.
491        assert!(
492            fp_l.tanimoto(&fp_d) < 1.0,
493            "Tanimoto of L/D-alanine must be < 1.0 with use_chirality"
494        );
495    }
496
497    #[test]
498    fn ecfp4_non_chiral_generates_with_chirality_flag() {
499        // Non-chiral molecules like benzene should generate valid ECFP with use_chirality flag.
500        // This test verifies that use_chirality=true doesn't break on achiral molecules.
501        let mol = parse("c1ccccc1").unwrap(); // Benzene — no stereo centers.
502        let config = EcfpConfig {
503            radius: 2,
504            nbits: 2048,
505            use_chirality: true,
506            use_double_fold: false,
507        };
508        let fp = ecfp(&mol, &config);
509        assert!(fp.popcount() > 0, "Benzene should generate non-empty ECFP4 with use_chirality=true");
510    }
511}