Skip to main content

chematic_fp/
ecfp.rs

1//! ECFP (Extended Connectivity Fingerprints) based on the Morgan algorithm.
2//!
3//! Uses FNV-1a 64-bit hashing for reproducibility and WASM-compatibility.
4
5use chematic_core::{AtomIdx, BondOrder, Molecule, implicit_hcount};
6use chematic_perception::find_sssr;
7
8use crate::bitvec::BitVec2048;
9
10const FNV_OFFSET: u64 = 14695981039346656037;
11const FNV_PRIME: u64 = 1099511628211;
12
13/// Compute the FNV-1a 64-bit hash of `bytes`.
14pub(crate) fn fnv1a(bytes: &[u8]) -> u64 {
15    let mut h = FNV_OFFSET;
16    for &b in bytes {
17        h ^= b as u64;
18        h = h.wrapping_mul(FNV_PRIME);
19    }
20    h
21}
22
23/// Configuration for ECFP computation.
24#[derive(Debug, Clone)]
25pub struct EcfpConfig {
26    /// Number of iterations (radius). ECFP4 = 2, ECFP6 = 3.
27    pub radius: u32,
28    /// Output bitvector size (default 2048).
29    pub nbits: usize,
30}
31
32impl Default for EcfpConfig {
33    fn default() -> Self {
34        Self { radius: 2, nbits: 2048 }
35    }
36}
37
38/// Map a `BondOrder` to the integer code used in the ECFP hash.
39///
40/// - Single / Up / Down → 1
41/// - Double            → 2
42/// - Triple            → 3
43/// - Aromatic          → 4
44/// - Quadruple         → 5  (not in standard ECFP; assigned a distinct value)
45#[inline]
46pub(crate) fn bond_type_int(order: BondOrder) -> u8 {
47    match order {
48        BondOrder::Single | BondOrder::Up | BondOrder::Down => 1,
49        BondOrder::Double => 2,
50        BondOrder::Triple => 3,
51        BondOrder::Aromatic => 4,
52        BondOrder::Quadruple => 5,
53    }
54}
55
56/// Compute an ECFP fingerprint for `mol` using the given configuration.
57///
58/// # Algorithm overview
59/// 1. Compute initial atom identifiers from atomic properties.
60/// 2. Iteratively expand each identifier by incorporating neighbour identifiers
61///    (with their bond types) for `config.radius` rounds.
62/// 3. After each iteration (including iteration 0), map every identifier to a
63///    bit in the output bitvector.
64pub fn ecfp(mol: &Molecule, config: &EcfpConfig) -> BitVec2048 {
65    let n = mol.atom_count();
66    let nbits = config.nbits;
67    let mut fp = BitVec2048::new();
68
69    if n == 0 {
70        return fp;
71    }
72
73    let ring_set = find_sssr(mol);
74
75    // Step 1: initial atom identifiers (iteration 0).
76    let mut ids: Vec<u64> = Vec::with_capacity(n);
77    for i in 0..n {
78        let idx = AtomIdx(i as u32);
79        let atom = mol.atom(idx);
80
81        // Shift formal charge by 8 so that charges in [-8, +7] map to bytes [0, 15].
82        // i16 arithmetic avoids overflow on extreme charges.
83        let charge_adjusted = (atom.charge as i16 + 8) as u8;
84        let id = fnv1a(&[
85            atom.element.atomic_number(),
86            mol.neighbors(idx).count() as u8,
87            implicit_hcount(mol, idx),
88            charge_adjusted,
89            ring_set.contains_atom(idx) as u8,
90            atom.aromatic as u8,
91        ]);
92
93        fp.set((id % nbits as u64) as usize);
94        ids.push(id);
95    }
96
97    // Step 2: iterative expansion.
98    let mut new_ids: Vec<u64> = vec![0u64; n];
99    for r in 1..=config.radius {
100        for i in 0..n {
101            let idx = AtomIdx(i as u32);
102            let mut neighbor_info: Vec<(u8, u64)> = mol
103                .neighbors(idx)
104                .map(|(nb_idx, bond_idx)| {
105                    let btype = bond_type_int(mol.bond(bond_idx).order);
106                    (btype, ids[nb_idx.0 as usize])
107                })
108                .collect();
109            neighbor_info.sort_unstable();
110
111            // Byte layout: [round_u8, self_id(8), (btype(1) ++ nb_id(8))*]
112            let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
113            bytes.push(r as u8);
114            bytes.extend_from_slice(&ids[i].to_le_bytes());
115            for (btype, nb_id) in &neighbor_info {
116                bytes.push(*btype);
117                bytes.extend_from_slice(&nb_id.to_le_bytes());
118            }
119
120            let new_id = fnv1a(&bytes);
121            new_ids[i] = new_id;
122            fp.set((new_id % nbits as u64) as usize);
123        }
124        core::mem::swap(&mut ids, &mut new_ids);
125    }
126
127    fp
128}
129
130/// Count-based Morgan fingerprint: returns a map of `hash → count` for all
131/// atom environments up to `radius` iterations.
132///
133/// Each (atom, iteration) pair contributes its hash to the map.  Unlike the
134/// default RDKit behavior, redundant (duplicate) environments are **not**
135/// suppressed — every atom contributes at every iteration level.
136///
137/// This corresponds to `GetMorganFingerprint(mol, radius,
138/// useFeatures=False, includeRedundantEnvironments=True)` in RDKit.
139pub fn morgan_fp_counts(mol: &Molecule, radius: u32) -> std::collections::HashMap<u64, u32> {
140    use std::collections::HashMap;
141
142    let n = mol.atom_count();
143    let mut counts: HashMap<u64, u32> = HashMap::new();
144
145    if n == 0 {
146        return counts;
147    }
148
149    let ring_set = find_sssr(mol);
150
151    // Radius-0: initial atom identifiers.
152    let mut ids: Vec<u64> = (0..n)
153        .map(|i| {
154            let idx = AtomIdx(i as u32);
155            let atom = mol.atom(idx);
156            let charge_adjusted = (atom.charge as i16 + 8) as u8;
157            fnv1a(&[
158                atom.element.atomic_number(),
159                mol.neighbors(idx).count() as u8,
160                implicit_hcount(mol, idx),
161                charge_adjusted,
162                ring_set.contains_atom(idx) as u8,
163                atom.aromatic as u8,
164            ])
165        })
166        .collect();
167
168    for &id in &ids {
169        *counts.entry(id).or_insert(0) += 1;
170    }
171
172    // Radius 1..=radius: iterative expansion (same hash scheme as ecfp).
173    let mut new_ids = vec![0u64; n];
174    for r in 1..=radius {
175        for i in 0..n {
176            let idx = AtomIdx(i as u32);
177            let mut neighbor_info: Vec<(u8, u64)> = mol
178                .neighbors(idx)
179                .map(|(nb_idx, bond_idx)| {
180                    (bond_type_int(mol.bond(bond_idx).order), ids[nb_idx.0 as usize])
181                })
182                .collect();
183            neighbor_info.sort_unstable();
184
185            let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
186            bytes.push(r as u8);
187            bytes.extend_from_slice(&ids[i].to_le_bytes());
188            for (btype, nb_id) in &neighbor_info {
189                bytes.push(*btype);
190                bytes.extend_from_slice(&nb_id.to_le_bytes());
191            }
192
193            let new_id = fnv1a(&bytes);
194            new_ids[i] = new_id;
195            *counts.entry(new_id).or_insert(0) += 1;
196        }
197        core::mem::swap(&mut ids, &mut new_ids);
198    }
199
200    counts
201}
202
203/// ECFP4 fingerprint (radius = 2, 2048 bits).
204pub fn ecfp4(mol: &Molecule) -> BitVec2048 {
205    ecfp(mol, &EcfpConfig { radius: 2, nbits: 2048 })
206}
207
208/// ECFP6 fingerprint (radius = 3, 2048 bits).
209pub fn ecfp6(mol: &Molecule) -> BitVec2048 {
210    ecfp(mol, &EcfpConfig { radius: 3, nbits: 2048 })
211}
212
213/// Tanimoto similarity between two molecules using ECFP4.
214pub fn tanimoto_ecfp4(a: &Molecule, b: &Molecule) -> f64 {
215    ecfp4(a).tanimoto(&ecfp4(b))
216}
217
218#[cfg(test)]
219mod tests {
220    use super::*;
221    use chematic_smiles::parse;
222
223    fn benzene() -> Molecule {
224        parse("c1ccccc1").unwrap()
225    }
226
227    fn ethane() -> Molecule {
228        parse("CC").unwrap()
229    }
230
231    fn toluene() -> Molecule {
232        parse("Cc1ccccc1").unwrap()
233    }
234
235    fn aspirin() -> Molecule {
236        // Acetylsalicylic acid
237        parse("CC(=O)Oc1ccccc1C(=O)O").unwrap()
238    }
239
240    fn methane() -> Molecule {
241        parse("C").unwrap()
242    }
243
244    fn water() -> Molecule {
245        parse("O").unwrap()
246    }
247
248    #[test]
249    fn benzene_ecfp4_nonzero() {
250        let fp = ecfp4(&benzene());
251        assert!(fp.popcount() > 0, "benzene ECFP4 must be non-zero");
252    }
253
254    #[test]
255    fn benzene_ecfp4_deterministic() {
256        let fp1 = ecfp4(&benzene());
257        let fp2 = ecfp4(&benzene());
258        assert_eq!(fp1, fp2, "ECFP4 must be deterministic");
259    }
260
261    #[test]
262    fn ethane_vs_benzene_tanimoto_lt1() {
263        let t = tanimoto_ecfp4(&ethane(), &benzene());
264        assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
265    }
266
267    #[test]
268    fn benzene_vs_benzene_tanimoto_eq1() {
269        let t = tanimoto_ecfp4(&benzene(), &benzene());
270        assert_eq!(t, 1.0, "identical molecules must have tanimoto == 1.0");
271    }
272
273    #[test]
274    fn benzene_vs_toluene_tanimoto_between() {
275        let t = tanimoto_ecfp4(&benzene(), &toluene());
276        assert!(t > 0.0, "benzene and toluene share bits (tanimoto={t})");
277        assert!(t < 1.0, "benzene and toluene are not identical (tanimoto={t})");
278    }
279
280    #[test]
281    fn aspirin_ecfp4_many_bits() {
282        let fp = ecfp4(&aspirin());
283        assert!(
284            fp.popcount() > 5,
285            "aspirin ECFP4 must have more than 5 bits set (got {})",
286            fp.popcount()
287        );
288    }
289
290    #[test]
291    fn ecfp6_vs_ecfp4_benzene_differ() {
292        let fp4 = ecfp4(&benzene());
293        let fp6 = ecfp6(&benzene());
294        // Larger radius explores more environment — the bit counts should differ
295        // because radius-3 adds new hash values not present at radius-2.
296        assert_ne!(
297            fp4.popcount(),
298            fp6.popcount(),
299            "ECFP6 and ECFP4 should produce different bit counts for benzene"
300        );
301    }
302
303    #[test]
304    fn methane_ecfp4_nonzero() {
305        let fp = ecfp4(&methane());
306        assert!(fp.popcount() > 0, "methane ECFP4 must be non-zero");
307    }
308
309    #[test]
310    fn water_ecfp4_nonzero() {
311        let fp = ecfp4(&water());
312        assert!(fp.popcount() > 0, "water ECFP4 must be non-zero");
313    }
314
315    #[test]
316    fn tanimoto_ecfp4_benzene_self_is_one() {
317        let t = tanimoto_ecfp4(&benzene(), &benzene());
318        assert_eq!(t, 1.0, "tanimoto_ecfp4 of identical molecules must be 1.0");
319    }
320
321    #[test]
322    fn tanimoto_ecfp4_methane_vs_benzene_lt_half() {
323        let t = tanimoto_ecfp4(&methane(), &benzene());
324        assert!(
325            t < 0.5,
326            "methane and benzene should be very dissimilar (tanimoto={t})"
327        );
328    }
329
330    // ── morgan_fp_counts ─────────────────────────────────────────────────────
331
332    #[test]
333    fn morgan_counts_radius0_atom_count() {
334        // At radius 0, one hash per atom.
335        let m = benzene();
336        let counts = morgan_fp_counts(&m, 0);
337        let total: u32 = counts.values().sum();
338        assert_eq!(total, m.atom_count() as u32, "radius-0 total count == atom_count");
339    }
340
341    #[test]
342    fn morgan_counts_radius2_total_grows() {
343        // Each additional radius adds one hash per atom → total = n * (radius+1).
344        let m = methane();
345        let n = m.atom_count() as u32;
346        let r = 2u32;
347        let counts = morgan_fp_counts(&m, r);
348        let total: u32 = counts.values().sum();
349        assert_eq!(total, n * (r + 1), "methane total = atom_count * (radius+1)");
350    }
351
352    #[test]
353    fn morgan_counts_benzene_symmetry() {
354        // All 6 benzene C atoms are equivalent → radius-0 yields 1 unique hash.
355        let m = benzene();
356        let counts = morgan_fp_counts(&m, 0);
357        assert_eq!(counts.len(), 1, "benzene has one unique radius-0 environment");
358        assert_eq!(*counts.values().next().unwrap(), 6, "that environment appears 6 times");
359    }
360
361    #[test]
362    fn morgan_counts_empty_mol_is_empty() {
363        use chematic_core::MoleculeBuilder;
364        let m = MoleculeBuilder::new().build();
365        let counts = morgan_fp_counts(&m, 2);
366        assert!(counts.is_empty(), "empty molecule yields empty count map");
367    }
368
369    #[test]
370    fn morgan_counts_deterministic() {
371        let m = aspirin();
372        let c1 = morgan_fp_counts(&m, 2);
373        let c2 = morgan_fp_counts(&m, 2);
374        assert_eq!(c1, c2, "morgan_fp_counts must be deterministic");
375    }
376
377    #[test]
378    fn morgan_counts_consistent_with_ecfp_bits() {
379        // Every hash in the count map should be reachable from the ecfp bit set
380        // (after folding to 2048 bits).  This checks the same hash scheme.
381        let m = toluene();
382        let fp = ecfp(&m, &EcfpConfig { radius: 2, nbits: 2048 });
383        let counts = morgan_fp_counts(&m, 2);
384        for &hash in counts.keys() {
385            let bit = (hash % 2048) as usize;
386            assert!(fp.get(bit), "bit {bit} from count map not set in ECFP bitvec");
387        }
388    }
389}