Skip to main content

chematic_fp/
ecfp.rs

1//! ECFP (Extended Connectivity Fingerprints) based on the Morgan algorithm.
2//!
3//! Uses FNV-1a 64-bit hashing for reproducibility and WASM-compatibility.
4
5use chematic_core::{AtomIdx, BondOrder, Molecule, implicit_hcount};
6use chematic_perception::find_sssr;
7
8use crate::bitvec::BitVec2048;
9
10// ---------------------------------------------------------------------------
11// FNV-1a constants
12// ---------------------------------------------------------------------------
13
14const FNV_OFFSET: u64 = 14695981039346656037;
15const FNV_PRIME: u64 = 1099511628211;
16
17/// Compute the FNV-1a 64-bit hash of `bytes`.
18fn fnv1a(bytes: &[u8]) -> u64 {
19    let mut h = FNV_OFFSET;
20    for &b in bytes {
21        h ^= b as u64;
22        h = h.wrapping_mul(FNV_PRIME);
23    }
24    h
25}
26
27// ---------------------------------------------------------------------------
28// Configuration
29// ---------------------------------------------------------------------------
30
31/// Configuration for ECFP computation.
32#[derive(Debug, Clone)]
33pub struct EcfpConfig {
34    /// Number of iterations (radius). ECFP4 = 2, ECFP6 = 3.
35    pub radius: u32,
36    /// Output bitvector size (default 2048).
37    pub nbits: usize,
38}
39
40impl Default for EcfpConfig {
41    fn default() -> Self {
42        Self { radius: 2, nbits: 2048 }
43    }
44}
45
46// ---------------------------------------------------------------------------
47// Bond type encoding
48// ---------------------------------------------------------------------------
49
50/// Map a `BondOrder` to the integer code used in the ECFP hash.
51///
52/// - Single / Up / Down → 1
53/// - Double            → 2
54/// - Triple            → 3
55/// - Aromatic          → 4
56/// - Quadruple         → 5  (not in standard ECFP; assigned a distinct value)
57#[inline]
58fn bond_type_int(order: BondOrder) -> u8 {
59    match order {
60        BondOrder::Single | BondOrder::Up | BondOrder::Down => 1,
61        BondOrder::Double => 2,
62        BondOrder::Triple => 3,
63        BondOrder::Aromatic => 4,
64        BondOrder::Quadruple => 5,
65    }
66}
67
68// ---------------------------------------------------------------------------
69// Main entry point
70// ---------------------------------------------------------------------------
71
72/// Compute an ECFP fingerprint for `mol` using the given configuration.
73///
74/// # Algorithm overview
75/// 1. Compute initial atom identifiers from atomic properties.
76/// 2. Iteratively expand each identifier by incorporating neighbour identifiers
77///    (with their bond types) for `config.radius` rounds.
78/// 3. After each iteration (including iteration 0), map every identifier to a
79///    bit in the output bitvector.
80pub fn ecfp(mol: &Molecule, config: &EcfpConfig) -> BitVec2048 {
81    let n = mol.atom_count();
82    let nbits = config.nbits;
83    let mut fp = BitVec2048::new();
84
85    if n == 0 {
86        return fp;
87    }
88
89    // Pre-compute ring membership for all atoms once.
90    let ring_set = find_sssr(mol);
91
92    // --- Step 1: initial atom identifiers (iteration 0) ---
93    let mut ids: Vec<u64> = Vec::with_capacity(n);
94
95    for i in 0..n {
96        let idx = AtomIdx(i as u32);
97        let atom = mol.atom(idx);
98
99        let atomic_number = atom.element.atomic_number();
100        let degree = mol.neighbors(idx).count() as u8;
101        let h_count = implicit_hcount(mol, idx);
102        // Shift formal charge by 8 so that charges in [-8, +7] map to bytes [0, 15].
103        // Using i16 arithmetic avoids any overflow on extreme charges.
104        let charge_adjusted = (atom.charge as i16 + 8) as u8;
105        let is_in_ring: u8 = if ring_set.contains_atom(idx) { 1 } else { 0 };
106        let is_aromatic: u8 = if atom.aromatic { 1 } else { 0 };
107
108        let id = fnv1a(&[
109            atomic_number,
110            degree,
111            h_count,
112            charge_adjusted,
113            is_in_ring,
114            is_aromatic,
115        ]);
116
117        // Set bit for this atom's identifier.
118        fp.set((id % nbits as u64) as usize);
119        ids.push(id);
120    }
121
122    // --- Step 2: iterative expansion ---
123    let mut new_ids: Vec<u64> = vec![0u64; n];
124
125    for r in 1..=config.radius {
126        for i in 0..n {
127            let idx = AtomIdx(i as u32);
128
129            // Collect (bond_type_int, neighbor_id) pairs.
130            let mut neighbor_info: Vec<(u8, u64)> = mol
131                .neighbors(idx)
132                .map(|(nb_idx, bond_idx)| {
133                    let bond = mol.bond(bond_idx);
134                    let btype = bond_type_int(bond.order);
135                    let nb_id = ids[nb_idx.0 as usize];
136                    (btype, nb_id)
137                })
138                .collect();
139
140            // Sort for canonical ordering.
141            neighbor_info.sort_unstable();
142
143            // Build the byte array:
144            //   [r as u8, id_bytes(8), (btype(1) ++ nbr_id_bytes(8)) * neighbors]
145            let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
146            bytes.push(r as u8);
147            bytes.extend_from_slice(&ids[i].to_le_bytes());
148            for (btype, nb_id) in &neighbor_info {
149                bytes.push(*btype);
150                bytes.extend_from_slice(&nb_id.to_le_bytes());
151            }
152
153            let new_id = fnv1a(&bytes);
154            new_ids[i] = new_id;
155
156            // Set bit for the new identifier.
157            fp.set((new_id % nbits as u64) as usize);
158        }
159
160        // Swap buffers; new_ids becomes ids for the next iteration.
161        core::mem::swap(&mut ids, &mut new_ids);
162    }
163
164    fp
165}
166
167// ---------------------------------------------------------------------------
168// Convenience wrappers
169// ---------------------------------------------------------------------------
170
171/// ECFP4 fingerprint (radius = 2, 2048 bits).
172pub fn ecfp4(mol: &Molecule) -> BitVec2048 {
173    ecfp(mol, &EcfpConfig { radius: 2, nbits: 2048 })
174}
175
176/// ECFP6 fingerprint (radius = 3, 2048 bits).
177pub fn ecfp6(mol: &Molecule) -> BitVec2048 {
178    ecfp(mol, &EcfpConfig { radius: 3, nbits: 2048 })
179}
180
181/// Tanimoto similarity between two molecules using ECFP4.
182pub fn tanimoto_ecfp4(a: &Molecule, b: &Molecule) -> f64 {
183    ecfp4(a).tanimoto(&ecfp4(b))
184}
185
186// ---------------------------------------------------------------------------
187// Tests
188// ---------------------------------------------------------------------------
189
190#[cfg(test)]
191mod tests {
192    use super::*;
193    use chematic_smiles::parse;
194
195    fn benzene() -> Molecule {
196        parse("c1ccccc1").unwrap()
197    }
198
199    fn ethane() -> Molecule {
200        parse("CC").unwrap()
201    }
202
203    fn toluene() -> Molecule {
204        parse("Cc1ccccc1").unwrap()
205    }
206
207    fn aspirin() -> Molecule {
208        // Acetylsalicylic acid
209        parse("CC(=O)Oc1ccccc1C(=O)O").unwrap()
210    }
211
212    fn methane() -> Molecule {
213        parse("C").unwrap()
214    }
215
216    fn water() -> Molecule {
217        parse("O").unwrap()
218    }
219
220    #[test]
221    fn benzene_ecfp4_nonzero() {
222        let fp = ecfp4(&benzene());
223        assert!(fp.popcount() > 0, "benzene ECFP4 must be non-zero");
224    }
225
226    #[test]
227    fn benzene_ecfp4_deterministic() {
228        let fp1 = ecfp4(&benzene());
229        let fp2 = ecfp4(&benzene());
230        assert_eq!(fp1, fp2, "ECFP4 must be deterministic");
231    }
232
233    #[test]
234    fn ethane_vs_benzene_tanimoto_lt1() {
235        let t = tanimoto_ecfp4(&ethane(), &benzene());
236        assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
237    }
238
239    #[test]
240    fn benzene_vs_benzene_tanimoto_eq1() {
241        let t = tanimoto_ecfp4(&benzene(), &benzene());
242        assert_eq!(t, 1.0, "identical molecules must have tanimoto == 1.0");
243    }
244
245    #[test]
246    fn benzene_vs_toluene_tanimoto_between() {
247        let t = tanimoto_ecfp4(&benzene(), &toluene());
248        assert!(t > 0.0, "benzene and toluene share bits (tanimoto={t})");
249        assert!(t < 1.0, "benzene and toluene are not identical (tanimoto={t})");
250    }
251
252    #[test]
253    fn aspirin_ecfp4_many_bits() {
254        let fp = ecfp4(&aspirin());
255        assert!(
256            fp.popcount() > 5,
257            "aspirin ECFP4 must have more than 5 bits set (got {})",
258            fp.popcount()
259        );
260    }
261
262    #[test]
263    fn ecfp6_vs_ecfp4_benzene_differ() {
264        let fp4 = ecfp4(&benzene());
265        let fp6 = ecfp6(&benzene());
266        // Larger radius explores more environment — the bit counts should differ
267        // because radius-3 adds new hash values not present at radius-2.
268        assert_ne!(
269            fp4.popcount(),
270            fp6.popcount(),
271            "ECFP6 and ECFP4 should produce different bit counts for benzene"
272        );
273    }
274
275    #[test]
276    fn methane_ecfp4_nonzero() {
277        let fp = ecfp4(&methane());
278        assert!(fp.popcount() > 0, "methane ECFP4 must be non-zero");
279    }
280
281    #[test]
282    fn water_ecfp4_nonzero() {
283        let fp = ecfp4(&water());
284        assert!(fp.popcount() > 0, "water ECFP4 must be non-zero");
285    }
286
287    #[test]
288    fn tanimoto_ecfp4_benzene_self_is_one() {
289        let t = tanimoto_ecfp4(&benzene(), &benzene());
290        assert_eq!(t, 1.0, "tanimoto_ecfp4 of identical molecules must be 1.0");
291    }
292
293    #[test]
294    fn tanimoto_ecfp4_methane_vs_benzene_lt_half() {
295        let t = tanimoto_ecfp4(&methane(), &benzene());
296        assert!(
297            t < 0.5,
298            "methane and benzene should be very dissimilar (tanimoto={t})"
299        );
300    }
301}