Skip to main content

sci_form/rings/
ecfp.rs

1//! Extended-Connectivity Fingerprints (ECFP / Morgan fingerprints).
2//!
3//! Implements Rogers & Hahn 2010 circular fingerprints:
4//! 1. Initialize atom invariants (element, degree, H-count, charge, ring)
5//! 2. Iteratively aggregate neighbor identifiers up to a given radius
6//! 3. Fold into a fixed-length bit vector
7
8use crate::graph::{BondOrder, Molecule};
9use petgraph::graph::NodeIndex;
10use petgraph::visit::EdgeRef;
11use serde::{Deserialize, Serialize};
12use std::collections::BTreeSet;
13
14/// An ECFP fingerprint represented as a set of on-bits in a folded bit vector.
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct ECFPFingerprint {
17    /// Number of bits in the fingerprint.
18    pub n_bits: usize,
19    /// Set of on-bit positions.
20    pub on_bits: BTreeSet<usize>,
21    /// Radius used for generation.
22    pub radius: usize,
23    /// Raw feature identifiers (before folding).
24    pub raw_features: Vec<u64>,
25}
26
27impl ECFPFingerprint {
28    /// Get the density (fraction of on-bits).
29    pub fn density(&self) -> f64 {
30        self.on_bits.len() as f64 / self.n_bits as f64
31    }
32
33    /// Convert to a dense bit vector.
34    pub fn to_bitvec(&self) -> Vec<bool> {
35        let mut bv = vec![false; self.n_bits];
36        for &bit in &self.on_bits {
37            bv[bit] = true;
38        }
39        bv
40    }
41}
42
43/// FNV-1a hash for deterministic fingerprint generation.
44fn fnv1a_hash(data: &[u64]) -> u64 {
45    let mut hash: u64 = 0xcbf29ce484222325; // FNV offset basis
46    for &val in data {
47        let bytes = val.to_le_bytes();
48        for &byte in &bytes {
49            hash ^= byte as u64;
50            hash = hash.wrapping_mul(0x100000001b3); // FNV prime
51        }
52    }
53    hash
54}
55
56/// Compute initial atom invariant for ECFP generation.
57///
58/// Encodes: atomic number, degree, H-count, formal charge, ring membership.
59fn atom_invariant(mol: &Molecule, idx: NodeIndex) -> u64 {
60    let atom = &mol.graph[idx];
61    let element = atom.element as u64;
62    let degree = mol.graph.edges(idx).count() as u64;
63    let h_count = mol
64        .graph
65        .neighbors(idx)
66        .filter(|n| mol.graph[*n].element == 1)
67        .count() as u64;
68    let formal_charge = (atom.formal_charge as i64 + 10) as u64; // offset to avoid negative
69    let is_aromatic = mol
70        .graph
71        .edges(idx)
72        .any(|e| matches!(e.weight().order, BondOrder::Aromatic));
73    let ring_flag = if crate::graph::atom_in_ring(mol, idx) {
74        1u64
75    } else {
76        0u64
77    };
78    let aromatic_flag = if is_aromatic { 1u64 } else { 0u64 };
79
80    // Combine into a single hash
81    fnv1a_hash(&[
82        element,
83        degree,
84        h_count,
85        formal_charge,
86        ring_flag,
87        aromatic_flag,
88    ])
89}
90
91/// Compute ECFP fingerprint for a molecule.
92///
93/// # Arguments
94/// - `mol`: parsed molecular graph
95/// - `radius`: maximum radius (ECFP4 = radius 2, ECFP6 = radius 3)
96/// - `n_bits`: bit vector length (typically 1024 or 2048)
97///
98/// # Returns
99/// `ECFPFingerprint` with on-bits and raw features.
100pub fn compute_ecfp(mol: &Molecule, radius: usize, n_bits: usize) -> ECFPFingerprint {
101    let n = mol.graph.node_count();
102    let n_bits = n_bits.max(64);
103
104    // 1. Initialize invariants
105    let mut current_ids: Vec<u64> = (0..n)
106        .map(|i| atom_invariant(mol, NodeIndex::new(i)))
107        .collect();
108
109    let mut all_features: Vec<u64> = current_ids.clone();
110
111    // 2. Iterative neighborhood aggregation
112    for _r in 0..radius {
113        let mut next_ids = Vec::with_capacity(n);
114
115        for i in 0..n {
116            let node = NodeIndex::new(i);
117
118            // Collect neighbor identifiers with bond info
119            let mut neighbor_data: Vec<u64> = Vec::new();
120            for edge in mol.graph.edges(node) {
121                let neighbor = if edge.source() == node {
122                    edge.target()
123                } else {
124                    edge.source()
125                };
126                let bond_type: u64 = match edge.weight().order {
127                    BondOrder::Single => 1,
128                    BondOrder::Double => 2,
129                    BondOrder::Triple => 3,
130                    BondOrder::Aromatic => 4,
131                    BondOrder::Unknown => 0,
132                };
133                // Hash bond_type with neighbor's current ID
134                neighbor_data.push(fnv1a_hash(&[bond_type, current_ids[neighbor.index()]]));
135            }
136
137            // Sort for order independence
138            neighbor_data.sort();
139
140            // Hash: [current_id, sorted_neighbor_hashes...]
141            let mut hash_input = vec![current_ids[i]];
142            hash_input.extend_from_slice(&neighbor_data);
143            let new_id = fnv1a_hash(&hash_input);
144
145            next_ids.push(new_id);
146            all_features.push(new_id);
147        }
148
149        current_ids = next_ids;
150    }
151
152    // 3. Fold into bit vector
153    let mut on_bits = BTreeSet::new();
154    for &feature in &all_features {
155        let bit = (feature % n_bits as u64) as usize;
156        on_bits.insert(bit);
157    }
158
159    ECFPFingerprint {
160        n_bits,
161        on_bits,
162        radius,
163        raw_features: all_features,
164    }
165}
166
167/// Compute ECFP fingerprints for a batch of molecules in parallel.
168///
169/// Uses rayon when the `parallel` feature is enabled.
170pub fn compute_ecfp_batch(mols: &[Molecule], radius: usize, n_bits: usize) -> Vec<ECFPFingerprint> {
171    #[cfg(feature = "parallel")]
172    {
173        use rayon::prelude::*;
174        mols.par_iter()
175            .map(|mol| compute_ecfp(mol, radius, n_bits))
176            .collect()
177    }
178    #[cfg(not(feature = "parallel"))]
179    {
180        mols.iter()
181            .map(|mol| compute_ecfp(mol, radius, n_bits))
182            .collect()
183    }
184}
185
186/// Compute the Tanimoto similarity between two ECFP fingerprints.
187///
188/// $$T(A,B) = \frac{|A \cap B|}{|A \cup B|}$$
189///
190/// Returns a value in [0.0, 1.0] where 1.0 = identical fingerprints.
191pub fn compute_tanimoto(fp1: &ECFPFingerprint, fp2: &ECFPFingerprint) -> f64 {
192    let intersection = fp1.on_bits.intersection(&fp2.on_bits).count();
193    let union = fp1.on_bits.union(&fp2.on_bits).count();
194
195    if union == 0 {
196        return 1.0; // both empty
197    }
198
199    intersection as f64 / union as f64
200}
201
202#[cfg(test)]
203mod tests {
204    use super::*;
205
206    #[test]
207    fn test_ecfp_benzene() {
208        let mol = Molecule::from_smiles("c1ccccc1").unwrap();
209        let fp = compute_ecfp(&mol, 2, 1024);
210
211        assert_eq!(fp.n_bits, 1024);
212        assert_eq!(fp.radius, 2);
213        assert!(!fp.on_bits.is_empty(), "Benzene should have on-bits");
214        assert!(fp.density() > 0.0 && fp.density() < 1.0);
215    }
216
217    #[test]
218    fn test_self_similarity() {
219        let mol = Molecule::from_smiles("CCO").unwrap();
220        let fp = compute_ecfp(&mol, 2, 1024);
221        let tanimoto = compute_tanimoto(&fp, &fp);
222        assert!(
223            (tanimoto - 1.0).abs() < 1e-10,
224            "Self-similarity should be 1.0, got {}",
225            tanimoto
226        );
227    }
228
229    #[test]
230    fn test_tanimoto_similar_molecules() {
231        let benzene = Molecule::from_smiles("c1ccccc1").unwrap();
232        let toluene = Molecule::from_smiles("Cc1ccccc1").unwrap();
233        let fp1 = compute_ecfp(&benzene, 2, 2048);
234        let fp2 = compute_ecfp(&toluene, 2, 2048);
235        let tanimoto = compute_tanimoto(&fp1, &fp2);
236
237        assert!(
238            tanimoto > 0.3,
239            "Benzene-toluene similarity should be >0.3, got {}",
240            tanimoto
241        );
242    }
243
244    #[test]
245    fn test_tanimoto_dissimilar_molecules() {
246        let benzene = Molecule::from_smiles("c1ccccc1").unwrap();
247        let hexane = Molecule::from_smiles("CCCCCC").unwrap();
248        let fp1 = compute_ecfp(&benzene, 2, 2048);
249        let fp2 = compute_ecfp(&hexane, 2, 2048);
250        let tanimoto = compute_tanimoto(&fp1, &fp2);
251
252        // Dissimilar molecules should have lower Tanimoto
253        assert!(
254            tanimoto < 0.5,
255            "Benzene-hexane similarity should be <0.5, got {}",
256            tanimoto
257        );
258    }
259
260    #[test]
261    fn test_deterministic_fingerprint() {
262        let mol = Molecule::from_smiles("CC(=O)O").unwrap();
263        let fp1 = compute_ecfp(&mol, 2, 1024);
264        let fp2 = compute_ecfp(&mol, 2, 1024);
265        assert_eq!(fp1.on_bits, fp2.on_bits, "ECFP should be deterministic");
266        assert_eq!(fp1.raw_features, fp2.raw_features);
267    }
268
269    #[test]
270    fn test_different_radii() {
271        let mol = Molecule::from_smiles("c1ccc(O)cc1").unwrap();
272        let fp2 = compute_ecfp(&mol, 1, 1024);
273        let fp4 = compute_ecfp(&mol, 2, 1024);
274        let fp6 = compute_ecfp(&mol, 3, 1024);
275
276        // Higher radius should produce more features
277        assert!(
278            fp4.raw_features.len() >= fp2.raw_features.len(),
279            "ECFP4 should have >= features than ECFP2"
280        );
281        assert!(
282            fp6.raw_features.len() >= fp4.raw_features.len(),
283            "ECFP6 should have >= features than ECFP4"
284        );
285    }
286
287    #[test]
288    fn test_fnv1a_deterministic() {
289        let a = fnv1a_hash(&[1, 2, 3]);
290        let b = fnv1a_hash(&[1, 2, 3]);
291        assert_eq!(a, b);
292
293        let c = fnv1a_hash(&[3, 2, 1]);
294        assert_ne!(a, c, "Different inputs should produce different hashes");
295    }
296}