1use crate::graph::{BondOrder, Molecule};
9use petgraph::graph::NodeIndex;
10use petgraph::visit::EdgeRef;
11use serde::{Deserialize, Serialize};
12use std::collections::BTreeSet;
13
14#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct ECFPFingerprint {
17 pub n_bits: usize,
19 pub on_bits: BTreeSet<usize>,
21 pub radius: usize,
23 pub raw_features: Vec<u64>,
25}
26
27impl ECFPFingerprint {
28 pub fn density(&self) -> f64 {
30 self.on_bits.len() as f64 / self.n_bits as f64
31 }
32
33 pub fn to_bitvec(&self) -> Vec<bool> {
35 let mut bv = vec![false; self.n_bits];
36 for &bit in &self.on_bits {
37 bv[bit] = true;
38 }
39 bv
40 }
41}
42
43fn fnv1a_hash(data: &[u64]) -> u64 {
45 let mut hash: u64 = 0xcbf29ce484222325; for &val in data {
47 let bytes = val.to_le_bytes();
48 for &byte in &bytes {
49 hash ^= byte as u64;
50 hash = hash.wrapping_mul(0x100000001b3); }
52 }
53 hash
54}
55
56fn atom_invariant(mol: &Molecule, idx: NodeIndex) -> u64 {
60 let atom = &mol.graph[idx];
61 let element = atom.element as u64;
62 let degree = mol.graph.edges(idx).count() as u64;
63 let h_count = mol
64 .graph
65 .neighbors(idx)
66 .filter(|n| mol.graph[*n].element == 1)
67 .count() as u64;
68 let formal_charge = (atom.formal_charge as i64 + 10) as u64; let is_aromatic = mol
70 .graph
71 .edges(idx)
72 .any(|e| matches!(e.weight().order, BondOrder::Aromatic));
73 let ring_flag = if crate::graph::atom_in_ring(mol, idx) {
74 1u64
75 } else {
76 0u64
77 };
78 let aromatic_flag = if is_aromatic { 1u64 } else { 0u64 };
79
80 fnv1a_hash(&[
82 element,
83 degree,
84 h_count,
85 formal_charge,
86 ring_flag,
87 aromatic_flag,
88 ])
89}
90
91pub fn compute_ecfp(mol: &Molecule, radius: usize, n_bits: usize) -> ECFPFingerprint {
101 let n = mol.graph.node_count();
102 let n_bits = n_bits.max(64);
103
104 let mut current_ids: Vec<u64> = (0..n)
106 .map(|i| atom_invariant(mol, NodeIndex::new(i)))
107 .collect();
108
109 let mut all_features: Vec<u64> = current_ids.clone();
110
111 for _r in 0..radius {
113 let mut next_ids = Vec::with_capacity(n);
114
115 for i in 0..n {
116 let node = NodeIndex::new(i);
117
118 let mut neighbor_data: Vec<u64> = Vec::new();
120 for edge in mol.graph.edges(node) {
121 let neighbor = if edge.source() == node {
122 edge.target()
123 } else {
124 edge.source()
125 };
126 let bond_type: u64 = match edge.weight().order {
127 BondOrder::Single => 1,
128 BondOrder::Double => 2,
129 BondOrder::Triple => 3,
130 BondOrder::Aromatic => 4,
131 BondOrder::Unknown => 0,
132 };
133 neighbor_data.push(fnv1a_hash(&[bond_type, current_ids[neighbor.index()]]));
135 }
136
137 neighbor_data.sort();
139
140 let mut hash_input = vec![current_ids[i]];
142 hash_input.extend_from_slice(&neighbor_data);
143 let new_id = fnv1a_hash(&hash_input);
144
145 next_ids.push(new_id);
146 all_features.push(new_id);
147 }
148
149 current_ids = next_ids;
150 }
151
152 let mut on_bits = BTreeSet::new();
154 for &feature in &all_features {
155 let bit = (feature % n_bits as u64) as usize;
156 on_bits.insert(bit);
157 }
158
159 ECFPFingerprint {
160 n_bits,
161 on_bits,
162 radius,
163 raw_features: all_features,
164 }
165}
166
167pub fn compute_ecfp_batch(mols: &[Molecule], radius: usize, n_bits: usize) -> Vec<ECFPFingerprint> {
171 #[cfg(feature = "parallel")]
172 {
173 use rayon::prelude::*;
174 mols.par_iter()
175 .map(|mol| compute_ecfp(mol, radius, n_bits))
176 .collect()
177 }
178 #[cfg(not(feature = "parallel"))]
179 {
180 mols.iter()
181 .map(|mol| compute_ecfp(mol, radius, n_bits))
182 .collect()
183 }
184}
185
186pub fn compute_tanimoto(fp1: &ECFPFingerprint, fp2: &ECFPFingerprint) -> f64 {
192 let intersection = fp1.on_bits.intersection(&fp2.on_bits).count();
193 let union = fp1.on_bits.union(&fp2.on_bits).count();
194
195 if union == 0 {
196 return 1.0; }
198
199 intersection as f64 / union as f64
200}
201
202#[cfg(test)]
203mod tests {
204 use super::*;
205
206 #[test]
207 fn test_ecfp_benzene() {
208 let mol = Molecule::from_smiles("c1ccccc1").unwrap();
209 let fp = compute_ecfp(&mol, 2, 1024);
210
211 assert_eq!(fp.n_bits, 1024);
212 assert_eq!(fp.radius, 2);
213 assert!(!fp.on_bits.is_empty(), "Benzene should have on-bits");
214 assert!(fp.density() > 0.0 && fp.density() < 1.0);
215 }
216
217 #[test]
218 fn test_self_similarity() {
219 let mol = Molecule::from_smiles("CCO").unwrap();
220 let fp = compute_ecfp(&mol, 2, 1024);
221 let tanimoto = compute_tanimoto(&fp, &fp);
222 assert!(
223 (tanimoto - 1.0).abs() < 1e-10,
224 "Self-similarity should be 1.0, got {}",
225 tanimoto
226 );
227 }
228
229 #[test]
230 fn test_tanimoto_similar_molecules() {
231 let benzene = Molecule::from_smiles("c1ccccc1").unwrap();
232 let toluene = Molecule::from_smiles("Cc1ccccc1").unwrap();
233 let fp1 = compute_ecfp(&benzene, 2, 2048);
234 let fp2 = compute_ecfp(&toluene, 2, 2048);
235 let tanimoto = compute_tanimoto(&fp1, &fp2);
236
237 assert!(
238 tanimoto > 0.3,
239 "Benzene-toluene similarity should be >0.3, got {}",
240 tanimoto
241 );
242 }
243
244 #[test]
245 fn test_tanimoto_dissimilar_molecules() {
246 let benzene = Molecule::from_smiles("c1ccccc1").unwrap();
247 let hexane = Molecule::from_smiles("CCCCCC").unwrap();
248 let fp1 = compute_ecfp(&benzene, 2, 2048);
249 let fp2 = compute_ecfp(&hexane, 2, 2048);
250 let tanimoto = compute_tanimoto(&fp1, &fp2);
251
252 assert!(
254 tanimoto < 0.5,
255 "Benzene-hexane similarity should be <0.5, got {}",
256 tanimoto
257 );
258 }
259
260 #[test]
261 fn test_deterministic_fingerprint() {
262 let mol = Molecule::from_smiles("CC(=O)O").unwrap();
263 let fp1 = compute_ecfp(&mol, 2, 1024);
264 let fp2 = compute_ecfp(&mol, 2, 1024);
265 assert_eq!(fp1.on_bits, fp2.on_bits, "ECFP should be deterministic");
266 assert_eq!(fp1.raw_features, fp2.raw_features);
267 }
268
269 #[test]
270 fn test_different_radii() {
271 let mol = Molecule::from_smiles("c1ccc(O)cc1").unwrap();
272 let fp2 = compute_ecfp(&mol, 1, 1024);
273 let fp4 = compute_ecfp(&mol, 2, 1024);
274 let fp6 = compute_ecfp(&mol, 3, 1024);
275
276 assert!(
278 fp4.raw_features.len() >= fp2.raw_features.len(),
279 "ECFP4 should have >= features than ECFP2"
280 );
281 assert!(
282 fp6.raw_features.len() >= fp4.raw_features.len(),
283 "ECFP6 should have >= features than ECFP4"
284 );
285 }
286
287 #[test]
288 fn test_fnv1a_deterministic() {
289 let a = fnv1a_hash(&[1, 2, 3]);
290 let b = fnv1a_hash(&[1, 2, 3]);
291 assert_eq!(a, b);
292
293 let c = fnv1a_hash(&[3, 2, 1]);
294 assert_ne!(a, c, "Different inputs should produce different hashes");
295 }
296}