1use chematic_core::{AtomIdx, BondOrder, Molecule, implicit_hcount};
6use chematic_perception::find_sssr;
7
8use crate::bitvec::BitVec2048;
9
10const FNV_OFFSET: u64 = 14695981039346656037;
11const FNV_PRIME: u64 = 1099511628211;
12
13pub(crate) fn fnv1a(bytes: &[u8]) -> u64 {
15 let mut h = FNV_OFFSET;
16 for &b in bytes {
17 h ^= b as u64;
18 h = h.wrapping_mul(FNV_PRIME);
19 }
20 h
21}
22
23#[derive(Debug, Clone)]
25pub struct EcfpConfig {
26 pub radius: u32,
28 pub nbits: usize,
30}
31
32impl Default for EcfpConfig {
33 fn default() -> Self {
34 Self { radius: 2, nbits: 2048 }
35 }
36}
37
38#[inline]
46pub(crate) fn bond_type_int(order: BondOrder) -> u8 {
47 match order {
48 BondOrder::Single | BondOrder::Up | BondOrder::Down => 1,
49 BondOrder::Double => 2,
50 BondOrder::Triple => 3,
51 BondOrder::Aromatic => 4,
52 BondOrder::Quadruple => 5,
53 }
54}
55
56pub fn ecfp(mol: &Molecule, config: &EcfpConfig) -> BitVec2048 {
65 let n = mol.atom_count();
66 let nbits = config.nbits;
67 let mut fp = BitVec2048::new();
68
69 if n == 0 {
70 return fp;
71 }
72
73 let ring_set = find_sssr(mol);
74
75 let mut ids: Vec<u64> = Vec::with_capacity(n);
77 for i in 0..n {
78 let idx = AtomIdx(i as u32);
79 let atom = mol.atom(idx);
80
81 let charge_adjusted = (atom.charge as i16 + 8) as u8;
84 let id = fnv1a(&[
85 atom.element.atomic_number(),
86 mol.neighbors(idx).count() as u8,
87 implicit_hcount(mol, idx),
88 charge_adjusted,
89 ring_set.contains_atom(idx) as u8,
90 atom.aromatic as u8,
91 ]);
92
93 fp.set((id % nbits as u64) as usize);
94 ids.push(id);
95 }
96
97 let mut new_ids: Vec<u64> = vec![0u64; n];
99 for r in 1..=config.radius {
100 for i in 0..n {
101 let idx = AtomIdx(i as u32);
102 let mut neighbor_info: Vec<(u8, u64)> = mol
103 .neighbors(idx)
104 .map(|(nb_idx, bond_idx)| {
105 let btype = bond_type_int(mol.bond(bond_idx).order);
106 (btype, ids[nb_idx.0 as usize])
107 })
108 .collect();
109 neighbor_info.sort_unstable();
110
111 let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
113 bytes.push(r as u8);
114 bytes.extend_from_slice(&ids[i].to_le_bytes());
115 for (btype, nb_id) in &neighbor_info {
116 bytes.push(*btype);
117 bytes.extend_from_slice(&nb_id.to_le_bytes());
118 }
119
120 let new_id = fnv1a(&bytes);
121 new_ids[i] = new_id;
122 fp.set((new_id % nbits as u64) as usize);
123 }
124 core::mem::swap(&mut ids, &mut new_ids);
125 }
126
127 fp
128}
129
130pub fn morgan_fp_counts(mol: &Molecule, radius: u32) -> std::collections::HashMap<u64, u32> {
140 use std::collections::HashMap;
141
142 let n = mol.atom_count();
143 let mut counts: HashMap<u64, u32> = HashMap::new();
144
145 if n == 0 {
146 return counts;
147 }
148
149 let ring_set = find_sssr(mol);
150
151 let mut ids: Vec<u64> = (0..n)
153 .map(|i| {
154 let idx = AtomIdx(i as u32);
155 let atom = mol.atom(idx);
156 let charge_adjusted = (atom.charge as i16 + 8) as u8;
157 fnv1a(&[
158 atom.element.atomic_number(),
159 mol.neighbors(idx).count() as u8,
160 implicit_hcount(mol, idx),
161 charge_adjusted,
162 ring_set.contains_atom(idx) as u8,
163 atom.aromatic as u8,
164 ])
165 })
166 .collect();
167
168 for &id in &ids {
169 *counts.entry(id).or_insert(0) += 1;
170 }
171
172 let mut new_ids = vec![0u64; n];
174 for r in 1..=radius {
175 for i in 0..n {
176 let idx = AtomIdx(i as u32);
177 let mut neighbor_info: Vec<(u8, u64)> = mol
178 .neighbors(idx)
179 .map(|(nb_idx, bond_idx)| {
180 (bond_type_int(mol.bond(bond_idx).order), ids[nb_idx.0 as usize])
181 })
182 .collect();
183 neighbor_info.sort_unstable();
184
185 let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
186 bytes.push(r as u8);
187 bytes.extend_from_slice(&ids[i].to_le_bytes());
188 for (btype, nb_id) in &neighbor_info {
189 bytes.push(*btype);
190 bytes.extend_from_slice(&nb_id.to_le_bytes());
191 }
192
193 let new_id = fnv1a(&bytes);
194 new_ids[i] = new_id;
195 *counts.entry(new_id).or_insert(0) += 1;
196 }
197 core::mem::swap(&mut ids, &mut new_ids);
198 }
199
200 counts
201}
202
203pub fn ecfp4(mol: &Molecule) -> BitVec2048 {
205 ecfp(mol, &EcfpConfig { radius: 2, nbits: 2048 })
206}
207
208pub fn ecfp6(mol: &Molecule) -> BitVec2048 {
210 ecfp(mol, &EcfpConfig { radius: 3, nbits: 2048 })
211}
212
213pub fn tanimoto_ecfp4(a: &Molecule, b: &Molecule) -> f64 {
215 ecfp4(a).tanimoto(&ecfp4(b))
216}
217
218#[cfg(test)]
219mod tests {
220 use super::*;
221 use chematic_smiles::parse;
222
223 fn benzene() -> Molecule {
224 parse("c1ccccc1").unwrap()
225 }
226
227 fn ethane() -> Molecule {
228 parse("CC").unwrap()
229 }
230
231 fn toluene() -> Molecule {
232 parse("Cc1ccccc1").unwrap()
233 }
234
235 fn aspirin() -> Molecule {
236 parse("CC(=O)Oc1ccccc1C(=O)O").unwrap()
238 }
239
240 fn methane() -> Molecule {
241 parse("C").unwrap()
242 }
243
244 fn water() -> Molecule {
245 parse("O").unwrap()
246 }
247
248 #[test]
249 fn benzene_ecfp4_nonzero() {
250 let fp = ecfp4(&benzene());
251 assert!(fp.popcount() > 0, "benzene ECFP4 must be non-zero");
252 }
253
254 #[test]
255 fn benzene_ecfp4_deterministic() {
256 let fp1 = ecfp4(&benzene());
257 let fp2 = ecfp4(&benzene());
258 assert_eq!(fp1, fp2, "ECFP4 must be deterministic");
259 }
260
261 #[test]
262 fn ethane_vs_benzene_tanimoto_lt1() {
263 let t = tanimoto_ecfp4(ðane(), &benzene());
264 assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
265 }
266
267 #[test]
268 fn benzene_vs_benzene_tanimoto_eq1() {
269 let t = tanimoto_ecfp4(&benzene(), &benzene());
270 assert_eq!(t, 1.0, "identical molecules must have tanimoto == 1.0");
271 }
272
273 #[test]
274 fn benzene_vs_toluene_tanimoto_between() {
275 let t = tanimoto_ecfp4(&benzene(), &toluene());
276 assert!(t > 0.0, "benzene and toluene share bits (tanimoto={t})");
277 assert!(t < 1.0, "benzene and toluene are not identical (tanimoto={t})");
278 }
279
280 #[test]
281 fn aspirin_ecfp4_many_bits() {
282 let fp = ecfp4(&aspirin());
283 assert!(
284 fp.popcount() > 5,
285 "aspirin ECFP4 must have more than 5 bits set (got {})",
286 fp.popcount()
287 );
288 }
289
290 #[test]
291 fn ecfp6_vs_ecfp4_benzene_differ() {
292 let fp4 = ecfp4(&benzene());
293 let fp6 = ecfp6(&benzene());
294 assert_ne!(
297 fp4.popcount(),
298 fp6.popcount(),
299 "ECFP6 and ECFP4 should produce different bit counts for benzene"
300 );
301 }
302
303 #[test]
304 fn methane_ecfp4_nonzero() {
305 let fp = ecfp4(&methane());
306 assert!(fp.popcount() > 0, "methane ECFP4 must be non-zero");
307 }
308
309 #[test]
310 fn water_ecfp4_nonzero() {
311 let fp = ecfp4(&water());
312 assert!(fp.popcount() > 0, "water ECFP4 must be non-zero");
313 }
314
315 #[test]
316 fn tanimoto_ecfp4_benzene_self_is_one() {
317 let t = tanimoto_ecfp4(&benzene(), &benzene());
318 assert_eq!(t, 1.0, "tanimoto_ecfp4 of identical molecules must be 1.0");
319 }
320
321 #[test]
322 fn tanimoto_ecfp4_methane_vs_benzene_lt_half() {
323 let t = tanimoto_ecfp4(&methane(), &benzene());
324 assert!(
325 t < 0.5,
326 "methane and benzene should be very dissimilar (tanimoto={t})"
327 );
328 }
329
330 #[test]
333 fn morgan_counts_radius0_atom_count() {
334 let m = benzene();
336 let counts = morgan_fp_counts(&m, 0);
337 let total: u32 = counts.values().sum();
338 assert_eq!(total, m.atom_count() as u32, "radius-0 total count == atom_count");
339 }
340
341 #[test]
342 fn morgan_counts_radius2_total_grows() {
343 let m = methane();
345 let n = m.atom_count() as u32;
346 let r = 2u32;
347 let counts = morgan_fp_counts(&m, r);
348 let total: u32 = counts.values().sum();
349 assert_eq!(total, n * (r + 1), "methane total = atom_count * (radius+1)");
350 }
351
352 #[test]
353 fn morgan_counts_benzene_symmetry() {
354 let m = benzene();
356 let counts = morgan_fp_counts(&m, 0);
357 assert_eq!(counts.len(), 1, "benzene has one unique radius-0 environment");
358 assert_eq!(*counts.values().next().unwrap(), 6, "that environment appears 6 times");
359 }
360
361 #[test]
362 fn morgan_counts_empty_mol_is_empty() {
363 use chematic_core::MoleculeBuilder;
364 let m = MoleculeBuilder::new().build();
365 let counts = morgan_fp_counts(&m, 2);
366 assert!(counts.is_empty(), "empty molecule yields empty count map");
367 }
368
369 #[test]
370 fn morgan_counts_deterministic() {
371 let m = aspirin();
372 let c1 = morgan_fp_counts(&m, 2);
373 let c2 = morgan_fp_counts(&m, 2);
374 assert_eq!(c1, c2, "morgan_fp_counts must be deterministic");
375 }
376
377 #[test]
378 fn morgan_counts_consistent_with_ecfp_bits() {
379 let m = toluene();
382 let fp = ecfp(&m, &EcfpConfig { radius: 2, nbits: 2048 });
383 let counts = morgan_fp_counts(&m, 2);
384 for &hash in counts.keys() {
385 let bit = (hash % 2048) as usize;
386 assert!(fp.get(bit), "bit {bit} from count map not set in ECFP bitvec");
387 }
388 }
389}