1use chematic_core::{AtomIdx, BondOrder, Molecule, implicit_hcount};
6use chematic_perception::find_sssr;
7
8use crate::bitvec::BitVec2048;
9
10const FNV_OFFSET: u64 = 14695981039346656037;
11const FNV_PRIME: u64 = 1099511628211;
12
13pub(crate) fn fnv1a(bytes: &[u8]) -> u64 {
15 let mut h = FNV_OFFSET;
16 for &b in bytes {
17 h ^= b as u64;
18 h = h.wrapping_mul(FNV_PRIME);
19 }
20 h
21}
22
23fn expand_atom_id(mol: &Molecule, i: usize, r: u32, ids: &[u64]) -> u64 {
28 let idx = AtomIdx(i as u32);
29 let mut neighbor_info: Vec<(u8, u64)> = mol
30 .neighbors(idx)
31 .map(|(nb_idx, bond_idx)| (bond_type_int(mol.bond(bond_idx).order), ids[nb_idx.0 as usize]))
32 .collect();
33 neighbor_info.sort_unstable();
34
35 let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
36 bytes.push(r as u8);
37 bytes.extend_from_slice(&ids[i].to_le_bytes());
38 for (btype, nb_id) in &neighbor_info {
39 bytes.push(*btype);
40 bytes.extend_from_slice(&nb_id.to_le_bytes());
41 }
42 fnv1a(&bytes)
43}
44
45pub(crate) fn initial_atom_id(
52 mol: &Molecule,
53 idx: AtomIdx,
54 ring_set: &chematic_perception::RingSet,
55 use_chirality: bool,
56) -> u64 {
57 let atom = mol.atom(idx);
58 let charge_adjusted = (atom.charge as i16 + 8).clamp(0, 255) as u8;
59 let base_bytes = [
60 atom.element.atomic_number(),
61 mol.neighbors(idx).count().min(255) as u8,
62 implicit_hcount(mol, idx),
63 charge_adjusted,
64 ring_set.contains_atom(idx) as u8,
65 atom.aromatic as u8,
66 ];
67 if use_chirality {
68 use chematic_core::Chirality;
69 let chirality_byte = match atom.chirality {
70 Chirality::None => 0u8,
71 Chirality::CounterClockwise => 1u8,
72 Chirality::Clockwise => 2u8,
73 };
74 let mut chiral_bytes = base_bytes.to_vec();
75 chiral_bytes.push(chirality_byte);
76 fnv1a(&chiral_bytes)
77 } else {
78 fnv1a(&base_bytes)
79 }
80}
81
82#[derive(Debug, Clone)]
84pub struct EcfpConfig {
85 pub radius: u32,
87 pub nbits: usize,
89 pub use_chirality: bool,
94 pub use_double_fold: bool,
100}
101
102impl Default for EcfpConfig {
103 fn default() -> Self {
104 Self {
105 radius: 2,
106 nbits: 2048,
107 use_chirality: false,
108 use_double_fold: false,
109 }
110 }
111}
112
113#[inline]
121pub(crate) fn bond_type_int(order: BondOrder) -> u8 {
122 match order {
123 BondOrder::Single | BondOrder::Up | BondOrder::Down | BondOrder::Dative => 1,
124 BondOrder::Double => 2,
125 BondOrder::Triple => 3,
126 BondOrder::Aromatic => 4,
127 BondOrder::Quadruple => 5,
128 BondOrder::Zero => 0,
129 BondOrder::QueryAny => 6,
130 BondOrder::QuerySingleOrDouble => 7,
131 BondOrder::QuerySingleOrAromatic => 8,
132 BondOrder::QueryDoubleOrAromatic => 9,
133 }
134}
135
136pub const MAX_ECFP_RADIUS: u32 = 20;
147
148pub fn ecfp(mol: &Molecule, config: &EcfpConfig) -> BitVec2048 {
149 let n = mol.atom_count();
150 let nbits = config.nbits;
151 let config = &EcfpConfig {
153 radius: config.radius.min(MAX_ECFP_RADIUS),
154 ..*config
155 };
156 let mut fp = BitVec2048::new();
157
158 if n == 0 {
159 return fp;
160 }
161
162 let ring_set = find_sssr(mol);
163
164 let mut ids: Vec<u64> = Vec::with_capacity(n);
166 for i in 0..n {
167 let idx = AtomIdx(i as u32);
168 let id = initial_atom_id(mol, idx, &ring_set, config.use_chirality);
169 fp.set((id % nbits as u64) as usize);
170 if config.use_double_fold {
171 fp.set(((id >> 11) % nbits as u64) as usize);
172 }
173 ids.push(id);
174 }
175
176 let mut new_ids: Vec<u64> = vec![0u64; n];
178 for r in 1..=config.radius {
179 for (i, slot) in new_ids.iter_mut().enumerate() {
180 let new_id = expand_atom_id(mol, i, r, &ids);
181 *slot = new_id;
182 fp.set((new_id % nbits as u64) as usize);
183 if config.use_double_fold {
184 fp.set(((new_id >> 11) % nbits as u64) as usize);
185 }
186 }
187 core::mem::swap(&mut ids, &mut new_ids);
188 }
189
190 fp
191}
192
193pub fn morgan_fp_counts(mol: &Molecule, radius: u32) -> std::collections::HashMap<u64, u32> {
203 use std::collections::HashMap;
204
205 const MAX_RADIUS: u32 = 20;
206 let radius = radius.min(MAX_RADIUS);
207
208 let n = mol.atom_count();
209 let mut counts: HashMap<u64, u32> = HashMap::new();
210
211 if n == 0 {
212 return counts;
213 }
214
215 let ring_set = find_sssr(mol);
216
217 let mut ids: Vec<u64> = (0..n)
219 .map(|i| initial_atom_id(mol, AtomIdx(i as u32), &ring_set, false))
220 .collect();
221
222 for &id in &ids {
223 *counts.entry(id).or_insert(0) += 1;
224 }
225
226 let mut new_ids = vec![0u64; n];
228 for r in 1..=radius {
229 for (i, slot) in new_ids.iter_mut().enumerate() {
230 let new_id = expand_atom_id(mol, i, r, &ids);
231 *slot = new_id;
232 *counts.entry(new_id).or_insert(0) += 1;
233 }
234 core::mem::swap(&mut ids, &mut new_ids);
235 }
236
237 counts
238}
239
240pub fn ecfp4(mol: &Molecule) -> BitVec2048 {
242 ecfp(mol, &EcfpConfig::default())
243}
244
245pub fn ecfp6(mol: &Molecule) -> BitVec2048 {
247 ecfp(mol, &EcfpConfig { radius: 3, ..EcfpConfig::default() })
248}
249
250pub fn tanimoto_ecfp4(a: &Molecule, b: &Molecule) -> f64 {
252 ecfp4(a).tanimoto(&ecfp4(b))
253}
254
255#[cfg(test)]
256mod tests {
257 use super::*;
258 use chematic_smiles::parse;
259
260 fn benzene() -> Molecule {
261 parse("c1ccccc1").unwrap()
262 }
263
264 fn ethane() -> Molecule {
265 parse("CC").unwrap()
266 }
267
268 fn toluene() -> Molecule {
269 parse("Cc1ccccc1").unwrap()
270 }
271
272 fn aspirin() -> Molecule {
273 parse("CC(=O)Oc1ccccc1C(=O)O").unwrap()
275 }
276
277 fn methane() -> Molecule {
278 parse("C").unwrap()
279 }
280
281 fn water() -> Molecule {
282 parse("O").unwrap()
283 }
284
285 #[test]
286 fn benzene_ecfp4_nonzero() {
287 let fp = ecfp4(&benzene());
288 assert!(fp.popcount() > 0, "benzene ECFP4 must be non-zero");
289 }
290
291 #[test]
292 fn benzene_ecfp4_deterministic() {
293 let fp1 = ecfp4(&benzene());
294 let fp2 = ecfp4(&benzene());
295 assert_eq!(fp1, fp2, "ECFP4 must be deterministic");
296 }
297
298 #[test]
299 fn ethane_vs_benzene_tanimoto_lt1() {
300 let t = tanimoto_ecfp4(ðane(), &benzene());
301 assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
302 }
303
304 #[test]
305 fn benzene_vs_benzene_tanimoto_eq1() {
306 let t = tanimoto_ecfp4(&benzene(), &benzene());
307 assert_eq!(t, 1.0, "identical molecules must have tanimoto == 1.0");
308 }
309
310 #[test]
311 fn benzene_vs_toluene_tanimoto_between() {
312 let t = tanimoto_ecfp4(&benzene(), &toluene());
313 assert!(t > 0.0, "benzene and toluene share bits (tanimoto={t})");
314 assert!(
315 t < 1.0,
316 "benzene and toluene are not identical (tanimoto={t})"
317 );
318 }
319
320 #[test]
321 fn aspirin_ecfp4_many_bits() {
322 let fp = ecfp4(&aspirin());
323 assert!(
324 fp.popcount() > 5,
325 "aspirin ECFP4 must have more than 5 bits set (got {})",
326 fp.popcount()
327 );
328 }
329
330 #[test]
331 fn ecfp6_vs_ecfp4_benzene_differ() {
332 let fp4 = ecfp4(&benzene());
333 let fp6 = ecfp6(&benzene());
334 assert_ne!(
337 fp4.popcount(),
338 fp6.popcount(),
339 "ECFP6 and ECFP4 should produce different bit counts for benzene"
340 );
341 }
342
343 #[test]
344 fn methane_ecfp4_nonzero() {
345 let fp = ecfp4(&methane());
346 assert!(fp.popcount() > 0, "methane ECFP4 must be non-zero");
347 }
348
349 #[test]
350 fn water_ecfp4_nonzero() {
351 let fp = ecfp4(&water());
352 assert!(fp.popcount() > 0, "water ECFP4 must be non-zero");
353 }
354
355 #[test]
356 fn tanimoto_ecfp4_benzene_self_is_one() {
357 let t = tanimoto_ecfp4(&benzene(), &benzene());
358 assert_eq!(t, 1.0, "tanimoto_ecfp4 of identical molecules must be 1.0");
359 }
360
361 #[test]
362 fn tanimoto_ecfp4_methane_vs_benzene_lt_half() {
363 let t = tanimoto_ecfp4(&methane(), &benzene());
364 assert!(
365 t < 0.5,
366 "methane and benzene should be very dissimilar (tanimoto={t})"
367 );
368 }
369
370 #[test]
373 fn morgan_counts_radius0_atom_count() {
374 let m = benzene();
376 let counts = morgan_fp_counts(&m, 0);
377 let total: u32 = counts.values().sum();
378 assert_eq!(
379 total,
380 m.atom_count() as u32,
381 "radius-0 total count == atom_count"
382 );
383 }
384
385 #[test]
386 fn morgan_counts_radius2_total_grows() {
387 let m = methane();
389 let n = m.atom_count() as u32;
390 let r = 2u32;
391 let counts = morgan_fp_counts(&m, r);
392 let total: u32 = counts.values().sum();
393 assert_eq!(
394 total,
395 n * (r + 1),
396 "methane total = atom_count * (radius+1)"
397 );
398 }
399
400 #[test]
401 fn morgan_counts_benzene_symmetry() {
402 let m = benzene();
404 let counts = morgan_fp_counts(&m, 0);
405 assert_eq!(
406 counts.len(),
407 1,
408 "benzene has one unique radius-0 environment"
409 );
410 assert_eq!(
411 *counts.values().next().unwrap(),
412 6,
413 "that environment appears 6 times"
414 );
415 }
416
417 #[test]
418 fn morgan_counts_empty_mol_is_empty() {
419 use chematic_core::MoleculeBuilder;
420 let m = MoleculeBuilder::new().build();
421 let counts = morgan_fp_counts(&m, 2);
422 assert!(counts.is_empty(), "empty molecule yields empty count map");
423 }
424
425 #[test]
426 fn morgan_counts_deterministic() {
427 let m = aspirin();
428 let c1 = morgan_fp_counts(&m, 2);
429 let c2 = morgan_fp_counts(&m, 2);
430 assert_eq!(c1, c2, "morgan_fp_counts must be deterministic");
431 }
432
433 #[test]
434 fn morgan_counts_consistent_with_ecfp_bits() {
435 let m = toluene();
438 let fp = ecfp(
439 &m,
440 &EcfpConfig {
441 radius: 2,
442 nbits: 2048,
443 use_chirality: false,
444 use_double_fold: false,
445 },
446 );
447 let counts = morgan_fp_counts(&m, 2);
448 for &hash in counts.keys() {
449 let bit = (hash % 2048) as usize;
450 assert!(
451 fp.get(bit),
452 "bit {bit} from count map not set in ECFP bitvec"
453 );
454 }
455 }
456
457 #[test]
460 fn ecfp4_ignores_chirality_by_default() {
461 let l_ala = parse("N[C@@H](C)C(=O)O").unwrap();
464 let d_ala = parse("N[C@H](C)C(=O)O").unwrap();
465 let fp_l = ecfp4(&l_ala);
466 let fp_d = ecfp4(&d_ala);
467 assert_eq!(
468 fp_l, fp_d,
469 "L/D-alanine ECFP4 should be identical when use_chirality=false"
470 );
471 }
472
473 #[test]
474 fn ecfp4_distinguishes_enantiomers_with_chirality() {
475 let l_ala = parse("N[C@@H](C)C(=O)O").unwrap();
477 let d_ala = parse("N[C@H](C)C(=O)O").unwrap();
478 let config = EcfpConfig {
479 radius: 2,
480 nbits: 2048,
481 use_chirality: true,
482 use_double_fold: false,
483 };
484 let fp_l = ecfp(&l_ala, &config);
485 let fp_d = ecfp(&d_ala, &config);
486 assert_ne!(
487 fp_l, fp_d,
488 "L/D-alanine ECFP4 must differ when use_chirality=true"
489 );
490 assert!(
492 fp_l.tanimoto(&fp_d) < 1.0,
493 "Tanimoto of L/D-alanine must be < 1.0 with use_chirality"
494 );
495 }
496
497 #[test]
498 fn ecfp4_non_chiral_generates_with_chirality_flag() {
499 let mol = parse("c1ccccc1").unwrap(); let config = EcfpConfig {
503 radius: 2,
504 nbits: 2048,
505 use_chirality: true,
506 use_double_fold: false,
507 };
508 let fp = ecfp(&mol, &config);
509 assert!(fp.popcount() > 0, "Benzene should generate non-empty ECFP4 with use_chirality=true");
510 }
511}