1use chematic_core::{AtomIdx, BondOrder, Molecule, implicit_hcount};
6use chematic_perception::find_sssr;
7
8use crate::bitvec::BitVec2048;
9
10const FNV_OFFSET: u64 = 14695981039346656037;
15const FNV_PRIME: u64 = 1099511628211;
16
17fn fnv1a(bytes: &[u8]) -> u64 {
19 let mut h = FNV_OFFSET;
20 for &b in bytes {
21 h ^= b as u64;
22 h = h.wrapping_mul(FNV_PRIME);
23 }
24 h
25}
26
27#[derive(Debug, Clone)]
33pub struct EcfpConfig {
34 pub radius: u32,
36 pub nbits: usize,
38}
39
40impl Default for EcfpConfig {
41 fn default() -> Self {
42 Self { radius: 2, nbits: 2048 }
43 }
44}
45
46#[inline]
58fn bond_type_int(order: BondOrder) -> u8 {
59 match order {
60 BondOrder::Single | BondOrder::Up | BondOrder::Down => 1,
61 BondOrder::Double => 2,
62 BondOrder::Triple => 3,
63 BondOrder::Aromatic => 4,
64 BondOrder::Quadruple => 5,
65 }
66}
67
68pub fn ecfp(mol: &Molecule, config: &EcfpConfig) -> BitVec2048 {
81 let n = mol.atom_count();
82 let nbits = config.nbits;
83 let mut fp = BitVec2048::new();
84
85 if n == 0 {
86 return fp;
87 }
88
89 let ring_set = find_sssr(mol);
91
92 let mut ids: Vec<u64> = Vec::with_capacity(n);
94
95 for i in 0..n {
96 let idx = AtomIdx(i as u32);
97 let atom = mol.atom(idx);
98
99 let atomic_number = atom.element.atomic_number();
100 let degree = mol.neighbors(idx).count() as u8;
101 let h_count = implicit_hcount(mol, idx);
102 let charge_adjusted = (atom.charge as i16 + 8) as u8;
105 let is_in_ring: u8 = if ring_set.contains_atom(idx) { 1 } else { 0 };
106 let is_aromatic: u8 = if atom.aromatic { 1 } else { 0 };
107
108 let id = fnv1a(&[
109 atomic_number,
110 degree,
111 h_count,
112 charge_adjusted,
113 is_in_ring,
114 is_aromatic,
115 ]);
116
117 fp.set((id % nbits as u64) as usize);
119 ids.push(id);
120 }
121
122 let mut new_ids: Vec<u64> = vec![0u64; n];
124
125 for r in 1..=config.radius {
126 for i in 0..n {
127 let idx = AtomIdx(i as u32);
128
129 let mut neighbor_info: Vec<(u8, u64)> = mol
131 .neighbors(idx)
132 .map(|(nb_idx, bond_idx)| {
133 let bond = mol.bond(bond_idx);
134 let btype = bond_type_int(bond.order);
135 let nb_id = ids[nb_idx.0 as usize];
136 (btype, nb_id)
137 })
138 .collect();
139
140 neighbor_info.sort_unstable();
142
143 let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
146 bytes.push(r as u8);
147 bytes.extend_from_slice(&ids[i].to_le_bytes());
148 for (btype, nb_id) in &neighbor_info {
149 bytes.push(*btype);
150 bytes.extend_from_slice(&nb_id.to_le_bytes());
151 }
152
153 let new_id = fnv1a(&bytes);
154 new_ids[i] = new_id;
155
156 fp.set((new_id % nbits as u64) as usize);
158 }
159
160 core::mem::swap(&mut ids, &mut new_ids);
162 }
163
164 fp
165}
166
167pub fn ecfp4(mol: &Molecule) -> BitVec2048 {
173 ecfp(mol, &EcfpConfig { radius: 2, nbits: 2048 })
174}
175
176pub fn ecfp6(mol: &Molecule) -> BitVec2048 {
178 ecfp(mol, &EcfpConfig { radius: 3, nbits: 2048 })
179}
180
181pub fn tanimoto_ecfp4(a: &Molecule, b: &Molecule) -> f64 {
183 ecfp4(a).tanimoto(&ecfp4(b))
184}
185
186#[cfg(test)]
191mod tests {
192 use super::*;
193 use chematic_smiles::parse;
194
195 fn benzene() -> Molecule {
196 parse("c1ccccc1").unwrap()
197 }
198
199 fn ethane() -> Molecule {
200 parse("CC").unwrap()
201 }
202
203 fn toluene() -> Molecule {
204 parse("Cc1ccccc1").unwrap()
205 }
206
207 fn aspirin() -> Molecule {
208 parse("CC(=O)Oc1ccccc1C(=O)O").unwrap()
210 }
211
212 fn methane() -> Molecule {
213 parse("C").unwrap()
214 }
215
216 fn water() -> Molecule {
217 parse("O").unwrap()
218 }
219
220 #[test]
221 fn benzene_ecfp4_nonzero() {
222 let fp = ecfp4(&benzene());
223 assert!(fp.popcount() > 0, "benzene ECFP4 must be non-zero");
224 }
225
226 #[test]
227 fn benzene_ecfp4_deterministic() {
228 let fp1 = ecfp4(&benzene());
229 let fp2 = ecfp4(&benzene());
230 assert_eq!(fp1, fp2, "ECFP4 must be deterministic");
231 }
232
233 #[test]
234 fn ethane_vs_benzene_tanimoto_lt1() {
235 let t = tanimoto_ecfp4(ðane(), &benzene());
236 assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
237 }
238
239 #[test]
240 fn benzene_vs_benzene_tanimoto_eq1() {
241 let t = tanimoto_ecfp4(&benzene(), &benzene());
242 assert_eq!(t, 1.0, "identical molecules must have tanimoto == 1.0");
243 }
244
245 #[test]
246 fn benzene_vs_toluene_tanimoto_between() {
247 let t = tanimoto_ecfp4(&benzene(), &toluene());
248 assert!(t > 0.0, "benzene and toluene share bits (tanimoto={t})");
249 assert!(t < 1.0, "benzene and toluene are not identical (tanimoto={t})");
250 }
251
252 #[test]
253 fn aspirin_ecfp4_many_bits() {
254 let fp = ecfp4(&aspirin());
255 assert!(
256 fp.popcount() > 5,
257 "aspirin ECFP4 must have more than 5 bits set (got {})",
258 fp.popcount()
259 );
260 }
261
262 #[test]
263 fn ecfp6_vs_ecfp4_benzene_differ() {
264 let fp4 = ecfp4(&benzene());
265 let fp6 = ecfp6(&benzene());
266 assert_ne!(
269 fp4.popcount(),
270 fp6.popcount(),
271 "ECFP6 and ECFP4 should produce different bit counts for benzene"
272 );
273 }
274
275 #[test]
276 fn methane_ecfp4_nonzero() {
277 let fp = ecfp4(&methane());
278 assert!(fp.popcount() > 0, "methane ECFP4 must be non-zero");
279 }
280
281 #[test]
282 fn water_ecfp4_nonzero() {
283 let fp = ecfp4(&water());
284 assert!(fp.popcount() > 0, "water ECFP4 must be non-zero");
285 }
286
287 #[test]
288 fn tanimoto_ecfp4_benzene_self_is_one() {
289 let t = tanimoto_ecfp4(&benzene(), &benzene());
290 assert_eq!(t, 1.0, "tanimoto_ecfp4 of identical molecules must be 1.0");
291 }
292
293 #[test]
294 fn tanimoto_ecfp4_methane_vs_benzene_lt_half() {
295 let t = tanimoto_ecfp4(&methane(), &benzene());
296 assert!(
297 t < 0.5,
298 "methane and benzene should be very dissimilar (tanimoto={t})"
299 );
300 }
301}