use chematic_core::{AtomIdx, BondOrder, Molecule, implicit_hcount};
use chematic_perception::find_sssr;
use crate::bitvec::BitVec2048;
const FNV_OFFSET: u64 = 14695981039346656037;
const FNV_PRIME: u64 = 1099511628211;
pub(crate) fn fnv1a(bytes: &[u8]) -> u64 {
let mut h = FNV_OFFSET;
for &b in bytes {
h ^= b as u64;
h = h.wrapping_mul(FNV_PRIME);
}
h
}
#[derive(Debug, Clone)]
pub struct EcfpConfig {
pub radius: u32,
pub nbits: usize,
pub use_chirality: bool,
}
impl Default for EcfpConfig {
fn default() -> Self {
Self { radius: 2, nbits: 2048, use_chirality: false }
}
}
#[inline]
pub(crate) fn bond_type_int(order: BondOrder) -> u8 {
match order {
BondOrder::Single | BondOrder::Up | BondOrder::Down => 1,
BondOrder::Double => 2,
BondOrder::Triple => 3,
BondOrder::Aromatic => 4,
BondOrder::Quadruple => 5,
}
}
pub fn ecfp(mol: &Molecule, config: &EcfpConfig) -> BitVec2048 {
let n = mol.atom_count();
let nbits = config.nbits;
let mut fp = BitVec2048::new();
if n == 0 {
return fp;
}
let ring_set = find_sssr(mol);
let mut ids: Vec<u64> = Vec::with_capacity(n);
for i in 0..n {
let idx = AtomIdx(i as u32);
let atom = mol.atom(idx);
let charge_adjusted = (atom.charge as i16 + 8) as u8;
let base_bytes = [
atom.element.atomic_number(),
mol.neighbors(idx).count() as u8,
implicit_hcount(mol, idx),
charge_adjusted,
ring_set.contains_atom(idx) as u8,
atom.aromatic as u8,
];
let id = if config.use_chirality {
use chematic_core::Chirality;
let chirality_byte = match atom.chirality {
Chirality::None => 0u8,
Chirality::CounterClockwise => 1u8,
Chirality::Clockwise => 2u8,
};
let mut chiral_bytes = base_bytes.to_vec();
chiral_bytes.push(chirality_byte);
fnv1a(&chiral_bytes)
} else {
fnv1a(&base_bytes)
};
fp.set((id % nbits as u64) as usize);
ids.push(id);
}
let mut new_ids: Vec<u64> = vec![0u64; n];
for r in 1..=config.radius {
for i in 0..n {
let idx = AtomIdx(i as u32);
let mut neighbor_info: Vec<(u8, u64)> = mol
.neighbors(idx)
.map(|(nb_idx, bond_idx)| {
let btype = bond_type_int(mol.bond(bond_idx).order);
(btype, ids[nb_idx.0 as usize])
})
.collect();
neighbor_info.sort_unstable();
let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
bytes.push(r as u8);
bytes.extend_from_slice(&ids[i].to_le_bytes());
for (btype, nb_id) in &neighbor_info {
bytes.push(*btype);
bytes.extend_from_slice(&nb_id.to_le_bytes());
}
let new_id = fnv1a(&bytes);
new_ids[i] = new_id;
fp.set((new_id % nbits as u64) as usize);
}
core::mem::swap(&mut ids, &mut new_ids);
}
fp
}
pub fn morgan_fp_counts(mol: &Molecule, radius: u32) -> std::collections::HashMap<u64, u32> {
use std::collections::HashMap;
let n = mol.atom_count();
let mut counts: HashMap<u64, u32> = HashMap::new();
if n == 0 {
return counts;
}
let ring_set = find_sssr(mol);
let mut ids: Vec<u64> = (0..n)
.map(|i| {
let idx = AtomIdx(i as u32);
let atom = mol.atom(idx);
let charge_adjusted = (atom.charge as i16 + 8) as u8;
fnv1a(&[
atom.element.atomic_number(),
mol.neighbors(idx).count() as u8,
implicit_hcount(mol, idx),
charge_adjusted,
ring_set.contains_atom(idx) as u8,
atom.aromatic as u8,
])
})
.collect();
for &id in &ids {
*counts.entry(id).or_insert(0) += 1;
}
let mut new_ids = vec![0u64; n];
for r in 1..=radius {
for i in 0..n {
let idx = AtomIdx(i as u32);
let mut neighbor_info: Vec<(u8, u64)> = mol
.neighbors(idx)
.map(|(nb_idx, bond_idx)| {
(bond_type_int(mol.bond(bond_idx).order), ids[nb_idx.0 as usize])
})
.collect();
neighbor_info.sort_unstable();
let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
bytes.push(r as u8);
bytes.extend_from_slice(&ids[i].to_le_bytes());
for (btype, nb_id) in &neighbor_info {
bytes.push(*btype);
bytes.extend_from_slice(&nb_id.to_le_bytes());
}
let new_id = fnv1a(&bytes);
new_ids[i] = new_id;
*counts.entry(new_id).or_insert(0) += 1;
}
core::mem::swap(&mut ids, &mut new_ids);
}
counts
}
pub fn ecfp4(mol: &Molecule) -> BitVec2048 {
ecfp(mol, &EcfpConfig::default())
}
pub fn ecfp6(mol: &Molecule) -> BitVec2048 {
ecfp(mol, &EcfpConfig { radius: 3, nbits: 2048, use_chirality: false })
}
pub fn tanimoto_ecfp4(a: &Molecule, b: &Molecule) -> f64 {
ecfp4(a).tanimoto(&ecfp4(b))
}
#[cfg(test)]
mod tests {
use super::*;
use chematic_smiles::parse;
fn benzene() -> Molecule {
parse("c1ccccc1").unwrap()
}
fn ethane() -> Molecule {
parse("CC").unwrap()
}
fn toluene() -> Molecule {
parse("Cc1ccccc1").unwrap()
}
fn aspirin() -> Molecule {
parse("CC(=O)Oc1ccccc1C(=O)O").unwrap()
}
fn methane() -> Molecule {
parse("C").unwrap()
}
fn water() -> Molecule {
parse("O").unwrap()
}
#[test]
fn benzene_ecfp4_nonzero() {
let fp = ecfp4(&benzene());
assert!(fp.popcount() > 0, "benzene ECFP4 must be non-zero");
}
#[test]
fn benzene_ecfp4_deterministic() {
let fp1 = ecfp4(&benzene());
let fp2 = ecfp4(&benzene());
assert_eq!(fp1, fp2, "ECFP4 must be deterministic");
}
#[test]
fn ethane_vs_benzene_tanimoto_lt1() {
let t = tanimoto_ecfp4(ðane(), &benzene());
assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
}
#[test]
fn benzene_vs_benzene_tanimoto_eq1() {
let t = tanimoto_ecfp4(&benzene(), &benzene());
assert_eq!(t, 1.0, "identical molecules must have tanimoto == 1.0");
}
#[test]
fn benzene_vs_toluene_tanimoto_between() {
let t = tanimoto_ecfp4(&benzene(), &toluene());
assert!(t > 0.0, "benzene and toluene share bits (tanimoto={t})");
assert!(t < 1.0, "benzene and toluene are not identical (tanimoto={t})");
}
#[test]
fn aspirin_ecfp4_many_bits() {
let fp = ecfp4(&aspirin());
assert!(
fp.popcount() > 5,
"aspirin ECFP4 must have more than 5 bits set (got {})",
fp.popcount()
);
}
#[test]
fn ecfp6_vs_ecfp4_benzene_differ() {
let fp4 = ecfp4(&benzene());
let fp6 = ecfp6(&benzene());
assert_ne!(
fp4.popcount(),
fp6.popcount(),
"ECFP6 and ECFP4 should produce different bit counts for benzene"
);
}
#[test]
fn methane_ecfp4_nonzero() {
let fp = ecfp4(&methane());
assert!(fp.popcount() > 0, "methane ECFP4 must be non-zero");
}
#[test]
fn water_ecfp4_nonzero() {
let fp = ecfp4(&water());
assert!(fp.popcount() > 0, "water ECFP4 must be non-zero");
}
#[test]
fn tanimoto_ecfp4_benzene_self_is_one() {
let t = tanimoto_ecfp4(&benzene(), &benzene());
assert_eq!(t, 1.0, "tanimoto_ecfp4 of identical molecules must be 1.0");
}
#[test]
fn tanimoto_ecfp4_methane_vs_benzene_lt_half() {
let t = tanimoto_ecfp4(&methane(), &benzene());
assert!(
t < 0.5,
"methane and benzene should be very dissimilar (tanimoto={t})"
);
}
#[test]
fn morgan_counts_radius0_atom_count() {
let m = benzene();
let counts = morgan_fp_counts(&m, 0);
let total: u32 = counts.values().sum();
assert_eq!(total, m.atom_count() as u32, "radius-0 total count == atom_count");
}
#[test]
fn morgan_counts_radius2_total_grows() {
let m = methane();
let n = m.atom_count() as u32;
let r = 2u32;
let counts = morgan_fp_counts(&m, r);
let total: u32 = counts.values().sum();
assert_eq!(total, n * (r + 1), "methane total = atom_count * (radius+1)");
}
#[test]
fn morgan_counts_benzene_symmetry() {
let m = benzene();
let counts = morgan_fp_counts(&m, 0);
assert_eq!(counts.len(), 1, "benzene has one unique radius-0 environment");
assert_eq!(*counts.values().next().unwrap(), 6, "that environment appears 6 times");
}
#[test]
fn morgan_counts_empty_mol_is_empty() {
use chematic_core::MoleculeBuilder;
let m = MoleculeBuilder::new().build();
let counts = morgan_fp_counts(&m, 2);
assert!(counts.is_empty(), "empty molecule yields empty count map");
}
#[test]
fn morgan_counts_deterministic() {
let m = aspirin();
let c1 = morgan_fp_counts(&m, 2);
let c2 = morgan_fp_counts(&m, 2);
assert_eq!(c1, c2, "morgan_fp_counts must be deterministic");
}
#[test]
fn morgan_counts_consistent_with_ecfp_bits() {
let m = toluene();
let fp = ecfp(&m, &EcfpConfig { radius: 2, nbits: 2048, use_chirality: false });
let counts = morgan_fp_counts(&m, 2);
for &hash in counts.keys() {
let bit = (hash % 2048) as usize;
assert!(fp.get(bit), "bit {bit} from count map not set in ECFP bitvec");
}
}
#[test]
fn ecfp4_ignores_chirality_by_default() {
let l_ala = parse("N[C@@H](C)C(=O)O").unwrap();
let d_ala = parse("N[C@H](C)C(=O)O").unwrap();
let fp_l = ecfp4(&l_ala);
let fp_d = ecfp4(&d_ala);
assert_eq!(fp_l, fp_d, "L/D-alanine ECFP4 should be identical when use_chirality=false");
}
#[test]
fn ecfp4_distinguishes_enantiomers_with_chirality() {
let l_ala = parse("N[C@@H](C)C(=O)O").unwrap();
let d_ala = parse("N[C@H](C)C(=O)O").unwrap();
let config = EcfpConfig { radius: 2, nbits: 2048, use_chirality: true };
let fp_l = ecfp(&l_ala, &config);
let fp_d = ecfp(&d_ala, &config);
assert_ne!(fp_l, fp_d, "L/D-alanine ECFP4 must differ when use_chirality=true");
assert!(fp_l.tanimoto(&fp_d) < 1.0, "Tanimoto of L/D-alanine must be < 1.0 with use_chirality");
}
}