use chematic_core::{AtomIdx, BondOrder, Molecule, implicit_hcount};
use chematic_perception::find_sssr;
use crate::bitvec::BitVec2048;
const FNV_OFFSET: u64 = 14695981039346656037;
const FNV_PRIME: u64 = 1099511628211;
pub(crate) fn fnv1a(bytes: &[u8]) -> u64 {
let mut h = FNV_OFFSET;
for &b in bytes {
h ^= b as u64;
h = h.wrapping_mul(FNV_PRIME);
}
h
}
fn expand_atom_id(mol: &Molecule, i: usize, r: u32, ids: &[u64]) -> u64 {
let idx = AtomIdx(i as u32);
let mut neighbor_info: Vec<(u8, u64)> = mol
.neighbors(idx)
.map(|(nb_idx, bond_idx)| (bond_type_int(mol.bond(bond_idx).order), ids[nb_idx.0 as usize]))
.collect();
neighbor_info.sort_unstable();
let mut bytes: Vec<u8> = Vec::with_capacity(1 + 8 + neighbor_info.len() * 9);
bytes.push(r as u8);
bytes.extend_from_slice(&ids[i].to_le_bytes());
for (btype, nb_id) in &neighbor_info {
bytes.push(*btype);
bytes.extend_from_slice(&nb_id.to_le_bytes());
}
fnv1a(&bytes)
}
pub(crate) fn initial_atom_id(
mol: &Molecule,
idx: AtomIdx,
ring_set: &chematic_perception::RingSet,
use_chirality: bool,
) -> u64 {
let atom = mol.atom(idx);
let charge_adjusted = (atom.charge as i16 + 8).clamp(0, 255) as u8;
let base_bytes = [
atom.element.atomic_number(),
mol.neighbors(idx).count().min(255) as u8,
implicit_hcount(mol, idx),
charge_adjusted,
ring_set.contains_atom(idx) as u8,
atom.aromatic as u8,
];
if use_chirality {
use chematic_core::Chirality;
let chirality_byte = match atom.chirality {
Chirality::None => 0u8,
Chirality::CounterClockwise => 1u8,
Chirality::Clockwise => 2u8,
};
let mut chiral_bytes = base_bytes.to_vec();
chiral_bytes.push(chirality_byte);
fnv1a(&chiral_bytes)
} else {
fnv1a(&base_bytes)
}
}
#[derive(Debug, Clone)]
pub struct EcfpConfig {
pub radius: u32,
pub nbits: usize,
pub use_chirality: bool,
pub use_double_fold: bool,
}
impl Default for EcfpConfig {
fn default() -> Self {
Self {
radius: 2,
nbits: 2048,
use_chirality: false,
use_double_fold: false,
}
}
}
#[inline]
pub(crate) fn bond_type_int(order: BondOrder) -> u8 {
match order {
BondOrder::Single | BondOrder::Up | BondOrder::Down | BondOrder::Dative => 1,
BondOrder::Double => 2,
BondOrder::Triple => 3,
BondOrder::Aromatic => 4,
BondOrder::Quadruple => 5,
BondOrder::Zero => 0,
BondOrder::QueryAny => 6,
BondOrder::QuerySingleOrDouble => 7,
BondOrder::QuerySingleOrAromatic => 8,
BondOrder::QueryDoubleOrAromatic => 9,
}
}
pub const MAX_ECFP_RADIUS: u32 = 20;
pub fn ecfp(mol: &Molecule, config: &EcfpConfig) -> BitVec2048 {
let n = mol.atom_count();
let nbits = config.nbits;
let config = &EcfpConfig {
radius: config.radius.min(MAX_ECFP_RADIUS),
..*config
};
let mut fp = BitVec2048::new();
if n == 0 {
return fp;
}
let ring_set = find_sssr(mol);
let mut ids: Vec<u64> = Vec::with_capacity(n);
for i in 0..n {
let idx = AtomIdx(i as u32);
let id = initial_atom_id(mol, idx, &ring_set, config.use_chirality);
fp.set((id % nbits as u64) as usize);
if config.use_double_fold {
fp.set(((id >> 11) % nbits as u64) as usize);
}
ids.push(id);
}
let mut new_ids: Vec<u64> = vec![0u64; n];
for r in 1..=config.radius {
for (i, slot) in new_ids.iter_mut().enumerate() {
let new_id = expand_atom_id(mol, i, r, &ids);
*slot = new_id;
fp.set((new_id % nbits as u64) as usize);
if config.use_double_fold {
fp.set(((new_id >> 11) % nbits as u64) as usize);
}
}
core::mem::swap(&mut ids, &mut new_ids);
}
fp
}
pub fn morgan_fp_counts(mol: &Molecule, radius: u32) -> std::collections::HashMap<u64, u32> {
use std::collections::HashMap;
const MAX_RADIUS: u32 = 20;
let radius = radius.min(MAX_RADIUS);
let n = mol.atom_count();
let mut counts: HashMap<u64, u32> = HashMap::new();
if n == 0 {
return counts;
}
let ring_set = find_sssr(mol);
let mut ids: Vec<u64> = (0..n)
.map(|i| initial_atom_id(mol, AtomIdx(i as u32), &ring_set, false))
.collect();
for &id in &ids {
*counts.entry(id).or_insert(0) += 1;
}
let mut new_ids = vec![0u64; n];
for r in 1..=radius {
for (i, slot) in new_ids.iter_mut().enumerate() {
let new_id = expand_atom_id(mol, i, r, &ids);
*slot = new_id;
*counts.entry(new_id).or_insert(0) += 1;
}
core::mem::swap(&mut ids, &mut new_ids);
}
counts
}
pub fn ecfp4(mol: &Molecule) -> BitVec2048 {
ecfp(mol, &EcfpConfig::default())
}
pub fn ecfp6(mol: &Molecule) -> BitVec2048 {
ecfp(mol, &EcfpConfig { radius: 3, ..EcfpConfig::default() })
}
pub fn tanimoto_ecfp4(a: &Molecule, b: &Molecule) -> f64 {
ecfp4(a).tanimoto(&ecfp4(b))
}
#[cfg(test)]
mod tests {
use super::*;
use chematic_smiles::parse;
fn benzene() -> Molecule {
parse("c1ccccc1").unwrap()
}
fn ethane() -> Molecule {
parse("CC").unwrap()
}
fn toluene() -> Molecule {
parse("Cc1ccccc1").unwrap()
}
fn aspirin() -> Molecule {
parse("CC(=O)Oc1ccccc1C(=O)O").unwrap()
}
fn methane() -> Molecule {
parse("C").unwrap()
}
fn water() -> Molecule {
parse("O").unwrap()
}
#[test]
fn benzene_ecfp4_nonzero() {
let fp = ecfp4(&benzene());
assert!(fp.popcount() > 0, "benzene ECFP4 must be non-zero");
}
#[test]
fn benzene_ecfp4_deterministic() {
let fp1 = ecfp4(&benzene());
let fp2 = ecfp4(&benzene());
assert_eq!(fp1, fp2, "ECFP4 must be deterministic");
}
#[test]
fn ethane_vs_benzene_tanimoto_lt1() {
let t = tanimoto_ecfp4(ðane(), &benzene());
assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
}
#[test]
fn benzene_vs_benzene_tanimoto_eq1() {
let t = tanimoto_ecfp4(&benzene(), &benzene());
assert_eq!(t, 1.0, "identical molecules must have tanimoto == 1.0");
}
#[test]
fn benzene_vs_toluene_tanimoto_between() {
let t = tanimoto_ecfp4(&benzene(), &toluene());
assert!(t > 0.0, "benzene and toluene share bits (tanimoto={t})");
assert!(
t < 1.0,
"benzene and toluene are not identical (tanimoto={t})"
);
}
#[test]
fn aspirin_ecfp4_many_bits() {
let fp = ecfp4(&aspirin());
assert!(
fp.popcount() > 5,
"aspirin ECFP4 must have more than 5 bits set (got {})",
fp.popcount()
);
}
#[test]
fn ecfp6_vs_ecfp4_benzene_differ() {
let fp4 = ecfp4(&benzene());
let fp6 = ecfp6(&benzene());
assert_ne!(
fp4.popcount(),
fp6.popcount(),
"ECFP6 and ECFP4 should produce different bit counts for benzene"
);
}
#[test]
fn methane_ecfp4_nonzero() {
let fp = ecfp4(&methane());
assert!(fp.popcount() > 0, "methane ECFP4 must be non-zero");
}
#[test]
fn water_ecfp4_nonzero() {
let fp = ecfp4(&water());
assert!(fp.popcount() > 0, "water ECFP4 must be non-zero");
}
#[test]
fn tanimoto_ecfp4_benzene_self_is_one() {
let t = tanimoto_ecfp4(&benzene(), &benzene());
assert_eq!(t, 1.0, "tanimoto_ecfp4 of identical molecules must be 1.0");
}
#[test]
fn tanimoto_ecfp4_methane_vs_benzene_lt_half() {
let t = tanimoto_ecfp4(&methane(), &benzene());
assert!(
t < 0.5,
"methane and benzene should be very dissimilar (tanimoto={t})"
);
}
#[test]
fn morgan_counts_radius0_atom_count() {
let m = benzene();
let counts = morgan_fp_counts(&m, 0);
let total: u32 = counts.values().sum();
assert_eq!(
total,
m.atom_count() as u32,
"radius-0 total count == atom_count"
);
}
#[test]
fn morgan_counts_radius2_total_grows() {
let m = methane();
let n = m.atom_count() as u32;
let r = 2u32;
let counts = morgan_fp_counts(&m, r);
let total: u32 = counts.values().sum();
assert_eq!(
total,
n * (r + 1),
"methane total = atom_count * (radius+1)"
);
}
#[test]
fn morgan_counts_benzene_symmetry() {
let m = benzene();
let counts = morgan_fp_counts(&m, 0);
assert_eq!(
counts.len(),
1,
"benzene has one unique radius-0 environment"
);
assert_eq!(
*counts.values().next().unwrap(),
6,
"that environment appears 6 times"
);
}
#[test]
fn morgan_counts_empty_mol_is_empty() {
use chematic_core::MoleculeBuilder;
let m = MoleculeBuilder::new().build();
let counts = morgan_fp_counts(&m, 2);
assert!(counts.is_empty(), "empty molecule yields empty count map");
}
#[test]
fn morgan_counts_deterministic() {
let m = aspirin();
let c1 = morgan_fp_counts(&m, 2);
let c2 = morgan_fp_counts(&m, 2);
assert_eq!(c1, c2, "morgan_fp_counts must be deterministic");
}
#[test]
fn morgan_counts_consistent_with_ecfp_bits() {
let m = toluene();
let fp = ecfp(
&m,
&EcfpConfig {
radius: 2,
nbits: 2048,
use_chirality: false,
use_double_fold: false,
},
);
let counts = morgan_fp_counts(&m, 2);
for &hash in counts.keys() {
let bit = (hash % 2048) as usize;
assert!(
fp.get(bit),
"bit {bit} from count map not set in ECFP bitvec"
);
}
}
#[test]
fn ecfp4_ignores_chirality_by_default() {
let l_ala = parse("N[C@@H](C)C(=O)O").unwrap();
let d_ala = parse("N[C@H](C)C(=O)O").unwrap();
let fp_l = ecfp4(&l_ala);
let fp_d = ecfp4(&d_ala);
assert_eq!(
fp_l, fp_d,
"L/D-alanine ECFP4 should be identical when use_chirality=false"
);
}
#[test]
fn ecfp4_distinguishes_enantiomers_with_chirality() {
let l_ala = parse("N[C@@H](C)C(=O)O").unwrap();
let d_ala = parse("N[C@H](C)C(=O)O").unwrap();
let config = EcfpConfig {
radius: 2,
nbits: 2048,
use_chirality: true,
use_double_fold: false,
};
let fp_l = ecfp(&l_ala, &config);
let fp_d = ecfp(&d_ala, &config);
assert_ne!(
fp_l, fp_d,
"L/D-alanine ECFP4 must differ when use_chirality=true"
);
assert!(
fp_l.tanimoto(&fp_d) < 1.0,
"Tanimoto of L/D-alanine must be < 1.0 with use_chirality"
);
}
#[test]
fn ecfp4_non_chiral_generates_with_chirality_flag() {
let mol = parse("c1ccccc1").unwrap(); let config = EcfpConfig {
radius: 2,
nbits: 2048,
use_chirality: true,
use_double_fold: false,
};
let fp = ecfp(&mol, &config);
assert!(fp.popcount() > 0, "Benzene should generate non-empty ECFP4 with use_chirality=true");
}
}