use crate::ecfp::fnv1a;
use chematic_core::{AtomIdx, Molecule};
#[derive(Debug, Clone)]
pub struct Map4Config {
pub max_radius: usize,
pub n_permutations: usize,
}
impl Default for Map4Config {
fn default() -> Self {
Self { max_radius: 2, n_permutations: 1024 }
}
}
pub fn map4(mol: &Molecule, config: &Map4Config) -> Vec<u32> {
let shingles = collect_shingles(mol, config.max_radius);
minhash(&shingles, config.n_permutations)
}
pub fn map4_default(mol: &Molecule) -> Vec<u32> {
map4(mol, &Map4Config::default())
}
pub fn tanimoto_map4(a: &[u32], b: &[u32]) -> f64 {
if a.len() != b.len() || a.is_empty() {
return 0.0;
}
let matches = a.iter().zip(b.iter()).filter(|(x, y)| x == y).count();
matches as f64 / a.len() as f64
}
fn collect_shingles(mol: &Molecule, max_radius: usize) -> Vec<u64> {
let n = mol.atom_count();
let dist = bfs_all_pairs(mol);
let mut hashes = Vec::new();
for a in 0..n {
for b in a..n {
let d = dist[a][b];
if d == usize::MAX {
continue; }
for r_a in 0..=max_radius {
for r_b in 0..=max_radius {
let env_a = circular_env_hash(mol, a, r_a, &dist);
let env_b = circular_env_hash(mol, b, r_b, &dist);
let (ea, eb) = if env_a <= env_b { (env_a, env_b) } else { (env_b, env_a) };
let mut h = fnv1a(ea.to_le_bytes().as_ref());
h = fnv1a_extend(h, eb.to_le_bytes().as_ref());
h = fnv1a_extend(h, &(d as u64).to_le_bytes());
hashes.push(h);
}
}
}
}
hashes.sort_unstable();
hashes.dedup();
hashes
}
fn circular_env_hash(mol: &Molecule, center: usize, radius: usize, dist: &[Vec<usize>]) -> u64 {
let mut contributions: Vec<(usize, u64)> = mol
.atoms()
.filter_map(|(idx, atom)| {
let d = dist[center][idx.0 as usize];
if d <= radius {
let atom_hash = fnv1a(&[
atom.element.atomic_number(),
atom.charge.unsigned_abs() as u8,
atom.element.atomic_number().wrapping_add(d as u8),
]);
Some((d, atom_hash))
} else {
None
}
})
.collect();
contributions.sort_unstable();
let mut h: u64 = 0xcbf29ce484222325; for (d, ah) in contributions {
h = fnv1a_extend(h, &(d as u64).to_le_bytes());
h = fnv1a_extend(h, &ah.to_le_bytes());
}
h
}
fn minhash(shingles: &[u64], n_permutations: usize) -> Vec<u32> {
let mut sig = vec![u32::MAX; n_permutations];
for &h in shingles {
for i in 0..n_permutations {
let mixed = fnv1a_mix(h, i as u64);
let v = (mixed >> 32) as u32;
if v < sig[i] {
sig[i] = v;
}
}
}
sig
}
fn bfs_all_pairs(mol: &Molecule) -> Vec<Vec<usize>> {
let n = mol.atom_count();
(0..n).map(|src| bfs_from(mol, src, n)).collect()
}
fn bfs_from(mol: &Molecule, src: usize, n: usize) -> Vec<usize> {
let mut dist = vec![usize::MAX; n];
dist[src] = 0;
let mut queue = std::collections::VecDeque::new();
queue.push_back(src);
while let Some(cur) = queue.pop_front() {
let d = dist[cur];
for (nb, _) in mol.neighbors(AtomIdx(cur as u32)) {
let nbi = nb.0 as usize;
if dist[nbi] == usize::MAX {
dist[nbi] = d + 1;
queue.push_back(nbi);
}
}
}
dist
}
#[inline]
fn fnv1a_extend(mut h: u64, bytes: &[u8]) -> u64 {
for &b in bytes {
h ^= b as u64;
h = h.wrapping_mul(0x00000100000001b3);
}
h
}
#[inline]
fn fnv1a_mix(h: u64, seed: u64) -> u64 {
let mut v = h.wrapping_add(seed.wrapping_mul(0x9e3779b97f4a7c15));
v ^= v >> 30;
v = v.wrapping_mul(0xbf58476d1ce4e5b9);
v ^= v >> 27;
v = v.wrapping_mul(0x94d049bb133111eb);
v ^= v >> 31;
v
}
#[cfg(test)]
mod tests {
use super::*;
use chematic_smiles::parse;
#[test]
fn map4_same_molecule_identical() {
let mol = parse("CC").expect("parse");
let a = map4_default(&mol);
let b = map4_default(&mol);
assert_eq!(a, b, "MAP4 must be deterministic");
}
#[test]
fn map4_self_similarity_is_one() {
let mol = parse("c1ccccc1").expect("parse benzene");
let fp = map4_default(&mol);
assert!((tanimoto_map4(&fp, &fp) - 1.0).abs() < 1e-9);
}
#[test]
fn map4_similar_molecules_higher_than_dissimilar() {
let benzene = parse("c1ccccc1").expect("benzene");
let toluene = parse("Cc1ccccc1").expect("toluene");
let methane = parse("C").expect("methane");
let fp_benz = map4_default(&benzene);
let fp_tol = map4_default(&toluene);
let fp_meth = map4_default(&methane);
let sim_close = tanimoto_map4(&fp_benz, &fp_tol);
let sim_far = tanimoto_map4(&fp_benz, &fp_meth);
assert!(sim_close > sim_far,
"benzene-toluene ({:.3}) should be > benzene-methane ({:.3})",
sim_close, sim_far);
}
#[test]
fn map4_length_equals_n_permutations() {
let mol = parse("CCO").expect("parse");
let cfg = Map4Config { max_radius: 1, n_permutations: 512 };
let fp = map4(&mol, &cfg);
assert_eq!(fp.len(), 512);
}
#[test]
fn tanimoto_bounds() {
let mol1 = parse("CC").expect("parse");
let mol2 = parse("CCC").expect("parse");
let fp1 = map4_default(&mol1);
let fp2 = map4_default(&mol2);
let t = tanimoto_map4(&fp1, &fp2);
assert!((0.0..=1.0).contains(&t), "Tanimoto out of range: {}", t);
}
}