use chematic_core::{AtomIdx, BondIdx, Molecule};
use crate::bitvec::BitVec2048;
use crate::ecfp::{bond_type_int as bond_byte, fnv1a};
#[derive(Debug, Clone)]
pub struct TopoPathConfig {
pub max_len: usize,
pub nbits: usize,
}
impl Default for TopoPathConfig {
fn default() -> Self {
Self { max_len: 7, nbits: 2048 }
}
}
pub fn topo_path(mol: &Molecule, config: &TopoPathConfig) -> BitVec2048 {
let mut fp = BitVec2048::new();
if mol.atom_count() == 0 {
return fp;
}
let n = mol.atom_count();
let mut path_atoms: Vec<u8> = Vec::with_capacity(config.max_len);
let mut path_bonds: Vec<u8> = Vec::with_capacity(config.max_len.saturating_sub(1));
let mut visited: Vec<bool> = vec![false; n];
for start in 0..n {
let start_idx = AtomIdx(start as u32);
path_atoms.push(mol.atom(start_idx).element.atomic_number());
visited[start] = true;
dfs(mol, start_idx, None, &mut path_atoms, &mut path_bonds, &mut visited, &mut fp, config);
path_atoms.pop();
visited[start] = false;
}
fp
}
fn dfs(
mol: &Molecule,
atom: AtomIdx,
from_bond: Option<BondIdx>,
path_atoms: &mut Vec<u8>,
path_bonds: &mut Vec<u8>,
visited: &mut Vec<bool>,
fp: &mut BitVec2048,
config: &TopoPathConfig,
) {
if path_atoms.len() >= 2 {
hash_path(path_atoms, path_bonds, fp, config.nbits);
}
if path_atoms.len() >= config.max_len {
return;
}
for (nbr, bid) in mol.neighbors(atom) {
if Some(bid) == from_bond { continue; }
if visited[nbr.0 as usize] { continue; }
let order = mol.bond(bid).order;
path_bonds.push(bond_byte(order));
path_atoms.push(mol.atom(nbr).element.atomic_number());
visited[nbr.0 as usize] = true;
dfs(mol, nbr, Some(bid), path_atoms, path_bonds, visited, fp, config);
visited[nbr.0 as usize] = false;
path_atoms.pop();
path_bonds.pop();
}
}
fn hash_path(atoms: &[u8], bonds: &[u8], fp: &mut BitVec2048, nbits: usize) {
let len = atoms.len() * 2 - 1;
let mut fwd = Vec::with_capacity(len);
let mut rev = Vec::with_capacity(len);
for i in 0..atoms.len() {
fwd.push(atoms[i]);
if i + 1 < atoms.len() { fwd.push(bonds[i]); }
}
for i in (0..atoms.len()).rev() {
rev.push(atoms[i]);
if i > 0 { rev.push(bonds[i - 1]); }
}
let canonical = if fwd <= rev { fwd } else { rev };
let hash = fnv1a(&canonical);
fp.set((hash % nbits as u64) as usize);
}
pub fn tanimoto_topo_path(a: &Molecule, b: &Molecule) -> f64 {
let cfg = TopoPathConfig::default();
topo_path(a, &cfg).tanimoto(&topo_path(b, &cfg))
}
#[cfg(test)]
mod tests {
use super::*;
use chematic_smiles::parse;
fn cfg() -> TopoPathConfig { TopoPathConfig::default() }
#[test]
fn ethane_nonzero() {
let mol = parse("CC").unwrap();
assert!(topo_path(&mol, &cfg()).popcount() > 0);
}
#[test]
fn benzene_nonzero() {
let mol = parse("c1ccccc1").unwrap();
assert!(topo_path(&mol, &cfg()).popcount() > 0);
}
#[test]
fn aspirin_many_bits() {
let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
let fp = topo_path(&mol, &cfg());
assert!(fp.popcount() > 10, "aspirin topo_path got {}", fp.popcount());
}
#[test]
fn deterministic() {
let mol = parse("c1ccccc1").unwrap();
assert_eq!(topo_path(&mol, &cfg()), topo_path(&mol, &cfg()));
}
#[test]
fn longer_max_len_more_bits() {
let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
let fp2 = topo_path(&mol, &TopoPathConfig { max_len: 2, nbits: 2048 });
let fp7 = topo_path(&mol, &cfg());
assert!(
fp2.popcount() <= fp7.popcount(),
"max_len=2 ({}) should have <= bits than max_len=7 ({})",
fp2.popcount(), fp7.popcount()
);
}
#[test]
fn ethane_vs_benzene_differ() {
let e = parse("CC").unwrap();
let b = parse("c1ccccc1").unwrap();
let t = topo_path(&e, &cfg()).tanimoto(&topo_path(&b, &cfg()));
assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
}
}