1use chematic_core::{AtomIdx, BondIdx, Molecule};
8
9use crate::bitvec::BitVec2048;
10use crate::ecfp::{bond_type_int as bond_byte, fnv1a};
11
12#[derive(Debug, Clone)]
14pub struct TopoPathConfig {
15 pub max_len: usize,
17 pub nbits: usize,
19}
20
21impl Default for TopoPathConfig {
22 fn default() -> Self {
23 Self { max_len: 7, nbits: 2048 }
24 }
25}
26
27pub fn topo_path(mol: &Molecule, config: &TopoPathConfig) -> BitVec2048 {
34 let mut fp = BitVec2048::new();
35 if mol.atom_count() == 0 {
36 return fp;
37 }
38 let n = mol.atom_count();
39 let mut path_atoms: Vec<u8> = Vec::with_capacity(config.max_len);
40 let mut path_bonds: Vec<u8> = Vec::with_capacity(config.max_len.saturating_sub(1));
41 let mut visited: Vec<bool> = vec![false; n];
42
43 for start in 0..n {
44 let start_idx = AtomIdx(start as u32);
45 path_atoms.push(mol.atom(start_idx).element.atomic_number());
46 visited[start] = true;
47 dfs(mol, start_idx, None, &mut path_atoms, &mut path_bonds, &mut visited, &mut fp, config);
48 path_atoms.pop();
49 visited[start] = false;
50 }
51 fp
52}
53
54fn dfs(
55 mol: &Molecule,
56 atom: AtomIdx,
57 from_bond: Option<BondIdx>,
58 path_atoms: &mut Vec<u8>,
59 path_bonds: &mut Vec<u8>,
60 visited: &mut Vec<bool>,
61 fp: &mut BitVec2048,
62 config: &TopoPathConfig,
63) {
64 if path_atoms.len() >= 2 {
66 hash_path(path_atoms, path_bonds, fp, config.nbits);
67 }
68 if path_atoms.len() >= config.max_len {
69 return;
70 }
71 for (nbr, bid) in mol.neighbors(atom) {
72 if Some(bid) == from_bond { continue; }
73 if visited[nbr.0 as usize] { continue; }
74 let order = mol.bond(bid).order;
75 path_bonds.push(bond_byte(order));
76 path_atoms.push(mol.atom(nbr).element.atomic_number());
77 visited[nbr.0 as usize] = true;
78 dfs(mol, nbr, Some(bid), path_atoms, path_bonds, visited, fp, config);
79 visited[nbr.0 as usize] = false;
80 path_atoms.pop();
81 path_bonds.pop();
82 }
83}
84
85fn hash_path(atoms: &[u8], bonds: &[u8], fp: &mut BitVec2048, nbits: usize) {
86 let len = atoms.len() * 2 - 1;
88 let mut fwd = Vec::with_capacity(len);
89 let mut rev = Vec::with_capacity(len);
90 for i in 0..atoms.len() {
91 fwd.push(atoms[i]);
92 if i + 1 < atoms.len() { fwd.push(bonds[i]); }
93 }
94 for i in (0..atoms.len()).rev() {
95 rev.push(atoms[i]);
96 if i > 0 { rev.push(bonds[i - 1]); }
97 }
98 let canonical = if fwd <= rev { fwd } else { rev };
99 let hash = fnv1a(&canonical);
100 fp.set((hash % nbits as u64) as usize);
101}
102
103pub fn tanimoto_topo_path(a: &Molecule, b: &Molecule) -> f64 {
105 let cfg = TopoPathConfig::default();
106 topo_path(a, &cfg).tanimoto(&topo_path(b, &cfg))
107}
108
109#[cfg(test)]
110mod tests {
111 use super::*;
112 use chematic_smiles::parse;
113
114 fn cfg() -> TopoPathConfig { TopoPathConfig::default() }
115
116 #[test]
117 fn ethane_nonzero() {
118 let mol = parse("CC").unwrap();
119 assert!(topo_path(&mol, &cfg()).popcount() > 0);
120 }
121
122 #[test]
123 fn benzene_nonzero() {
124 let mol = parse("c1ccccc1").unwrap();
125 assert!(topo_path(&mol, &cfg()).popcount() > 0);
126 }
127
128 #[test]
129 fn aspirin_many_bits() {
130 let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
131 let fp = topo_path(&mol, &cfg());
132 assert!(fp.popcount() > 10, "aspirin topo_path got {}", fp.popcount());
133 }
134
135 #[test]
136 fn deterministic() {
137 let mol = parse("c1ccccc1").unwrap();
138 assert_eq!(topo_path(&mol, &cfg()), topo_path(&mol, &cfg()));
139 }
140
141 #[test]
142 fn longer_max_len_more_bits() {
143 let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
144 let fp2 = topo_path(&mol, &TopoPathConfig { max_len: 2, nbits: 2048 });
145 let fp7 = topo_path(&mol, &cfg());
146 assert!(
147 fp2.popcount() <= fp7.popcount(),
148 "max_len=2 ({}) should have <= bits than max_len=7 ({})",
149 fp2.popcount(), fp7.popcount()
150 );
151 }
152
153 #[test]
154 fn ethane_vs_benzene_differ() {
155 let e = parse("CC").unwrap();
156 let b = parse("c1ccccc1").unwrap();
157 let t = topo_path(&e, &cfg()).tanimoto(&topo_path(&b, &cfg()));
158 assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
159 }
160}