1use chematic_core::{AtomIdx, BondIdx, BondOrder, Molecule};
8
9use crate::bitvec::BitVec2048;
10
11const FNV_OFFSET: u64 = 14695981039346656037;
12const FNV_PRIME: u64 = 1099511628211;
13
14fn fnv1a(bytes: &[u8]) -> u64 {
15 let mut h = FNV_OFFSET;
16 for &b in bytes {
17 h ^= b as u64;
18 h = h.wrapping_mul(FNV_PRIME);
19 }
20 h
21}
22
23fn bond_byte(order: BondOrder) -> u8 {
24 match order {
25 BondOrder::Single | BondOrder::Up | BondOrder::Down => 1,
26 BondOrder::Double => 2,
27 BondOrder::Triple => 3,
28 BondOrder::Aromatic => 4,
29 BondOrder::Quadruple => 5,
30 }
31}
32
33#[derive(Debug, Clone)]
35pub struct TopoPathConfig {
36 pub max_len: usize,
38 pub nbits: usize,
40}
41
42impl Default for TopoPathConfig {
43 fn default() -> Self {
44 Self { max_len: 7, nbits: 2048 }
45 }
46}
47
48pub fn topo_path(mol: &Molecule, config: &TopoPathConfig) -> BitVec2048 {
55 let mut fp = BitVec2048::new();
56 if mol.atom_count() == 0 {
57 return fp;
58 }
59 let n = mol.atom_count();
60 let mut path_atoms: Vec<u8> = Vec::with_capacity(config.max_len);
61 let mut path_bonds: Vec<u8> = Vec::with_capacity(config.max_len.saturating_sub(1));
62 let mut visited: Vec<bool> = vec![false; n];
63
64 for start in 0..n {
65 let start_idx = AtomIdx(start as u32);
66 path_atoms.push(mol.atom(start_idx).element.atomic_number());
67 visited[start] = true;
68 dfs(mol, start_idx, None, &mut path_atoms, &mut path_bonds, &mut visited, &mut fp, config);
69 path_atoms.pop();
70 visited[start] = false;
71 }
72 fp
73}
74
75fn dfs(
76 mol: &Molecule,
77 atom: AtomIdx,
78 from_bond: Option<BondIdx>,
79 path_atoms: &mut Vec<u8>,
80 path_bonds: &mut Vec<u8>,
81 visited: &mut Vec<bool>,
82 fp: &mut BitVec2048,
83 config: &TopoPathConfig,
84) {
85 if path_atoms.len() >= 2 {
87 hash_path(path_atoms, path_bonds, fp, config.nbits);
88 }
89 if path_atoms.len() >= config.max_len {
90 return;
91 }
92 for (nbr, bid) in mol.neighbors(atom) {
93 if Some(bid) == from_bond { continue; }
94 if visited[nbr.0 as usize] { continue; }
95 let order = mol.bond(bid).order;
96 path_bonds.push(bond_byte(order));
97 path_atoms.push(mol.atom(nbr).element.atomic_number());
98 visited[nbr.0 as usize] = true;
99 dfs(mol, nbr, Some(bid), path_atoms, path_bonds, visited, fp, config);
100 visited[nbr.0 as usize] = false;
101 path_atoms.pop();
102 path_bonds.pop();
103 }
104}
105
106fn hash_path(atoms: &[u8], bonds: &[u8], fp: &mut BitVec2048, nbits: usize) {
107 let len = atoms.len() * 2 - 1;
109 let mut fwd = Vec::with_capacity(len);
110 let mut rev = Vec::with_capacity(len);
111 for i in 0..atoms.len() {
112 fwd.push(atoms[i]);
113 if i + 1 < atoms.len() { fwd.push(bonds[i]); }
114 }
115 for i in (0..atoms.len()).rev() {
116 rev.push(atoms[i]);
117 if i > 0 { rev.push(bonds[i - 1]); }
118 }
119 let canonical = if fwd <= rev { fwd } else { rev };
120 let hash = fnv1a(&canonical);
121 fp.set((hash % nbits as u64) as usize);
122}
123
124#[cfg(test)]
125mod tests {
126 use super::*;
127 use chematic_smiles::parse;
128
129 fn cfg() -> TopoPathConfig { TopoPathConfig::default() }
130
131 #[test]
132 fn ethane_nonzero() {
133 let mol = parse("CC").unwrap();
134 assert!(topo_path(&mol, &cfg()).popcount() > 0);
135 }
136
137 #[test]
138 fn benzene_nonzero() {
139 let mol = parse("c1ccccc1").unwrap();
140 assert!(topo_path(&mol, &cfg()).popcount() > 0);
141 }
142
143 #[test]
144 fn aspirin_many_bits() {
145 let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
146 let fp = topo_path(&mol, &cfg());
147 assert!(fp.popcount() > 10, "aspirin topo_path got {}", fp.popcount());
148 }
149
150 #[test]
151 fn deterministic() {
152 let mol = parse("c1ccccc1").unwrap();
153 assert_eq!(topo_path(&mol, &cfg()), topo_path(&mol, &cfg()));
154 }
155
156 #[test]
157 fn longer_max_len_more_bits() {
158 let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
159 let fp2 = topo_path(&mol, &TopoPathConfig { max_len: 2, nbits: 2048 });
160 let fp7 = topo_path(&mol, &cfg());
161 assert!(
162 fp2.popcount() <= fp7.popcount(),
163 "max_len=2 ({}) should have <= bits than max_len=7 ({})",
164 fp2.popcount(), fp7.popcount()
165 );
166 }
167
168 #[test]
169 fn ethane_vs_benzene_differ() {
170 let e = parse("CC").unwrap();
171 let b = parse("c1ccccc1").unwrap();
172 let t = topo_path(&e, &cfg()).tanimoto(&topo_path(&b, &cfg()));
173 assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
174 }
175}