Skip to main content

chematic_fp/
topo_path.rs

1//! Topological path fingerprints.
2//!
3//! Enumerates all simple paths of length 1..=`max_len` bonds and hashes
4//! them with FNV-1a. Paths are canonicalised (forward vs. reverse) so that
5//! each unique path is hashed only once.
6
7use chematic_core::{AtomIdx, BondIdx, BondOrder, Molecule};
8
9use crate::bitvec::BitVec2048;
10
11const FNV_OFFSET: u64 = 14695981039346656037;
12const FNV_PRIME: u64 = 1099511628211;
13
14fn fnv1a(bytes: &[u8]) -> u64 {
15    let mut h = FNV_OFFSET;
16    for &b in bytes {
17        h ^= b as u64;
18        h = h.wrapping_mul(FNV_PRIME);
19    }
20    h
21}
22
23fn bond_byte(order: BondOrder) -> u8 {
24    match order {
25        BondOrder::Single | BondOrder::Up | BondOrder::Down => 1,
26        BondOrder::Double => 2,
27        BondOrder::Triple => 3,
28        BondOrder::Aromatic => 4,
29        BondOrder::Quadruple => 5,
30    }
31}
32
33/// Configuration for topological path fingerprints.
34#[derive(Debug, Clone)]
35pub struct TopoPathConfig {
36    /// Maximum number of atoms in a path (default: 7).
37    pub max_len: usize,
38    /// Bitvector size in bits (default: 2048).
39    pub nbits: usize,
40}
41
42impl Default for TopoPathConfig {
43    fn default() -> Self {
44        Self { max_len: 7, nbits: 2048 }
45    }
46}
47
48/// Compute a topological path fingerprint for `mol`.
49///
50/// All simple paths of 2..=`config.max_len` atoms are enumerated,
51/// encoded as interleaved `[atomic_num, bond_order, atomic_num, ...]` bytes,
52/// canonicalised (min of forward/reverse encoding), hashed with FNV-1a, and
53/// folded into a `BitVec2048` via `hash % nbits`.
54pub fn topo_path(mol: &Molecule, config: &TopoPathConfig) -> BitVec2048 {
55    let mut fp = BitVec2048::new();
56    if mol.atom_count() == 0 {
57        return fp;
58    }
59    let n = mol.atom_count();
60    let mut path_atoms: Vec<u8> = Vec::with_capacity(config.max_len);
61    let mut path_bonds: Vec<u8> = Vec::with_capacity(config.max_len.saturating_sub(1));
62    let mut visited: Vec<bool> = vec![false; n];
63
64    for start in 0..n {
65        let start_idx = AtomIdx(start as u32);
66        path_atoms.push(mol.atom(start_idx).element.atomic_number());
67        visited[start] = true;
68        dfs(mol, start_idx, None, &mut path_atoms, &mut path_bonds, &mut visited, &mut fp, config);
69        path_atoms.pop();
70        visited[start] = false;
71    }
72    fp
73}
74
75fn dfs(
76    mol: &Molecule,
77    atom: AtomIdx,
78    from_bond: Option<BondIdx>,
79    path_atoms: &mut Vec<u8>,
80    path_bonds: &mut Vec<u8>,
81    visited: &mut Vec<bool>,
82    fp: &mut BitVec2048,
83    config: &TopoPathConfig,
84) {
85    // Record path when it has at least 2 atoms (1 bond)
86    if path_atoms.len() >= 2 {
87        hash_path(path_atoms, path_bonds, fp, config.nbits);
88    }
89    if path_atoms.len() >= config.max_len {
90        return;
91    }
92    for (nbr, bid) in mol.neighbors(atom) {
93        if Some(bid) == from_bond { continue; }
94        if visited[nbr.0 as usize] { continue; }
95        let order = mol.bond(bid).order;
96        path_bonds.push(bond_byte(order));
97        path_atoms.push(mol.atom(nbr).element.atomic_number());
98        visited[nbr.0 as usize] = true;
99        dfs(mol, nbr, Some(bid), path_atoms, path_bonds, visited, fp, config);
100        visited[nbr.0 as usize] = false;
101        path_atoms.pop();
102        path_bonds.pop();
103    }
104}
105
106fn hash_path(atoms: &[u8], bonds: &[u8], fp: &mut BitVec2048, nbits: usize) {
107    // Interleaved encoding: [a0, b0, a1, b1, ..., a_n-1]
108    let len = atoms.len() * 2 - 1;
109    let mut fwd = Vec::with_capacity(len);
110    let mut rev = Vec::with_capacity(len);
111    for i in 0..atoms.len() {
112        fwd.push(atoms[i]);
113        if i + 1 < atoms.len() { fwd.push(bonds[i]); }
114    }
115    for i in (0..atoms.len()).rev() {
116        rev.push(atoms[i]);
117        if i > 0 { rev.push(bonds[i - 1]); }
118    }
119    let canonical = if fwd <= rev { fwd } else { rev };
120    let hash = fnv1a(&canonical);
121    fp.set((hash % nbits as u64) as usize);
122}
123
124#[cfg(test)]
125mod tests {
126    use super::*;
127    use chematic_smiles::parse;
128
129    fn cfg() -> TopoPathConfig { TopoPathConfig::default() }
130
131    #[test]
132    fn ethane_nonzero() {
133        let mol = parse("CC").unwrap();
134        assert!(topo_path(&mol, &cfg()).popcount() > 0);
135    }
136
137    #[test]
138    fn benzene_nonzero() {
139        let mol = parse("c1ccccc1").unwrap();
140        assert!(topo_path(&mol, &cfg()).popcount() > 0);
141    }
142
143    #[test]
144    fn aspirin_many_bits() {
145        let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
146        let fp = topo_path(&mol, &cfg());
147        assert!(fp.popcount() > 10, "aspirin topo_path got {}", fp.popcount());
148    }
149
150    #[test]
151    fn deterministic() {
152        let mol = parse("c1ccccc1").unwrap();
153        assert_eq!(topo_path(&mol, &cfg()), topo_path(&mol, &cfg()));
154    }
155
156    #[test]
157    fn longer_max_len_more_bits() {
158        let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
159        let fp2 = topo_path(&mol, &TopoPathConfig { max_len: 2, nbits: 2048 });
160        let fp7 = topo_path(&mol, &cfg());
161        assert!(
162            fp2.popcount() <= fp7.popcount(),
163            "max_len=2 ({}) should have <= bits than max_len=7 ({})",
164            fp2.popcount(), fp7.popcount()
165        );
166    }
167
168    #[test]
169    fn ethane_vs_benzene_differ() {
170        let e = parse("CC").unwrap();
171        let b = parse("c1ccccc1").unwrap();
172        let t = topo_path(&e, &cfg()).tanimoto(&topo_path(&b, &cfg()));
173        assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
174    }
175}