Skip to main content

chematic_fp/
topo_path.rs

1//! Topological path fingerprints.
2//!
3//! Enumerates all simple paths of length 1..=`max_len` bonds and hashes
4//! them with FNV-1a. Paths are canonicalised (forward vs. reverse) so that
5//! each unique path is hashed only once.
6
7use chematic_core::{AtomIdx, BondIdx, Molecule};
8
9use crate::bitvec::BitVec2048;
10use crate::ecfp::{bond_type_int as bond_byte, fnv1a};
11
12/// Configuration for topological path fingerprints.
13#[derive(Debug, Clone)]
14pub struct TopoPathConfig {
15    /// Maximum number of atoms in a path (default: 7).
16    pub max_len: usize,
17    /// Bitvector size in bits (default: 2048).
18    pub nbits: usize,
19}
20
21impl Default for TopoPathConfig {
22    fn default() -> Self {
23        Self { max_len: 7, nbits: 2048 }
24    }
25}
26
27/// Compute a topological path fingerprint for `mol`.
28///
29/// All simple paths of 2..=`config.max_len` atoms are enumerated,
30/// encoded as interleaved `[atomic_num, bond_order, atomic_num, ...]` bytes,
31/// canonicalised (min of forward/reverse encoding), hashed with FNV-1a, and
32/// folded into a `BitVec2048` via `hash % nbits`.
33pub fn topo_path(mol: &Molecule, config: &TopoPathConfig) -> BitVec2048 {
34    let mut fp = BitVec2048::new();
35    if mol.atom_count() == 0 {
36        return fp;
37    }
38    let n = mol.atom_count();
39    let mut path_atoms: Vec<u8> = Vec::with_capacity(config.max_len);
40    let mut path_bonds: Vec<u8> = Vec::with_capacity(config.max_len.saturating_sub(1));
41    let mut visited: Vec<bool> = vec![false; n];
42
43    for start in 0..n {
44        let start_idx = AtomIdx(start as u32);
45        path_atoms.push(mol.atom(start_idx).element.atomic_number());
46        visited[start] = true;
47        dfs(mol, start_idx, None, &mut path_atoms, &mut path_bonds, &mut visited, &mut fp, config);
48        path_atoms.pop();
49        visited[start] = false;
50    }
51    fp
52}
53
54fn dfs(
55    mol: &Molecule,
56    atom: AtomIdx,
57    from_bond: Option<BondIdx>,
58    path_atoms: &mut Vec<u8>,
59    path_bonds: &mut Vec<u8>,
60    visited: &mut Vec<bool>,
61    fp: &mut BitVec2048,
62    config: &TopoPathConfig,
63) {
64    // Record path when it has at least 2 atoms (1 bond)
65    if path_atoms.len() >= 2 {
66        hash_path(path_atoms, path_bonds, fp, config.nbits);
67    }
68    if path_atoms.len() >= config.max_len {
69        return;
70    }
71    for (nbr, bid) in mol.neighbors(atom) {
72        if Some(bid) == from_bond { continue; }
73        if visited[nbr.0 as usize] { continue; }
74        let order = mol.bond(bid).order;
75        path_bonds.push(bond_byte(order));
76        path_atoms.push(mol.atom(nbr).element.atomic_number());
77        visited[nbr.0 as usize] = true;
78        dfs(mol, nbr, Some(bid), path_atoms, path_bonds, visited, fp, config);
79        visited[nbr.0 as usize] = false;
80        path_atoms.pop();
81        path_bonds.pop();
82    }
83}
84
85fn hash_path(atoms: &[u8], bonds: &[u8], fp: &mut BitVec2048, nbits: usize) {
86    // Interleaved encoding: [a0, b0, a1, b1, ..., a_n-1]
87    let len = atoms.len() * 2 - 1;
88    let mut fwd = Vec::with_capacity(len);
89    let mut rev = Vec::with_capacity(len);
90    for i in 0..atoms.len() {
91        fwd.push(atoms[i]);
92        if i + 1 < atoms.len() { fwd.push(bonds[i]); }
93    }
94    for i in (0..atoms.len()).rev() {
95        rev.push(atoms[i]);
96        if i > 0 { rev.push(bonds[i - 1]); }
97    }
98    let canonical = if fwd <= rev { fwd } else { rev };
99    let hash = fnv1a(&canonical);
100    fp.set((hash % nbits as u64) as usize);
101}
102
103/// Tanimoto similarity between two molecules using topological path fingerprints.
104pub fn tanimoto_topo_path(a: &Molecule, b: &Molecule) -> f64 {
105    let cfg = TopoPathConfig::default();
106    topo_path(a, &cfg).tanimoto(&topo_path(b, &cfg))
107}
108
109#[cfg(test)]
110mod tests {
111    use super::*;
112    use chematic_smiles::parse;
113
114    fn cfg() -> TopoPathConfig { TopoPathConfig::default() }
115
116    #[test]
117    fn ethane_nonzero() {
118        let mol = parse("CC").unwrap();
119        assert!(topo_path(&mol, &cfg()).popcount() > 0);
120    }
121
122    #[test]
123    fn benzene_nonzero() {
124        let mol = parse("c1ccccc1").unwrap();
125        assert!(topo_path(&mol, &cfg()).popcount() > 0);
126    }
127
128    #[test]
129    fn aspirin_many_bits() {
130        let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
131        let fp = topo_path(&mol, &cfg());
132        assert!(fp.popcount() > 10, "aspirin topo_path got {}", fp.popcount());
133    }
134
135    #[test]
136    fn deterministic() {
137        let mol = parse("c1ccccc1").unwrap();
138        assert_eq!(topo_path(&mol, &cfg()), topo_path(&mol, &cfg()));
139    }
140
141    #[test]
142    fn longer_max_len_more_bits() {
143        let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
144        let fp2 = topo_path(&mol, &TopoPathConfig { max_len: 2, nbits: 2048 });
145        let fp7 = topo_path(&mol, &cfg());
146        assert!(
147            fp2.popcount() <= fp7.popcount(),
148            "max_len=2 ({}) should have <= bits than max_len=7 ({})",
149            fp2.popcount(), fp7.popcount()
150        );
151    }
152
153    #[test]
154    fn ethane_vs_benzene_differ() {
155        let e = parse("CC").unwrap();
156        let b = parse("c1ccccc1").unwrap();
157        let t = topo_path(&e, &cfg()).tanimoto(&topo_path(&b, &cfg()));
158        assert!(t < 1.0, "ethane and benzene must differ (tanimoto={t})");
159    }
160}