use chematic_core::Molecule;
use crate::bitvec::BitVec2048;
use crate::ecfp::{EcfpConfig, ecfp};
/// Selects which fingerprint algorithm is used when fingerprinting molecules
/// for similarity search.
///
/// The default is [`FpType::Ecfp4`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum FpType {
    /// `ecfp` run with `EcfpConfig::default()`.
    #[default]
    Ecfp4,
    /// `ecfp` run with radius 3, 2048 bits, chirality disabled.
    Ecfp6,
    /// `ecfp` run with radius 2, 2048 bits, chirality enabled.
    Ecfp4Chiral,
    /// Fingerprint produced by `crate::fcfp::fcfp4`.
    Fcfp4,
    /// Fingerprint produced by `crate::maccs::maccs`.
    Maccs,
    /// Fingerprint produced by `crate::topo_path::topo_path` with its default
    /// configuration.
    TopoPath,
}
/// Computes the [`BitVec2048`] fingerprint of `mol` for the requested
/// [`FpType`].
///
/// The three ECFP variants all go through [`ecfp`] and differ only in the
/// [`EcfpConfig`] they pass; every other variant dispatches to its own module.
fn compute_fp(mol: &Molecule, fp_type: FpType) -> BitVec2048 {
    // Resolve the ECFP configuration up front; non-ECFP types return early.
    let ecfp_config = match fp_type {
        FpType::Ecfp4 => EcfpConfig::default(),
        FpType::Ecfp6 => EcfpConfig {
            radius: 3,
            nbits: 2048,
            use_chirality: false,
            use_double_fold: false,
        },
        FpType::Ecfp4Chiral => EcfpConfig {
            radius: 2,
            nbits: 2048,
            use_chirality: true,
            use_double_fold: false,
        },
        FpType::Fcfp4 => return crate::fcfp::fcfp4(mol),
        FpType::Maccs => return crate::maccs::maccs(mol),
        FpType::TopoPath => {
            let path_config = crate::topo_path::TopoPathConfig::default();
            return crate::topo_path::topo_path(mol, &path_config);
        }
    };
    ecfp(mol, &ecfp_config)
}
/// Returns up to `k` molecules from `db` that are most similar to `query`,
/// best match first.
///
/// `query` and every molecule in `db` are fingerprinted with `fp_type`, and
/// each database fingerprint is scored against the query with
/// `BitVec2048::tanimoto`. Each hit is `(index into db, similarity)`.
///
/// # Ordering
///
/// Hits are sorted by descending similarity. Hits with equal similarity are
/// ordered by ascending database index, so the ranking (including which hits
/// survive the cut at `k`) is fully reproducible.
///
/// # Returns
///
/// At most `k` hits. Entries whose similarity is not strictly greater than
/// `0.0` are dropped, so fewer than `k` hits may come back even when `db` has
/// at least `k` molecules. An empty `db` or `k == 0` yields an empty vector.
pub fn nearest_neighbors(
    query: &Molecule,
    db: &[Molecule],
    k: usize,
    fp_type: FpType,
) -> Vec<(usize, f64)> {
    if k == 0 || db.is_empty() {
        return Vec::new();
    }

    let query_fp = compute_fp(query, fp_type);
    let mut scores: Vec<(usize, f64)> = db
        .iter()
        .enumerate()
        .map(|(i, mol)| {
            let fp = compute_fp(mol, fp_type);
            (i, query_fp.tanimoto(&fp))
        })
        // `> 0.0` discards zero-similarity entries and is also false for NaN.
        .filter(|&(_, similarity)| similarity > 0.0)
        .collect();

    // `total_cmp` is a total order on f64 (no panic path); the index
    // tie-break makes every element distinct under the comparator, so the
    // unstable sort still produces a deterministic ranking.
    scores.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    scores.truncate(k);
    scores
}
/// Returns up to `k` entries of `db_fps` that are most similar to `query_fp`,
/// best match first, using precomputed fingerprints.
///
/// Each database fingerprint is scored against `query_fp` with
/// `BitVec2048::tanimoto`. Each hit is `(index into db_fps, similarity)`.
///
/// # Ordering
///
/// Hits are sorted by descending similarity. Hits with equal similarity are
/// ordered by ascending index, so the ranking (including which hits survive
/// the cut at `k`) is fully reproducible.
///
/// # Returns
///
/// At most `k` hits. Entries whose similarity is not strictly greater than
/// `0.0` are dropped, so fewer than `k` hits may come back even when `db_fps`
/// has at least `k` entries. An empty `db_fps` or `k == 0` yields an empty
/// vector.
pub fn nearest_neighbors_from_fp(
    query_fp: &BitVec2048,
    db_fps: &[BitVec2048],
    k: usize,
) -> Vec<(usize, f64)> {
    if k == 0 || db_fps.is_empty() {
        return Vec::new();
    }

    let mut scores: Vec<(usize, f64)> = db_fps
        .iter()
        .enumerate()
        .map(|(i, fp)| (i, query_fp.tanimoto(fp)))
        // `> 0.0` discards zero-similarity entries and is also false for NaN.
        .filter(|&(_, similarity)| similarity > 0.0)
        .collect();

    // `total_cmp` is a total order on f64 (no panic path); the index
    // tie-break makes every element distinct under the comparator, so the
    // unstable sort still produces a deterministic ranking.
    scores.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    scores.truncate(k);
    scores
}
#[cfg(test)]
mod tests {
    use super::*;
    use chematic_smiles::parse;

    // Molecule fixtures parsed from SMILES strings.

    fn benzene() -> Molecule {
        parse("c1ccccc1").unwrap()
    }

    fn toluene() -> Molecule {
        parse("Cc1ccccc1").unwrap()
    }

    fn naphthalene() -> Molecule {
        parse("c1ccc2ccccc2c1").unwrap()
    }

    fn ethane() -> Molecule {
        parse("CC").unwrap()
    }

    /// A database entry identical to the query ranks first with similarity 1.0.
    #[test]
    fn test_nn_self_is_first() {
        let target = benzene();
        let library = vec![ethane(), toluene(), benzene(), naphthalene()];

        let hits = nearest_neighbors(&target, &library, 3, FpType::Ecfp4);

        assert!(!hits.is_empty());
        let (top_index, top_score) = hits[0];
        assert_eq!(top_index, 2, "benzene should match itself first");
        assert!((top_score - 1.0).abs() < 1e-9, "self-similarity should be 1.0");
    }

    /// The result never contains more than `k` hits.
    #[test]
    fn test_nn_returns_k_results() {
        let target = benzene();
        let library = vec![ethane(), toluene(), benzene(), naphthalene()];

        let hits = nearest_neighbors(&target, &library, 2, FpType::Ecfp4);

        assert!(hits.len() <= 2, "should return at most k results");
    }

    /// Similarities are non-increasing along the result.
    #[test]
    fn test_nn_sorted_descending() {
        let target = benzene();
        let library = vec![ethane(), toluene(), benzene(), naphthalene()];

        let hits = nearest_neighbors(&target, &library, 4, FpType::Ecfp4);

        assert!(
            hits.windows(2).all(|pair| pair[0].1 >= pair[1].1),
            "results should be sorted by descending Tanimoto"
        );
    }

    /// An empty database produces no hits.
    #[test]
    fn test_nn_empty_db() {
        let target = benzene();

        let hits = nearest_neighbors(&target, &[], 5, FpType::Ecfp4);

        assert!(hits.is_empty());
    }

    /// Asking for zero neighbours produces no hits.
    #[test]
    fn test_nn_k_zero() {
        let target = benzene();
        let library = vec![benzene()];

        let hits = nearest_neighbors(&target, &library, 0, FpType::Ecfp4);

        assert!(hits.is_empty());
    }

    /// The precomputed-fingerprint entry point ranks the identical entry first.
    #[test]
    fn test_nn_from_fp() {
        let target = benzene();
        let library = vec![ethane(), toluene(), benzene()];
        let target_fp = crate::ecfp::ecfp4(&target);
        let library_fps: Vec<_> = library.iter().map(|m| crate::ecfp::ecfp4(m)).collect();

        let hits = nearest_neighbors_from_fp(&target_fp, &library_fps, 3);

        assert!(!hits.is_empty());
        assert_eq!(hits[0].0, 2, "benzene fp should match itself");
    }

    /// MACCS fingerprints can be selected and give a sensible top hit.
    #[test]
    fn test_nn_maccs_type() {
        let target = benzene();
        let library = vec![toluene(), benzene(), ethane()];

        let hits = nearest_neighbors(&target, &library, 3, FpType::Maccs);

        assert!(!hits.is_empty());
        assert_ne!(hits[0].0, 2, "ethane should not be the top MACCS hit for benzene");
        assert!(hits[0].1 > 0.5, "top MACCS hit should have Tanimoto > 0.5");
    }
}