use crate::chemistry::forcefield::crystalff::build_crystalff_query_molecule;
use crate::search::query::{
QueryMatchContext, atom_predicate_matches_with_context, bond_predicate_matches_with_context,
build_query_match_context,
};
use crate::{Atom, AtomQueryPredicate, Bond, BondOrder, BondQueryPredicate, Molecule};
use std::collections::BTreeMap;
use std::env;
use std::time::Instant;
fn row61_torsion349_substruct_trace_enabled(
mol: &Molecule,
query: &Molecule,
params: &SubstructMatchParams,
) -> bool {
env::var("RDKIT_ROW61_TRACE").ok().as_deref() == Some("1")
&& mol.num_atoms() == 26
&& query.num_atoms() == 5
&& query.num_bonds() == 4
&& !params.uniquify
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SubstructMatchResult {
pub atom_mapping: Vec<usize>,
pub bond_mapping: Vec<usize>,
}
#[derive(Debug, Clone)]
pub struct SubstructMatchParams {
pub max_matches: usize,
pub uniquify: bool,
}
type RecursiveQueryMatchCache = BTreeMap<String, Vec<bool>>;
impl Default for SubstructMatchParams {
fn default() -> Self {
Self {
max_matches: 1000,
uniquify: true,
}
}
}
#[derive(Debug, Clone)]
struct Vf2Graph {
n_atoms: usize,
n_bonds: usize,
adjacency: Vec<Vec<(usize, usize)>>, }
fn build_vf2_graph(mol: &Molecule) -> Vf2Graph {
let n_atoms = mol.num_atoms();
let mut adjacency: Vec<Vec<(usize, usize)>> = vec![Vec::new(); n_atoms];
for (bond_idx, bond) in mol.bonds().iter().enumerate() {
let b = bond.begin().index();
let e = bond.end().index();
adjacency[b].push((e, bond_idx));
adjacency[e].push((b, bond_idx));
}
Vf2Graph {
n_atoms,
n_bonds: mol.num_bonds(),
adjacency,
}
}
impl Vf2Graph {
fn out_degree(&self, node: usize) -> usize {
self.adjacency[node].len()
}
fn out_edges(&self, node: usize) -> &[(usize, usize)] {
&self.adjacency[node]
}
}
fn atom_matches(query_atom: &Atom, query_mol: &Molecule, mol_atom: &Atom, mol: &Molecule) -> bool {
if let Some(query_node) = query_atom.query() {
let query_ctx = build_query_match_context(mol);
return match query_node {
crate::QueryNode::Predicate(pred) => {
atom_query_predicate_matches_for_substruct(mol_atom, pred, mol, None, &query_ctx)
}
crate::QueryNode::And(children) => children.iter().all(|child| match child {
crate::QueryNode::Predicate(pred) => atom_query_predicate_matches_for_substruct(
mol_atom, pred, mol, None, &query_ctx,
),
_ => evaluate_atom_query(child, mol_atom, mol, None, &query_ctx),
}),
crate::QueryNode::Or(children) => children.iter().any(|child| match child {
crate::QueryNode::Predicate(pred) => atom_query_predicate_matches_for_substruct(
mol_atom, pred, mol, None, &query_ctx,
),
_ => evaluate_atom_query(child, mol_atom, mol, None, &query_ctx),
}),
crate::QueryNode::Not(child) => {
!evaluate_atom_query(child, mol_atom, mol, None, &query_ctx)
}
};
}
let q_an = query_atom.atomic_number();
let m_an = mol_atom.atomic_number();
if q_an != 0 && q_an != m_an {
return false;
}
if query_atom.is_aromatic() && !mol_atom.is_aromatic() {
return false;
}
if let Some(q_iso) = query_atom.isotope() {
if mol_atom.isotope() != Some(q_iso) {
return false;
}
}
let q_charge = query_atom.formal_charge();
if q_charge != 0 && q_charge != mol_atom.formal_charge() {
return false;
}
let _ = query_mol;
true
}
fn recursive_smarts_root_matches(
atom: &Atom,
recursive_smarts: &str,
mol: &Molecule,
recursive_cache: Option<&RecursiveQueryMatchCache>,
) -> bool {
if let Some(cache) = recursive_cache
&& let Some(match_starts) = cache.get(recursive_smarts)
{
return match_starts
.get(atom.id().index())
.copied()
.unwrap_or(false);
}
let Some(inner) = recursive_smarts
.strip_prefix("$(")
.and_then(|value| value.strip_suffix(')'))
else {
return false;
};
let Ok(query) = build_crystalff_query_molecule(inner) else {
return false;
};
substruct_match_impl(
mol,
&query,
&SubstructMatchParams {
max_matches: 1000,
uniquify: false,
},
)
.into_iter()
.any(|matched| matched.atom_mapping.first().copied() == Some(atom.id().index()))
}
fn atom_query_predicate_matches_for_substruct(
atom: &Atom,
pred: &AtomQueryPredicate,
mol: &Molecule,
recursive_cache: Option<&RecursiveQueryMatchCache>,
query_ctx: &QueryMatchContext,
) -> bool {
match pred {
AtomQueryPredicate::RecursiveSmarts(recursive_smarts) => {
recursive_smarts_root_matches(atom, recursive_smarts, mol, recursive_cache)
}
_ => atom_predicate_matches_with_context(atom, pred, mol, query_ctx),
}
}
fn evaluate_atom_query(
query: &crate::QueryNode<AtomQueryPredicate>,
atom: &Atom,
mol: &Molecule,
recursive_cache: Option<&RecursiveQueryMatchCache>,
query_ctx: &QueryMatchContext,
) -> bool {
match query {
crate::QueryNode::Predicate(pred) => {
atom_query_predicate_matches_for_substruct(atom, pred, mol, recursive_cache, query_ctx)
}
crate::QueryNode::And(children) => children
.iter()
.all(|c| evaluate_atom_query(c, atom, mol, recursive_cache, query_ctx)),
crate::QueryNode::Or(children) => children
.iter()
.any(|c| evaluate_atom_query(c, atom, mol, recursive_cache, query_ctx)),
crate::QueryNode::Not(child) => {
!evaluate_atom_query(child, atom, mol, recursive_cache, query_ctx)
}
}
}
fn bond_matches(query_bond: &Bond, query_mol: &Molecule, mol_bond: &Bond, mol: &Molecule) -> bool {
if let Some(query_node) = query_bond.query() {
let query_ctx = build_query_match_context(mol);
return evaluate_bond_query(query_node, mol_bond, mol, &query_ctx);
}
let q_aromatic = query_bond.is_aromatic();
let m_aromatic = mol_bond.is_aromatic();
if q_aromatic && m_aromatic {
return true;
}
let q_order = query_bond.order();
let m_order = mol_bond.order();
if q_order == m_order {
return true;
}
if q_aromatic && m_order == BondOrder::Single {
return true;
}
if m_aromatic && q_order == BondOrder::Single {
return true;
}
let _ = query_mol;
false
}
fn evaluate_bond_query(
query: &crate::QueryNode<BondQueryPredicate>,
bond: &Bond,
mol: &Molecule,
query_ctx: &QueryMatchContext,
) -> bool {
match query {
crate::QueryNode::Predicate(pred) => {
bond_predicate_matches_with_context(bond, pred, mol, query_ctx)
}
crate::QueryNode::And(children) => children
.iter()
.all(|c| evaluate_bond_query(c, bond, mol, query_ctx)),
crate::QueryNode::Or(children) => children
.iter()
.any(|c| evaluate_bond_query(c, bond, mol, query_ctx)),
crate::QueryNode::Not(child) => !evaluate_bond_query(child, bond, mol, query_ctx),
}
}
type NodeId = usize;
const NULL_NODE: NodeId = usize::MAX;
#[derive(Debug, Clone)]
struct Vf2Pair {
n1: NodeId,
n2: NodeId,
hasiter: bool,
nbr_node: NodeId,
nbr_cursor: usize,
nbr_end: usize,
}
impl Vf2Pair {
fn new() -> Self {
Self {
n1: NULL_NODE,
n2: NULL_NODE,
hasiter: false,
nbr_node: NULL_NODE,
nbr_cursor: 0,
nbr_end: 0,
}
}
}
#[derive(Debug, Clone, Copy)]
struct NodeInfo {
id: usize,
in_deg: usize,
out_deg: usize,
}
fn node_info_cmp1(a: &NodeInfo, b: &NodeInfo) -> std::cmp::Ordering {
a.out_deg
.cmp(&b.out_deg)
.then_with(|| a.in_deg.cmp(&b.in_deg))
}
fn node_info_cmp2(a: &NodeInfo, b: &NodeInfo) -> std::cmp::Ordering {
if a.in_deg == 0 && b.in_deg != 0 {
return std::cmp::Ordering::Greater;
}
if a.in_deg != 0 && b.in_deg == 0 {
return std::cmp::Ordering::Less;
}
a.out_deg
.cmp(&b.out_deg)
.then_with(|| a.in_deg.cmp(&b.in_deg))
}
fn sort_nodes_by_frequency(g: &Vf2Graph) -> Vec<NodeId> {
let mut vect: Vec<NodeInfo> = (0..g.n_atoms)
.map(|i| {
let deg = g.out_degree(i);
NodeInfo {
id: i,
in_deg: deg,
out_deg: deg,
}
})
.collect();
vect.sort_unstable_by(node_info_cmp1);
let mut i = 0;
while i < vect.len() {
let mut run = 1;
while i + run < vect.len()
&& vect[i + run].in_deg == vect[i].in_deg
&& vect[i + run].out_deg == vect[i].out_deg
{
run += 1;
}
for j in 0..run {
vect[i + j].in_deg += vect[i + j].out_deg; vect[i + j].out_deg = run; }
i += run;
}
vect.sort_unstable_by(node_info_cmp2);
vect.iter().map(|ni| ni.id).collect()
}
struct Vf2SubState<'a> {
g1: &'a Vf2Graph,
g2: &'a Vf2Graph,
n1: usize,
n2: usize,
core_len: usize,
t1_len: usize,
t2_len: usize,
core_1: Vec<NodeId>,
core_2: Vec<NodeId>,
term_1: Vec<usize>,
term_2: Vec<usize>,
order: Option<Vec<NodeId>>,
}
impl<'a> Vf2SubState<'a> {
fn new(g1: &'a Vf2Graph, g2: &'a Vf2Graph, sort_nodes: bool) -> Self {
let n1 = g1.n_atoms;
let n2 = g2.n_atoms;
let order = if sort_nodes {
Some(sort_nodes_by_frequency(g1))
} else {
None
};
Self {
g1,
g2,
n1,
n2,
core_len: 0,
t1_len: 0,
t2_len: 0,
core_1: vec![NULL_NODE; n1],
core_2: vec![NULL_NODE; n2],
term_1: vec![0usize; n1],
term_2: vec![0usize; n2],
order,
}
}
fn debug_order(&self) -> Option<&[NodeId]> {
self.order.as_deref()
}
fn is_goal(&self) -> bool {
self.core_len == self.n1
}
fn is_dead(&self) -> bool {
self.n1 > self.n2 || self.t1_len > self.t2_len
}
fn next_pair(&self, pair: &mut Vf2Pair) -> bool {
if pair.n1 == NULL_NODE {
pair.n1 = 0;
}
if pair.n2 == NULL_NODE {
pair.n2 = 0;
} else {
pair.n2 += 1;
}
if self.t1_len > self.core_len && self.t2_len > self.core_len {
while pair.n1 < self.n1
&& (self.core_1[pair.n1] != NULL_NODE || self.term_1[pair.n1] == 0)
{
pair.n1 += 1;
pair.n2 = 0;
}
if !pair.hasiter {
let mut mapped_terminal_neighbor = NULL_NODE;
for &(query_neighbor, _) in self.g1.out_edges(pair.n1) {
if self.core_1[query_neighbor] != NULL_NODE {
mapped_terminal_neighbor = self.core_1[query_neighbor];
break;
}
}
debug_assert_ne!(mapped_terminal_neighbor, NULL_NODE);
if mapped_terminal_neighbor != NULL_NODE {
pair.nbr_node = mapped_terminal_neighbor;
pair.nbr_cursor = 0;
pair.nbr_end = self.g2.out_edges(mapped_terminal_neighbor).len();
pair.hasiter = true;
}
}
} else if pair.n1 == 0 {
if let Some(order) = &self.order {
let mut i = 0;
while i < self.n1 {
let candidate = order[i];
if self.core_1[candidate] == NULL_NODE {
pair.n1 = candidate;
break;
}
i += 1;
}
if i == self.n1 {
pair.n1 = self.n1;
}
} else {
while pair.n1 < self.n1 && self.core_1[pair.n1] != NULL_NODE {
pair.n1 += 1;
pair.n2 = 0;
}
}
} else {
while pair.n1 < self.n1 && self.core_1[pair.n1] != NULL_NODE {
pair.n1 += 1;
pair.n2 = 0;
}
}
if pair.hasiter {
let neighbors = self.g2.out_edges(pair.nbr_node);
while pair.nbr_cursor < pair.nbr_end
&& self.core_2[neighbors[pair.nbr_cursor].0] != NULL_NODE
{
pair.nbr_cursor += 1;
}
if pair.nbr_cursor < pair.nbr_end {
pair.n2 = neighbors[pair.nbr_cursor].0;
pair.nbr_cursor += 1;
} else {
pair.n2 = self.n2;
}
} else if self.t1_len > self.core_len && self.t2_len > self.core_len {
while pair.n2 < self.n2
&& (self.core_2[pair.n2] != NULL_NODE || self.term_2[pair.n2] == 0)
{
pair.n2 += 1;
}
} else {
while pair.n2 < self.n2 && self.core_2[pair.n2] != NULL_NODE {
pair.n2 += 1;
}
}
let ok = pair.n1 < self.n1 && pair.n2 < self.n2;
if ok
&& env::var("RDKIT_ROW61_TRACE").ok().as_deref() == Some("1")
&& self.n1 == 5
&& self.n2 == 26
{
println!(
"row61_substruct_next_pair core_len={} t1_len={} t2_len={} hasiter={} n1={} n2={}",
self.core_len, self.t1_len, self.t2_len, pair.hasiter, pair.n1, pair.n2
);
}
ok
}
fn is_feasible_pair(
&self,
node1: NodeId,
node2: NodeId,
atom_fn: &impl Fn(usize, usize) -> bool,
bond_fn: &impl Fn(usize, usize) -> bool,
) -> bool {
debug_assert!(node1 < self.n1);
debug_assert!(node2 < self.n2);
debug_assert_eq!(self.core_1[node1], NULL_NODE);
debug_assert_eq!(self.core_2[node2], NULL_NODE);
if self.g1.out_degree(node1) > self.g2.out_degree(node2) {
return false;
}
if !atom_fn(node1, node2) {
return false;
}
for &(other1, edge_idx1) in self.g1.out_edges(node1) {
if other1 == node1 {
continue;
}
if self.core_1[other1] != NULL_NODE {
let other2 = self.core_1[other1];
let bond_found = self.find_bond(node2, other2);
match bond_found {
Some(edge_idx2) => {
if !bond_fn(edge_idx1, edge_idx2) {
return false;
}
}
None => return false,
}
}
}
true
}
fn find_bond(&self, a: NodeId, b: NodeId) -> Option<usize> {
for &(nbr, bond_idx) in self.g2.out_edges(a) {
if nbr == b {
return Some(bond_idx);
}
}
None
}
fn add_pair(&mut self, node1: NodeId, node2: NodeId) {
self.core_len += 1;
let depth = self.core_len;
if self.term_1[node1] == 0 {
self.term_1[node1] = depth;
self.t1_len += 1;
}
if self.term_2[node2] == 0 {
self.term_2[node2] = depth;
self.t2_len += 1;
}
self.core_1[node1] = node2;
self.core_2[node2] = node1;
for &(other, _) in self.g1.out_edges(node1) {
if other == node1 {
continue;
}
if self.term_1[other] == 0 {
self.term_1[other] = depth;
self.t1_len += 1;
}
}
for &(other, _) in self.g2.out_edges(node2) {
if other == node2 {
continue;
}
if self.term_2[other] == 0 {
self.term_2[other] = depth;
self.t2_len += 1;
}
}
}
fn back_track(&mut self, node1: NodeId, node2: NodeId) {
let depth = self.core_len;
if self.term_1[node1] == depth {
self.term_1[node1] = 0;
self.t1_len -= 1;
}
for &(other, _) in self.g1.out_edges(node1) {
if other == node1 {
continue;
}
if self.term_1[other] == depth {
self.term_1[other] = 0;
self.t1_len -= 1;
}
}
if self.term_2[node2] == depth {
self.term_2[node2] = 0;
self.t2_len -= 1;
}
for &(other, _) in self.g2.out_edges(node2) {
if other == node2 {
continue;
}
if self.term_2[other] == depth {
self.term_2[other] = 0;
self.t2_len -= 1;
}
}
self.core_1[node1] = NULL_NODE;
self.core_2[node2] = NULL_NODE;
self.core_len -= 1;
}
fn get_core_set(&self) -> (Vec<NodeId>, Vec<NodeId>) {
let mut c1 = Vec::with_capacity(self.core_len);
let mut c2 = Vec::with_capacity(self.core_len);
for i in 0..self.n1 {
if self.core_1[i] != NULL_NODE {
c1.push(i);
c2.push(self.core_1[i]);
}
}
(c1, c2)
}
}
fn vf2_match(
state: &mut Vf2SubState,
atom_fn: &impl Fn(usize, usize) -> bool,
bond_fn: &impl Fn(usize, usize) -> bool,
match_check: Option<&impl Fn(&[NodeId], &[NodeId]) -> bool>,
) -> Option<(Vec<NodeId>, Vec<NodeId>)> {
if state.is_goal() {
let (c1, c2) = state.get_core_set();
if match_check.map_or(true, |check| check(&c1, &c2)) {
return Some((c1, c2));
}
}
if state.is_dead() {
return None;
}
let mut pair = Vf2Pair::new();
while state.next_pair(&mut pair) {
if state.is_feasible_pair(pair.n1, pair.n2, atom_fn, bond_fn) {
state.add_pair(pair.n1, pair.n2);
if let Some(result) = vf2_match(state, atom_fn, bond_fn, match_check) {
return Some(result);
}
state.back_track(pair.n1, pair.n2);
}
}
None
}
fn vf2_match_all(
state: &mut Vf2SubState,
atom_fn: &impl Fn(usize, usize) -> bool,
bond_fn: &impl Fn(usize, usize) -> bool,
match_check: Option<&impl Fn(&[NodeId], &[NodeId]) -> bool>,
results: &mut Vec<(Vec<NodeId>, Vec<NodeId>)>,
max_matches: usize,
) -> bool {
if state.is_goal() {
let (c1, c2) = state.get_core_set();
if match_check.map_or(true, |check| check(&c1, &c2)) {
results.push((c1, c2));
return max_matches > 0 && results.len() >= max_matches;
}
}
if state.is_dead() {
return false;
}
let mut pair = Vf2Pair::new();
while state.next_pair(&mut pair) {
if state.is_feasible_pair(pair.n1, pair.n2, atom_fn, bond_fn) {
state.add_pair(pair.n1, pair.n2);
if vf2_match_all(state, atom_fn, bond_fn, match_check, results, max_matches) {
return true;
}
state.back_track(pair.n1, pair.n2);
}
}
false
}
fn match_mask(atom_mapping: &[usize], mol_num_atoms: usize) -> Vec<bool> {
let mut mask = vec![false; mol_num_atoms];
for &ma in atom_mapping {
if ma < mol_num_atoms {
mask[ma] = true;
}
}
mask
}
fn rdkit_order_and_uniquify_matches(
mut results: Vec<SubstructMatchResult>,
mol_num_atoms: usize,
params: &SubstructMatchParams,
) -> Vec<SubstructMatchResult> {
if params.uniquify {
let mut uniquified: Vec<(Vec<bool>, SubstructMatchResult)> = Vec::new();
for result in results.drain(..) {
let mask = match_mask(&result.atom_mapping, mol_num_atoms);
if let Some((_, existing)) = uniquified
.iter_mut()
.find(|(existing_mask, _)| *existing_mask == mask)
{
if result.atom_mapping < existing.atom_mapping {
*existing = result;
}
} else {
uniquified.push((mask, result));
}
}
results = uniquified.into_iter().map(|(_, result)| result).collect();
results.sort_by(|lhs, rhs| lhs.atom_mapping.cmp(&rhs.atom_mapping));
}
if results.len() > params.max_matches {
results.truncate(params.max_matches);
}
results
}
#[allow(dead_code)]
fn build_bond_mapping(
query_atom_to_mol: &[Option<usize>],
query: &Vf2Graph,
mol: &Vf2Graph,
) -> Vec<usize> {
let mut bond_mapping = Vec::with_capacity(query.n_bonds);
for bond_idx in 0..query.n_bonds {
let mut q_begin = NULL_NODE;
let mut q_end = NULL_NODE;
for qa in 0..query.n_atoms {
for &(nbr, eidx) in &query.adjacency[qa] {
if eidx == bond_idx {
q_begin = qa;
q_end = nbr;
break;
}
}
if q_begin != NULL_NODE {
break;
}
}
if q_begin != NULL_NODE {
let m_begin = query_atom_to_mol[q_begin];
let m_end = query_atom_to_mol[q_end];
if let (Some(mb), Some(me)) = (m_begin, m_end) {
let mut mol_bond_idx = NULL_NODE;
for &(nbr, eidx) in &mol.adjacency[mb] {
if nbr == me {
mol_bond_idx = eidx;
break;
}
}
bond_mapping.push(mol_bond_idx);
} else {
bond_mapping.push(NULL_NODE);
}
} else {
bond_mapping.push(NULL_NODE);
}
}
bond_mapping
}
fn collect_recursive_smarts_from_atom_query(
query: &crate::QueryNode<AtomQueryPredicate>,
recursive_smarts: &mut Vec<String>,
) {
match query {
crate::QueryNode::Predicate(AtomQueryPredicate::RecursiveSmarts(smarts)) => {
recursive_smarts.push(smarts.clone());
}
crate::QueryNode::Predicate(_) => {}
crate::QueryNode::And(children) | crate::QueryNode::Or(children) => {
for child in children {
collect_recursive_smarts_from_atom_query(child, recursive_smarts);
}
}
crate::QueryNode::Not(child) => {
collect_recursive_smarts_from_atom_query(child, recursive_smarts);
}
}
}
fn populate_recursive_query_match_cache(
mol: &Molecule,
query: &Molecule,
recursive_cache: &mut RecursiveQueryMatchCache,
) {
let mut recursive_smarts = Vec::new();
for atom in query.atoms() {
if let Some(query_node) = atom.query() {
collect_recursive_smarts_from_atom_query(query_node, &mut recursive_smarts);
}
}
recursive_smarts.sort();
recursive_smarts.dedup();
for recursive_smarts in recursive_smarts {
if recursive_cache.contains_key(&recursive_smarts) {
continue;
}
let Some(inner) = recursive_smarts
.strip_prefix("$(")
.and_then(|value| value.strip_suffix(')'))
else {
recursive_cache.insert(recursive_smarts, vec![false; mol.num_atoms()]);
continue;
};
let Ok(inner_query) = build_crystalff_query_molecule(inner) else {
recursive_cache.insert(recursive_smarts, vec![false; mol.num_atoms()]);
continue;
};
populate_recursive_query_match_cache(mol, &inner_query, recursive_cache);
let matches = substruct_match_impl_with_recursive_cache(
mol,
&inner_query,
&SubstructMatchParams {
max_matches: 1000,
uniquify: false,
},
Some(recursive_cache),
);
let mut match_starts = vec![false; mol.num_atoms()];
for matched in matches {
if let Some(&root_atom_idx) = matched.atom_mapping.first()
&& root_atom_idx != NULL_NODE
&& root_atom_idx < match_starts.len()
{
match_starts[root_atom_idx] = true;
}
}
recursive_cache.insert(recursive_smarts, match_starts);
}
}
fn substruct_match_impl_with_recursive_cache(
mol: &Molecule,
query: &Molecule,
params: &SubstructMatchParams,
recursive_cache: Option<&RecursiveQueryMatchCache>,
) -> Vec<SubstructMatchResult> {
let m_num_atoms = mol.num_atoms();
let q_num_atoms = query.num_atoms();
let trace_row64_substruct =
env::var("RDKIT_ROW64_SUBSTRUCT_TRACE").ok().as_deref() == Some("1") && m_num_atoms == 106;
let trace_row61_torsion349 = row61_torsion349_substruct_trace_enabled(mol, query, params);
let query_ctx = build_query_match_context(mol);
if m_num_atoms == 0 || q_num_atoms == 0 || q_num_atoms > m_num_atoms {
return Vec::new();
}
let graph_start = trace_row64_substruct.then(Instant::now);
let q_graph = build_vf2_graph(query);
let m_graph = build_vf2_graph(mol);
let graph_elapsed = graph_start.map(|start| start.elapsed().as_secs_f64());
let atom_fn = |qi: usize, mj: usize| -> bool {
let qa = &query.atoms()[qi];
let ma = &mol.atoms()[mj];
atom_matches_with_recursive_cache(qa, query, ma, mol, recursive_cache, &query_ctx)
};
let bond_fn = |qei: usize, mei: usize| -> bool {
let qb = &query.bonds()[qei];
let mb = &mol.bonds()[mei];
if let Some(query_node) = qb.query() {
evaluate_bond_query(query_node, mb, mol, &query_ctx)
} else {
bond_matches(qb, query, mb, mol)
}
};
let mut state = Vf2SubState::new(&q_graph, &m_graph, false);
if trace_row61_torsion349 {
println!(
"row61_substruct_begin q_num_atoms={} q_num_bonds={} m_num_atoms={} order={:?}",
q_num_atoms,
query.num_bonds(),
m_num_atoms,
state.debug_order()
);
}
let mut raw_matches: Vec<(Vec<NodeId>, Vec<NodeId>)> = Vec::new();
let check_fn = |_c1: &[NodeId], _c2: &[NodeId]| -> bool { true };
let vf2_start = trace_row64_substruct.then(Instant::now);
vf2_match_all(
&mut state,
&atom_fn,
&bond_fn,
Some(&check_fn),
&mut raw_matches,
params.max_matches,
);
if let Some(vf2_start) = vf2_start {
println!(
"row64_substruct_core q_atoms={} q_bonds={} graph_build={:.6} vf2={:.6} raw_matches={}",
q_num_atoms,
query.num_bonds(),
graph_elapsed.unwrap_or(0.0),
vf2_start.elapsed().as_secs_f64(),
raw_matches.len()
);
}
let mut results: Vec<SubstructMatchResult> = Vec::new();
for (c1, c2) in &raw_matches {
if trace_row61_torsion349 {
println!("row61_substruct_raw_core c1={c1:?} c2={c2:?}");
}
let mut atom_to_mol: Vec<Option<usize>> = vec![None; q_num_atoms];
for (&qa, &ma) in c1.iter().zip(c2.iter()) {
if qa < q_num_atoms {
atom_to_mol[qa] = Some(ma);
}
}
let mut bond_mapping = Vec::with_capacity(query.num_bonds());
for qbond in query.bonds() {
let q_begin = qbond.begin().index();
let q_end = qbond.end().index();
let m_begin = atom_to_mol[q_begin];
let m_end = atom_to_mol[q_end];
match (m_begin, m_end) {
(Some(mb), Some(me)) => {
let found = m_graph.adjacency[mb]
.iter()
.find(|&&(nbr, _)| nbr == me)
.map(|&(_, eidx)| eidx);
bond_mapping.push(found.unwrap_or(NULL_NODE));
}
_ => {
bond_mapping.push(NULL_NODE);
}
}
}
results.push(SubstructMatchResult {
atom_mapping: atom_to_mol
.into_iter()
.map(|x| x.unwrap_or(NULL_NODE))
.collect(),
bond_mapping,
});
}
rdkit_order_and_uniquify_matches(results, m_num_atoms, params)
}
fn atom_matches_with_recursive_cache(
query_atom: &Atom,
query_mol: &Molecule,
mol_atom: &Atom,
mol: &Molecule,
recursive_cache: Option<&RecursiveQueryMatchCache>,
query_ctx: &QueryMatchContext,
) -> bool {
if let Some(query_node) = query_atom.query() {
return match query_node {
crate::QueryNode::Predicate(pred) => atom_query_predicate_matches_for_substruct(
mol_atom,
pred,
mol,
recursive_cache,
query_ctx,
),
crate::QueryNode::And(children) => children.iter().all(|child| match child {
crate::QueryNode::Predicate(pred) => atom_query_predicate_matches_for_substruct(
mol_atom,
pred,
mol,
recursive_cache,
query_ctx,
),
_ => evaluate_atom_query(child, mol_atom, mol, recursive_cache, query_ctx),
}),
crate::QueryNode::Or(children) => children.iter().any(|child| match child {
crate::QueryNode::Predicate(pred) => atom_query_predicate_matches_for_substruct(
mol_atom,
pred,
mol,
recursive_cache,
query_ctx,
),
_ => evaluate_atom_query(child, mol_atom, mol, recursive_cache, query_ctx),
}),
crate::QueryNode::Not(child) => {
!evaluate_atom_query(child, mol_atom, mol, recursive_cache, query_ctx)
}
};
}
atom_matches(query_atom, query_mol, mol_atom, mol)
}
fn substruct_match_impl(
mol: &Molecule,
query: &Molecule,
params: &SubstructMatchParams,
) -> Vec<SubstructMatchResult> {
let trace_row64_substruct = env::var("RDKIT_ROW64_SUBSTRUCT_TRACE").ok().as_deref()
== Some("1")
&& mol.num_atoms() == 106;
let recursive_cache_start = trace_row64_substruct.then(Instant::now);
let mut recursive_cache = RecursiveQueryMatchCache::new();
populate_recursive_query_match_cache(mol, query, &mut recursive_cache);
let recursive_cache_elapsed = recursive_cache_start.map(|start| start.elapsed().as_secs_f64());
let match_start = trace_row64_substruct.then(Instant::now);
let results =
substruct_match_impl_with_recursive_cache(mol, query, params, Some(&recursive_cache));
if let Some(match_start) = match_start {
println!(
"row64_substruct_timing q_atoms={} q_bonds={} recursive_cache={:.6} match_core={:.6} matches={}",
query.num_atoms(),
query.num_bonds(),
recursive_cache_elapsed.unwrap_or(0.0),
match_start.elapsed().as_secs_f64(),
results.len()
);
}
results
}
pub fn has_substruct_match(mol: &Molecule, query: &Molecule) -> bool {
let params = SubstructMatchParams::default();
let mut params = params;
params.max_matches = 1;
!substruct_match_impl(mol, query, ¶ms).is_empty()
}
pub fn get_substruct_match(mol: &Molecule, query: &Molecule) -> Option<SubstructMatchResult> {
let params = SubstructMatchParams::default();
let mut params = params;
params.max_matches = 1;
substruct_match_impl(mol, query, ¶ms).into_iter().next()
}
pub fn get_substruct_matches(mol: &Molecule, query: &Molecule) -> Vec<SubstructMatchResult> {
let params = SubstructMatchParams::default();
substruct_match_impl(mol, query, ¶ms)
}
pub fn get_substruct_matches_with_params(
mol: &Molecule,
query: &Molecule,
params: &SubstructMatchParams,
) -> Vec<SubstructMatchResult> {
substruct_match_impl(mol, query, params)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::MoleculeBuilder;
fn make_mol_c() -> Molecule {
let mut builder = MoleculeBuilder::new();
builder.add_atom(crate::AtomSpec::new(crate::Element::C));
builder.build().expect("build methane")
}
fn make_mol_cc() -> Molecule {
let mut builder = MoleculeBuilder::new();
let c0 = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
let c1 = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
builder
.add_bond(crate::BondSpec::new(c0, c1, BondOrder::Single))
.expect("add bond");
builder.build().expect("build ethane")
}
fn make_mol_cco() -> Molecule {
let mut builder = MoleculeBuilder::new();
let c0 = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
let c1 = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
let o = builder.add_atom(crate::AtomSpec::new(crate::Element::O));
builder
.add_bond(crate::BondSpec::new(c0, c1, BondOrder::Single))
.expect("add C-C bond");
builder
.add_bond(crate::BondSpec::new(c1, o, BondOrder::Single))
.expect("add C-O bond");
builder.build().expect("build ethanol")
}
fn make_mol_coc() -> Molecule {
let mut builder = MoleculeBuilder::new();
let c0 = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
let o = builder.add_atom(crate::AtomSpec::new(crate::Element::O));
let c1 = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
builder
.add_bond(crate::BondSpec::new(c0, o, BondOrder::Single))
.expect("add C-O bond 1");
builder
.add_bond(crate::BondSpec::new(o, c1, BondOrder::Single))
.expect("add O-C bond 2");
builder.build().expect("build dimethyl ether")
}
#[test]
fn test_has_substruct_match_self() {
let c = make_mol_c();
assert!(
has_substruct_match(&c, &c),
"a molecule should match itself"
);
}
#[test]
fn test_has_substruct_match_cc_in_cco() {
let cc = make_mol_cc();
let cco = make_mol_cco();
assert!(
has_substruct_match(&cco, &cc),
"CCO should contain CC as substructure"
);
}
#[test]
fn test_has_substruct_match_no_match() {
let c = make_mol_c();
let cco = make_mol_cco();
assert!(
!has_substruct_match(&c, &cco),
"a single carbon should not contain CCO"
);
}
#[test]
fn test_get_substruct_match_self() {
let cco = make_mol_cco();
let result = get_substruct_match(&cco, &cco);
assert!(result.is_some(), "self-match should return Some");
let result = result.unwrap();
assert_eq!(result.atom_mapping.len(), 3);
for (qa, ma) in result.atom_mapping.iter().enumerate() {
assert_eq!(*ma, qa, "self-match should have identity mapping");
}
}
#[test]
fn test_get_substruct_match_cc_in_cco() {
let cc = make_mol_cc();
let cco = make_mol_cco();
let result = get_substruct_match(&cco, &cc);
assert!(result.is_some(), "CC should match in CCO");
}
#[test]
fn test_get_substruct_match_no_match() {
let c = make_mol_c();
let cco = make_mol_cco();
let result = get_substruct_match(&c, &cco);
assert!(
result.is_none(),
"C should not match CCO (query larger than mol)"
);
}
#[test]
fn test_get_substruct_matches_cco_in_cco() {
let cco = make_mol_cco();
let matches = get_substruct_matches(&cco, &cco);
assert!(!matches.is_empty(), "should find at least self-match");
}
#[test]
fn test_substruct_coc_matches_cco() {
let coc = make_mol_coc();
let cco = make_mol_cco();
assert!(
!has_substruct_match(&cco, &coc),
"CCO should not match COC topology"
);
let mut builder = MoleculeBuilder::new();
let c = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
let o = builder.add_atom(crate::AtomSpec::new(crate::Element::O));
builder
.add_bond(crate::BondSpec::new(c, o, BondOrder::Single))
.expect("add CO bond");
let co = builder.build().expect("build CO");
assert!(has_substruct_match(&cco, &co), "CCO should match CO");
}
#[test]
fn test_has_substruct_match_empty_mol() {
let empty = Molecule::new();
let c = make_mol_c();
assert!(
!has_substruct_match(&empty, &c),
"empty molecule should not match anything"
);
assert!(
!has_substruct_match(&c, &empty),
"molecule should not match empty query"
);
}
#[test]
fn test_substruct_match_params_max_matches() {
let c = make_mol_c();
let params = SubstructMatchParams {
max_matches: 1,
uniquify: true,
};
let matches = get_substruct_matches_with_params(&c, &c, ¶ms);
assert_eq!(matches.len(), 1, "max_matches=1 should return one match");
}
#[test]
fn test_atom_matches_basic() {
let mut builder = MoleculeBuilder::new();
let c0 = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
let c1 = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
builder
.add_bond(crate::BondSpec::new(c0, c1, BondOrder::Single))
.expect("add bond");
let mol = builder.build().expect("build");
let q_atom = &mol.atoms()[0];
let m_atom = &mol.atoms()[1];
assert!(
atom_matches(q_atom, &mol, m_atom, &mol),
"two carbons should match"
);
}
#[test]
fn test_atom_matches_different_elements() {
let mut builder = MoleculeBuilder::new();
let c = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
let o = builder.add_atom(crate::AtomSpec::new(crate::Element::O));
builder
.add_bond(crate::BondSpec::new(c, o, BondOrder::Single))
.expect("add bond");
let mol = builder.build().expect("build");
let c_atom = &mol.atoms()[0]; let o_atom = &mol.atoms()[1]; assert!(
!atom_matches(c_atom, &mol, o_atom, &mol),
"C should not match O via basic atomic number check"
);
}
#[test]
fn test_bond_matches_single() {
let mut builder = MoleculeBuilder::new();
let c0 = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
let c1 = builder.add_atom(crate::AtomSpec::new(crate::Element::C));
builder
.add_bond(crate::BondSpec::new(c0, c1, BondOrder::Single))
.expect("add bond");
let mol = builder.build().expect("build");
assert!(bond_matches(&mol.bonds()[0], &mol, &mol.bonds()[0], &mol));
}
#[test]
fn test_vf2_graph_building() {
let cc = make_mol_cc();
let g = build_vf2_graph(&cc);
assert_eq!(g.n_atoms, 2);
assert_eq!(g.n_bonds, 1);
assert_eq!(g.out_degree(0), 1);
assert_eq!(g.out_degree(1), 1);
assert_eq!(g.out_edges(0)[0].0, 1);
assert_eq!(g.out_edges(1)[0].0, 0);
}
#[test]
fn test_sort_nodes_by_frequency_small() {
let cc = make_mol_cc();
let g = build_vf2_graph(&cc);
let order = sort_nodes_by_frequency(&g);
assert_eq!(order.len(), 2);
assert!(order.contains(&0));
assert!(order.contains(&1));
}
#[test]
fn test_vf2_state_initial() {
let cc = make_mol_cc();
let g = build_vf2_graph(&cc);
let state = Vf2SubState::new(&g, &g, true);
assert!(!state.is_goal());
assert!(!state.is_dead());
assert_eq!(state.core_len, 0);
}
#[test]
fn test_vf2_next_pair_initial() {
let cc = make_mol_cc();
let g = build_vf2_graph(&cc);
let state = Vf2SubState::new(&g, &g, true);
let mut pair = Vf2Pair::new();
let has_next = state.next_pair(&mut pair);
assert!(has_next, "should find a next pair");
}
}