assembly_theory/assembly.rs
1//! Compute assembly indices of molecules.
2//!
3//! # Example
4//! ```
5//! # use std::{fs, path::PathBuf};
6//! use assembly_theory::assembly::index;
7//! use assembly_theory::loader::parse_molfile_str;
8//!
9//! # fn main() -> Result<(), std::io::Error> {
10//! // Load a molecule from a .mol file.
11//! let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
12//! let molfile = fs::read_to_string(path)?;
13//! let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
14//!
15//! // Compute the molecule's assembly index.
16//! assert_eq!(index(&anthracene), 6);
17//! # Ok(())
18//! # }
19//! ```
20
21use std::sync::{
22 atomic::{AtomicUsize, Ordering::Relaxed},
23 Arc,
24};
25
26use bit_set::BitSet;
27use clap::ValueEnum;
28use dashmap::DashMap;
29use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
30
31use crate::{
32 bounds::{bound_exceeded, Bound},
33 canonize::{canonize, CanonizeMode, Labeling},
34 enumerate::{enumerate_subgraphs, EnumerateMode},
35 kernels::KernelMode,
36 memoize::{Cache, MemoizeMode},
37 molecule::Molecule,
38 utils::connected_components_under_edges,
39};
40
41/// Parallelization strategy for the recursive search phase.
42#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
43pub enum ParallelMode {
44 /// No parallelism.
45 None,
46 /// Create a task pool from the recursion's first level only.
47 DepthOne,
48 /// Spawn a new thread at every recursive call.
49 Always,
50}
51
52/// Compute assembly depth; see
53/// [Pagel et al. (2024)](https://arxiv.org/abs/2409.05993).
54///
55/// Note: This function is currently very (unusably) slow. It primarily exists
56/// in this crate as a proof of concept.
57///
58/// # Example
59/// ```
60/// # use std::{fs, path::PathBuf};
61/// use assembly_theory::assembly::depth;
62/// use assembly_theory::loader::parse_molfile_str;
63///
64/// # fn main() -> Result<(), std::io::Error> {
65/// // Load a molecule from a .mol file.
66/// let path = PathBuf::from(format!("./data/checks/benzene.mol"));
67/// let molfile = fs::read_to_string(path)?;
68/// let benzene = parse_molfile_str(&molfile).expect("Parsing failure.");
69///
70/// // Compute the molecule's assembly depth.
71/// assert_eq!(depth(&benzene), 3);
72/// # Ok(())
73/// # }
74/// ```
75pub fn depth(mol: &Molecule) -> u32 {
76 let mut ix = u32::MAX;
77 for (left, right) in mol.partitions().unwrap() {
78 let l = if left.is_basic_unit() {
79 0
80 } else {
81 depth(&left)
82 };
83
84 let r = if right.is_basic_unit() {
85 0
86 } else {
87 depth(&right)
88 };
89
90 ix = ix.min(l.max(r) + 1)
91 }
92 ix
93}
94
95/// Helper function for [`index_search`]; only public for benchmarking.
96///
97/// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule,
98/// sorted to guarantee deterministic iteration.
99pub fn matches(
100 mol: &Molecule,
101 enumerate_mode: EnumerateMode,
102 canonize_mode: CanonizeMode,
103 parallel_mode: ParallelMode,
104) -> Vec<(BitSet, BitSet)> {
105 // Enumerate all connected, non-induced subgraphs with at most |E|/2 edges
106 // and bin them into isomorphism classes using canonization.
107 let isomorphism_classes = DashMap::<Labeling, Vec<BitSet>>::new();
108 let bin_subgraph = |subgraph: &BitSet| {
109 isomorphism_classes
110 .entry(canonize(mol, subgraph, canonize_mode))
111 .and_modify(|bucket| bucket.push(subgraph.clone()))
112 .or_insert(vec![subgraph.clone()]);
113 };
114 if parallel_mode == ParallelMode::None {
115 enumerate_subgraphs(mol, enumerate_mode)
116 .iter()
117 .for_each(bin_subgraph);
118 } else {
119 enumerate_subgraphs(mol, enumerate_mode)
120 .par_iter()
121 .for_each(bin_subgraph);
122 }
123
124 // In each isomorphism class, collect non-overlapping pairs of subgraphs.
125 let mut matches = Vec::new();
126 for bucket in isomorphism_classes.iter() {
127 for (i, first) in bucket.iter().enumerate() {
128 for second in &bucket[i..] {
129 if first.is_disjoint(second) {
130 if first > second {
131 matches.push((first.clone(), second.clone()));
132 } else {
133 matches.push((second.clone(), first.clone()));
134 }
135 }
136 }
137 }
138 }
139
140 // Sort pairs in a deterministic order and return.
141 matches.sort_by(|e1, e2| {
142 let ord = [
143 e2.0.len().cmp(&e1.0.len()), // Decreasing subgraph size.
144 e1.0.cmp(&e2.0), // First subgraph lexicographical.
145 e1.1.cmp(&e2.1), // Second subgraph lexicographical.
146 ];
147 let mut i = 0;
148 while ord[i] == std::cmp::Ordering::Equal {
149 i += 1;
150 }
151 ord[i]
152 });
153
154 matches
155}
156
157/// Determine the fragments produced from the given assembly state by removing
158/// the given pair of non-overlapping, isomorphic subgraphs and then adding one
159/// back; return `None` if not possible.
160fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Option<Vec<BitSet>> {
161 // Attempt to find fragments f1 and f2 containing h1 and h2, respectively;
162 // if either do not exist, exit without further fragmentation.
163 let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c));
164 let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c));
165 let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else {
166 return None;
167 };
168
169 let mut fragments = state.to_owned();
170
171 // If the same fragment f1 (== f2) contains both h1 and h2, replace this
172 // one fragment f1 with all connected components comprising f1 - (h1 U h2).
173 // Otherwise, replace fragments f1 and f2 with all connected components
174 // comprising f1 - h1 and f2 - h2, respectively.
175 if i1 == i2 {
176 let mut union = h1.clone();
177 union.union_with(h2);
178 let mut difference = f1.clone();
179 difference.difference_with(&union);
180 let c = connected_components_under_edges(mol.graph(), &difference);
181 fragments.extend(c);
182 fragments.swap_remove(i1);
183 } else {
184 let mut diff1 = f1.clone();
185 diff1.difference_with(h1);
186 let c1 = connected_components_under_edges(mol.graph(), &diff1);
187 fragments.extend(c1);
188
189 let mut diff2 = f2.clone();
190 diff2.difference_with(h2);
191 let c2 = connected_components_under_edges(mol.graph(), &diff2);
192 fragments.extend(c2);
193
194 fragments.swap_remove(i1.max(i2));
195 fragments.swap_remove(i1.min(i2));
196 }
197
198 // Drop any singleton fragments, add h1 as a fragment, and return.
199 fragments.retain(|i| i.len() > 1);
200 fragments.push(h1.clone());
201 Some(fragments)
202}
203
204/// Recursive helper for [`index_search`], only public for benchmarking.
205///
206/// Inputs:
207/// - `mol`: The molecule whose assembly index is being calculated.
208/// - `matches`: The remaining non-overlapping isomorphic subgraph pairs.
209/// - `removal_order`: TODO
210/// - `state`: The current assembly state, i.e., a list of fragments.
211/// - `state_index`: This assembly state's upper bound on the assembly index,
212/// i.e., edges(mol) - 1 - [edges(subgraphs removed) - #(subgraphs removed)].
213/// - `best_index`: The smallest assembly index for all assembly states so far.
214/// - `largest_remove`: An upper bound on the size of fragments that can be
215/// removed from this or any descendant assembly state.
216/// - `bounds`: The list of bounding strategies to apply.
217/// - `cache`: TODO
218/// - `parallel_mode`: The parallelism mode for this state's match iteration.
219///
220/// Returns, from this assembly state and any of its descendents:
221/// - `usize`: An updated upper bound on the assembly index. (Note: If this
222/// state is pruned by bounds or deemed redundant by memoization, then the
223/// upper bound returned is unchanged.)
224/// - `usize`: The number of assembly states searched.
225#[allow(clippy::too_many_arguments)]
226pub fn recurse_index_search(
227 mol: &Molecule,
228 matches: &[(BitSet, BitSet)],
229 removal_order: Vec<usize>,
230 state: &[BitSet],
231 state_index: usize,
232 best_index: Arc<AtomicUsize>,
233 largest_remove: usize,
234 bounds: &[Bound],
235 cache: &mut Cache,
236 parallel_mode: ParallelMode,
237) -> (usize, usize) {
238 // If any bounds would prune this assembly state or if memoization is
239 // enabled and this assembly state is preempted by the cached state, halt.
240 if bound_exceeded(
241 mol,
242 state,
243 state_index,
244 best_index.load(Relaxed),
245 largest_remove,
246 bounds,
247 ) || cache.memoize_state(mol, state, state_index, &removal_order)
248 {
249 return (state_index, 1);
250 }
251
252 // Keep track of the best assembly index found in any of this assembly
253 // state's children and the number of states searched, including this one.
254 let best_child_index = AtomicUsize::from(state_index);
255 let states_searched = AtomicUsize::from(1);
256
257 // Define a closure that handles recursing to a new assembly state based on
258 // the given (enumerated) pair of non-overlapping isomorphic subgraphs.
259 let recurse_on_match = |i: usize, h1: &BitSet, h2: &BitSet| {
260 if let Some(fragments) = fragments(mol, state, h1, h2) {
261 // If using depth-one parallelism, all descendant states should be
262 // computed serially.
263 let new_parallel = if parallel_mode == ParallelMode::DepthOne {
264 ParallelMode::None
265 } else {
266 parallel_mode
267 };
268
269 // Recurse using the remaining matches and updated fragments.
270 let (child_index, child_states_searched) = recurse_index_search(
271 mol,
272 &matches[i + 1..],
273 {
274 let mut clone = removal_order.clone();
275 clone.push(i);
276 clone
277 },
278 &fragments,
279 state_index - h1.len() + 1,
280 best_index.clone(),
281 h1.len(),
282 bounds,
283 &mut cache.clone(),
284 new_parallel,
285 );
286
287 // Update the best assembly indices (across children states and
288 // the entire search) and the number of descendant states searched.
289 best_child_index.fetch_min(child_index, Relaxed);
290 best_index.fetch_min(best_child_index.load(Relaxed), Relaxed);
291 states_searched.fetch_add(child_states_searched, Relaxed);
292 }
293 };
294
295 // Use the iterator type corresponding to the specified parallelism mode.
296 if parallel_mode == ParallelMode::None {
297 matches
298 .iter()
299 .enumerate()
300 .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2));
301 } else {
302 matches
303 .par_iter()
304 .enumerate()
305 .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2));
306 }
307
308 (
309 best_child_index.load(Relaxed),
310 states_searched.load(Relaxed),
311 )
312}
313
314/// Compute a molecule's assembly index and related information using a
315/// top-down recursive algorithm, parameterized by the specified options.
316///
317/// See [`EnumerateMode`], [`CanonizeMode`], [`ParallelMode`], [`KernelMode`],
318/// and [`Bound`] for details on how to customize the algorithm. Notably,
319/// bounds are applied in the order they appear in the `bounds` slice. It is
320/// generally better to provide bounds that are quick to compute first.
321///
322/// The results returned are:
323/// - The molecule's `u32` assembly index.
324/// - The molecule's `u32` number of non-overlapping isomorphic subgraph pairs.
325/// - The `usize` total number of assembly states searched, where an assembly
326/// state is a collection of fragments. Note that, depending on the algorithm
327/// parameters used, some states may be searched/counted multiple times.
328///
329/// # Example
330/// ```
331/// # use std::{fs, path::PathBuf};
332/// use assembly_theory::{
333/// assembly::{index_search, ParallelMode},
334/// bounds::Bound,
335/// canonize::CanonizeMode,
336/// enumerate::EnumerateMode,
337/// kernels::KernelMode,
338/// loader::parse_molfile_str,
339/// memoize::MemoizeMode,
340/// };
341///
342/// # fn main() -> Result<(), std::io::Error> {
343/// // Load a molecule from a .mol file.
344/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
345/// let molfile = fs::read_to_string(path)?;
346/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
347///
348/// // Compute the molecule's assembly index without parallelism, memoization,
349/// // kernelization, or bounds.
350/// let (slow_index, _, _) = index_search(
351/// &anthracene,
352/// EnumerateMode::GrowErode,
353/// CanonizeMode::TreeNauty,
354/// ParallelMode::None,
355/// MemoizeMode::None,
356/// KernelMode::None,
357/// &[],
358/// );
359///
360/// // Compute the molecule's assembly index with parallelism, memoization, and
361/// // some bounds.
362/// let (fast_index, _, _) = index_search(
363/// &anthracene,
364/// EnumerateMode::GrowErode,
365/// CanonizeMode::TreeNauty,
366/// ParallelMode::DepthOne,
367/// MemoizeMode::CanonIndex,
368/// KernelMode::None,
369/// &[Bound::Log, Bound::Int],
370/// );
371///
372/// assert_eq!(slow_index, 6);
373/// assert_eq!(fast_index, 6);
374/// # Ok(())
375/// # }
376/// ```
377pub fn index_search(
378 mol: &Molecule,
379 enumerate_mode: EnumerateMode,
380 canonize_mode: CanonizeMode,
381 parallel_mode: ParallelMode,
382 memoize_mode: MemoizeMode,
383 kernel_mode: KernelMode,
384 bounds: &[Bound],
385) -> (u32, u32, usize) {
386 // Catch not-yet-implemented modes.
387 if kernel_mode != KernelMode::None {
388 panic!("The chosen --kernel mode is not implemented yet!")
389 }
390
391 // Enumerate non-overlapping isomorphic subgraph pairs.
392 let matches = matches(mol, enumerate_mode, canonize_mode, parallel_mode);
393
394 // Create memoization cache.
395 let mut cache = Cache::new(memoize_mode, canonize_mode);
396
397 // Initialize the first fragment as the entire graph.
398 let mut init = BitSet::new();
399 init.extend(mol.graph().edge_indices().map(|ix| ix.index()));
400
401 // Search for the shortest assembly pathway recursively.
402 let edge_count = mol.graph().edge_count();
403 let best_index = Arc::new(AtomicUsize::from(edge_count - 1));
404 let (index, states_searched) = recurse_index_search(
405 mol,
406 &matches,
407 Vec::new(),
408 &[init],
409 edge_count - 1,
410 best_index,
411 edge_count,
412 bounds,
413 &mut cache,
414 parallel_mode,
415 );
416
417 (index as u32, matches.len() as u32, states_searched)
418}
419
420/// Compute a molecule's assembly index using an efficient default strategy.
421///
422/// # Example
423/// ```
424/// # use std::{fs, path::PathBuf};
425/// use assembly_theory::assembly::index;
426/// use assembly_theory::loader::parse_molfile_str;
427///
428/// # fn main() -> Result<(), std::io::Error> {
429/// // Load a molecule from a .mol file.
430/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
431/// let molfile = fs::read_to_string(path)?;
432/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
433///
434/// // Compute the molecule's assembly index.
435/// assert_eq!(index(&anthracene), 6);
436/// # Ok(())
437/// # }
438/// ```
439pub fn index(mol: &Molecule) -> u32 {
440 index_search(
441 mol,
442 EnumerateMode::GrowErode,
443 CanonizeMode::TreeNauty,
444 ParallelMode::DepthOne,
445 MemoizeMode::CanonIndex,
446 KernelMode::None,
447 &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags],
448 )
449 .0
450}