assembly_theory/assembly.rs
1//! Compute assembly indices of molecules.
2//!
3//! # Example
4//! ```
5//! # use std::{fs, path::PathBuf};
6//! use assembly_theory::assembly::index;
7//! use assembly_theory::loader::parse_molfile_str;
8//!
9//! # fn main() -> Result<(), std::io::Error> {
10//! // Load a molecule from a .mol file.
11//! let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
12//! let molfile = fs::read_to_string(path)?;
13//! let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
14//!
15//! // Compute the molecule's assembly index.
16//! assert_eq!(index(&anthracene), 6);
17//! # Ok(())
18//! # }
19//! ```
20
21use std::{
22 sync::{
23 atomic::{AtomicUsize, Ordering::Relaxed},
24 Arc,
25 },
26 time::Duration,
27};
28
29use bit_set::BitSet;
30use clap::ValueEnum;
31use rayon::iter::{ParallelBridge, ParallelIterator};
32use tokio::{runtime::Runtime, sync::oneshot, time::timeout as tktimeout};
33
34use crate::{
35 bounds::{state_bounds, Bound},
36 canonize::CanonizeMode,
37 kernels::KernelMode,
38 matches::Matches,
39 memoize::{Cache, MemoizeMode},
40 molecule::Molecule,
41 state::State,
42 utils::connected_components_under_edges,
43};
44
45/// Parallelization strategy for the recursive search phase.
46#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
47pub enum ParallelMode {
48 /// No parallelism.
49 None,
50 /// Create a task pool from the recursion's first level only.
51 DepthOne,
52 /// Spawn a new thread at every recursive call.
53 Always,
54}
55
56/// Compute assembly depth; see
57/// [Pagel et al. (2024)](https://arxiv.org/abs/2409.05993).
58///
59/// Note: This function is currently very (unusably) slow. It primarily exists
60/// in this crate as a proof of concept.
61///
62/// # Example
63/// ```
64/// # use std::{fs, path::PathBuf};
65/// use assembly_theory::assembly::depth;
66/// use assembly_theory::loader::parse_molfile_str;
67///
68/// # fn main() -> Result<(), std::io::Error> {
69/// // Load a molecule from a .mol file.
70/// let path = PathBuf::from(format!("./data/checks/benzene.mol"));
71/// let molfile = fs::read_to_string(path)?;
72/// let benzene = parse_molfile_str(&molfile).expect("Parsing failure.");
73///
74/// // Compute the molecule's assembly depth.
75/// assert_eq!(depth(&benzene), 3);
76/// # Ok(())
77/// # }
78/// ```
79pub fn depth(mol: &Molecule) -> u32 {
80 let mut ix = u32::MAX;
81 for (left, right) in mol.partitions().unwrap() {
82 let l = if left.is_basic_unit() {
83 0
84 } else {
85 depth(&left)
86 };
87
88 let r = if right.is_basic_unit() {
89 0
90 } else {
91 depth(&right)
92 };
93
94 ix = ix.min(l.max(r) + 1)
95 }
96 ix
97}
98
99/// Determine the fragments produced from the given assembly state by removing
100/// the given pair of edge-disjoint, isomorphic subgraphs and then adding one
101/// back; return `None` if not possible.
102fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Option<Vec<BitSet>> {
103 // Attempt to find fragments f1 and f2 containing h1 and h2, respectively;
104 // if either do not exist, exit without further fragmentation.
105 let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c));
106 let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c));
107 let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else {
108 return None;
109 };
110
111 let mut fragments = state.to_owned();
112
113 // If the same fragment f1 (== f2) contains both h1 and h2, replace this
114 // one fragment f1 with all connected components comprising f1 - (h1 U h2).
115 // Otherwise, replace fragments f1 and f2 with all connected components
116 // comprising f1 - h1 and f2 - h2, respectively.
117 if i1 == i2 {
118 let mut union = h1.clone();
119 union.union_with(h2);
120 let mut difference = f1.clone();
121 difference.difference_with(&union);
122 let c = connected_components_under_edges(mol.graph(), &difference);
123 fragments.extend(c);
124 fragments.swap_remove(i1);
125 } else {
126 let mut diff1 = f1.clone();
127 diff1.difference_with(h1);
128 let c1 = connected_components_under_edges(mol.graph(), &diff1);
129 fragments.extend(c1);
130
131 let mut diff2 = f2.clone();
132 diff2.difference_with(h2);
133 let c2 = connected_components_under_edges(mol.graph(), &diff2);
134 fragments.extend(c2);
135
136 fragments.swap_remove(i1.max(i2));
137 fragments.swap_remove(i1.min(i2));
138 }
139
140 // Drop any singleton fragments, add h1 as a fragment, and return.
141 fragments.retain(|i| i.len() > 1);
142 fragments.push(h1.clone());
143 Some(fragments)
144}
145
146/// Recursive helper for [`index_search`], only public for benchmarking.
147///
148/// Inputs:
149/// - `mol`: The molecule whose assembly index is being calculated.
150/// - `matches`: Structural information about the molecule's matched fragments.
151/// - `state`: The current assembly state.
152/// - `best_index`: The smallest assembly index for all assembly states so far.
153/// - `bounds`: The list of bounding strategies to apply.
154/// - `cache`: Memoization cache storing previously searched assembly states.
155/// - `parallel_mode`: The parallelism mode for this state's match iteration.
156///
157/// Returns, from this assembly state and any of its descendents:
158/// - `usize`: An updated upper bound on the assembly index. (Note: If this
159/// state is pruned by bounds or deemed redundant by memoization, then the
160/// upper bound returned is unchanged.)
161/// - `usize`: The number of assembly states searched.
162pub fn recurse_index_search(
163 mol: &Molecule,
164 matches: &Matches,
165 state: &State,
166 best_index: Arc<AtomicUsize>,
167 bounds: &[Bound],
168 cache: &mut Cache,
169 parallel_mode: ParallelMode,
170) -> (usize, usize) {
171 // If any bounds would prune this assembly state or if memoization is
172 // enabled and this assembly state is preempted by the cached state, halt.
173 if state_bounds(mol, state, best_index.load(Relaxed), bounds) || cache.memoize_state(mol, state)
174 {
175 return (state.index(), 1);
176 }
177
178 // Generate a list of matches (i.e., pairs of edge-disjoint, isomorphic
179 // fragments) to remove from this state.
180 let (intermediate_frags, matches_to_remove): (Vec<BitSet>, Vec<usize>) =
181 matches.matches_to_remove(mol, state, best_index.load(Relaxed), bounds);
182
183 // Keep track of the best assembly index found in any of this assembly
184 // state's children and the number of states searched, including this one.
185 let best_child_index = AtomicUsize::from(state.index());
186 let states_searched = AtomicUsize::from(1);
187
188 // Define a closure that handles recursing to a new assembly state based on
189 // the given match.
190 let recurse_on_match = |i: usize, match_ix: usize| {
191 let (h1, h2) = matches.match_fragments(match_ix);
192
193 if let Some(fragments) = fragments(mol, &intermediate_frags, h1, h2) {
194 // If using depth-one parallelism, all descendant states should be
195 // computed serially.
196 let new_parallel = if parallel_mode == ParallelMode::DepthOne {
197 ParallelMode::None
198 } else {
199 parallel_mode
200 };
201
202 // Recurse using the remaining matches and updated fragments.
203 let (child_index, child_states_searched) = recurse_index_search(
204 mol,
205 matches,
206 &state.update(fragments, i, match_ix, h1.len()),
207 best_index.clone(),
208 bounds,
209 &mut cache.clone(),
210 new_parallel,
211 );
212
213 // Update the best assembly indices (across children states and the
214 // entire search) and the number of descendant states searched.
215 best_child_index.fetch_min(child_index, Relaxed);
216 best_index.fetch_min(best_child_index.load(Relaxed), Relaxed);
217 states_searched.fetch_add(child_states_searched, Relaxed);
218 }
219 };
220
221 // Use the iterator type corresponding to the specified parallelism mode.
222 if parallel_mode == ParallelMode::None {
223 matches_to_remove
224 .iter()
225 .enumerate()
226 .for_each(|(i, match_ix)| recurse_on_match(i, *match_ix));
227 } else {
228 matches_to_remove
229 .iter()
230 .enumerate()
231 .par_bridge()
232 .for_each(|(i, match_ix)| recurse_on_match(i, *match_ix));
233 }
234
235 (
236 best_child_index.load(Relaxed),
237 states_searched.load(Relaxed),
238 )
239}
240
241/// Compute a molecule's assembly index and related information using a
242/// top-down recursive algorithm, parameterized by the specified options.
243///
244/// If `timeout` is `None`, run until the assembly index is found. Otherwise,
245/// stop after `timeout` milliseconds and return the best upper bound on the
246/// assembly index found so far.
247///
248/// See [`CanonizeMode`], [`ParallelMode`], [`KernelMode`], and [`Bound`] for
249/// details on how to customize the algorithm. Notably, bounds are applied in
250/// the order they appear in the `bounds` slice. It is generally better to
251/// provide bounds that are quick to compute first.
252///
253/// The results returned are:
254/// - The molecule's `u32` assembly index (or an upper bound if timed out).
255/// - The molecule's `u32` number of edge-disjoint isomorphic subgraph pairs.
256/// - The `usize` total number of assembly [`State`]s searched if search
257/// completes, and `None` otherwise (i.e., if search timed out).
258///
259/// # Example
260/// ```
261/// # use std::{fs, path::PathBuf};
262/// use assembly_theory::{
263/// assembly::{index_search, ParallelMode},
264/// bounds::Bound,
265/// canonize::CanonizeMode,
266/// kernels::KernelMode,
267/// loader::parse_molfile_str,
268/// memoize::MemoizeMode,
269/// };
270///
271/// # fn main() -> Result<(), std::io::Error> {
272/// // Load a molecule from a .mol file.
273/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
274/// let molfile = fs::read_to_string(path)?;
275/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
276///
277/// // Compute the molecule's assembly index without parallelism, memoization,
278/// // kernelization, or bounds.
279/// let (slow_index, _, _) = index_search(
280/// &anthracene,
281/// None,
282/// CanonizeMode::TreeNauty,
283/// ParallelMode::None,
284/// MemoizeMode::None,
285/// KernelMode::None,
286/// &[],
287/// );
288///
289/// // Compute the molecule's assembly index with parallelism, memoization, and
290/// // some bounds.
291/// let (fast_index, _, _) = index_search(
292/// &anthracene,
293/// None,
294/// CanonizeMode::TreeNauty,
295/// ParallelMode::DepthOne,
296/// MemoizeMode::CanonIndex,
297/// KernelMode::None,
298/// &[Bound::Log, Bound::Int],
299/// );
300///
301/// // Limit search to 1 ms, which should time out.
302/// let (index_bound, _, states_searched) = index_search(
303/// &anthracene,
304/// Some(1),
305/// CanonizeMode::TreeNauty,
306/// ParallelMode::None,
307/// MemoizeMode::None,
308/// KernelMode::None,
309/// &[],
310/// );
311///
312/// assert_eq!(slow_index, 6);
313/// assert_eq!(fast_index, 6);
314/// assert!(index_bound >= fast_index && states_searched == None);
315/// # Ok(())
316/// # }
317/// ```
318pub fn index_search(
319 mol: &Molecule,
320 timeout: Option<u64>,
321 canonize_mode: CanonizeMode,
322 parallel_mode: ParallelMode,
323 memoize_mode: MemoizeMode,
324 kernel_mode: KernelMode,
325 bounds: &[Bound],
326) -> (u32, u32, Option<usize>) {
327 // Catch not-yet-implemented modes.
328 if kernel_mode != KernelMode::None {
329 panic!("The chosen --kernel mode is not implemented yet!")
330 }
331
332 // Create the initial assembly state and memoization cache.
333 let state = State::new(mol);
334 let mut cache = Cache::new(memoize_mode, canonize_mode);
335
336 // Enumerate matches (i.e., pairs of edge-disjoint isomorphic fragments).
337 let matches = Matches::new(mol, canonize_mode);
338
339 // Use an `Arc` to track the best assembly index across parallel threads.
340 let best_index = Arc::new(AtomicUsize::from(mol.graph().edge_count() - 1));
341
342 // Search for the shortest assembly pathway recursively.
343 if let Some(timeout) = timeout {
344 // If a timeout is provided, we will search within an asynchronous task
345 // that can be interrupted after the specified duration (see below). To
346 // avoid subsequent scope issues, make copies of various variables.
347 let best_index_copy = best_index.clone();
348 let mol = mol.clone();
349 let bounds = bounds.to_vec();
350 let num_matches = matches.len();
351
352 // Search within a dedicated asynchronous runtime.
353 let rt = Runtime::new().unwrap();
354 let result = rt.block_on(async {
355 let (send, recv) = oneshot::channel();
356 rayon::spawn(move || {
357 let _ = send.send(recurse_index_search(
358 &mol,
359 &matches,
360 &state,
361 best_index_copy,
362 &bounds,
363 &mut cache,
364 parallel_mode,
365 ));
366 });
367 tktimeout(Duration::from_millis(timeout), recv).await
368 });
369
370 // If the search completes before the timeout, return the true assembly
371 // index. Otherwise, return the best upper bound on the assembly index
372 // found before timing out.
373 let (index, states_searched) = match result {
374 Ok(Ok((index, states_searched))) => (index, Some(states_searched)),
375 Err(_) => (best_index.load(Relaxed), None),
376 _ => panic!("An unexpected error occurred in async index_search"),
377 };
378 (index as u32, num_matches as u32, states_searched)
379 } else {
380 // Otherwise, if no timeout is provided, run the search normally.
381 let (index, states_searched) = recurse_index_search(
382 mol,
383 &matches,
384 &state,
385 best_index,
386 bounds,
387 &mut cache,
388 parallel_mode,
389 );
390 (index as u32, matches.len() as u32, Some(states_searched))
391 }
392}
393
394/// Compute a molecule's assembly index using an efficient default strategy.
395///
396/// To customize assembly index calculation beyond the default strategy, see
397/// [`index_search`].
398///
399/// # Example
400/// ```
401/// # use std::{fs, path::PathBuf};
402/// use assembly_theory::assembly::index;
403/// use assembly_theory::loader::parse_molfile_str;
404///
405/// # fn main() -> Result<(), std::io::Error> {
406/// // Load a molecule from a .mol file.
407/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
408/// let molfile = fs::read_to_string(path)?;
409/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
410///
411/// // Compute the molecule's assembly index.
412/// assert_eq!(index(&anthracene), 6);
413/// # Ok(())
414/// # }
415/// ```
416pub fn index(mol: &Molecule) -> u32 {
417 index_search(
418 mol,
419 None,
420 CanonizeMode::TreeNauty,
421 ParallelMode::DepthOne,
422 MemoizeMode::CanonIndex,
423 KernelMode::None,
424 &[Bound::Int, Bound::MatchableEdges],
425 )
426 .0
427}