assembly_theory/assembly.rs
1//! Compute assembly indices of molecules.
2//!
3//! # Example
4//! ```
5//! # use std::{fs, path::PathBuf};
6//! use assembly_theory::assembly::index;
7//! use assembly_theory::loader::parse_molfile_str;
8//!
9//! # fn main() -> Result<(), std::io::Error> {
10//! // Load a molecule from a .mol file.
11//! let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
12//! let molfile = fs::read_to_string(path)?;
13//! let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
14//!
15//! // Compute the molecule's assembly index.
16//! assert_eq!(index(&anthracene), 6);
17//! # Ok(())
18//! # }
19//! ```
20
21use std::sync::{
22 atomic::{AtomicUsize, Ordering::Relaxed},
23 Arc,
24};
25
26use bit_set::BitSet;
27use clap::ValueEnum;
28use rayon::iter::{ParallelBridge, ParallelIterator};
29
30use crate::{
31 bounds::{state_bounds, Bound},
32 canonize::CanonizeMode,
33 kernels::KernelMode,
34 matches::Matches,
35 memoize::{Cache, MemoizeMode},
36 molecule::Molecule,
37 state::State,
38 utils::connected_components_under_edges,
39};
40
41/// Parallelization strategy for the recursive search phase.
42#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
43pub enum ParallelMode {
44 /// No parallelism.
45 None,
46 /// Create a task pool from the recursion's first level only.
47 DepthOne,
48 /// Spawn a new thread at every recursive call.
49 Always,
50}
51
52/// Compute assembly depth; see
53/// [Pagel et al. (2024)](https://arxiv.org/abs/2409.05993).
54///
55/// Note: This function is currently very (unusably) slow. It primarily exists
56/// in this crate as a proof of concept.
57///
58/// # Example
59/// ```
60/// # use std::{fs, path::PathBuf};
61/// use assembly_theory::assembly::depth;
62/// use assembly_theory::loader::parse_molfile_str;
63///
64/// # fn main() -> Result<(), std::io::Error> {
65/// // Load a molecule from a .mol file.
66/// let path = PathBuf::from(format!("./data/checks/benzene.mol"));
67/// let molfile = fs::read_to_string(path)?;
68/// let benzene = parse_molfile_str(&molfile).expect("Parsing failure.");
69///
70/// // Compute the molecule's assembly depth.
71/// assert_eq!(depth(&benzene), 3);
72/// # Ok(())
73/// # }
74/// ```
75pub fn depth(mol: &Molecule) -> u32 {
76 let mut ix = u32::MAX;
77 for (left, right) in mol.partitions().unwrap() {
78 let l = if left.is_basic_unit() {
79 0
80 } else {
81 depth(&left)
82 };
83
84 let r = if right.is_basic_unit() {
85 0
86 } else {
87 depth(&right)
88 };
89
90 ix = ix.min(l.max(r) + 1)
91 }
92 ix
93}
94
95/// Determine the fragments produced from the given assembly state by removing
96/// the given pair of edge-disjoint, isomorphic subgraphs and then adding one
97/// back; return `None` if not possible.
98fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Option<Vec<BitSet>> {
99 // Attempt to find fragments f1 and f2 containing h1 and h2, respectively;
100 // if either do not exist, exit without further fragmentation.
101 let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c));
102 let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c));
103 let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else {
104 return None;
105 };
106
107 let mut fragments = state.to_owned();
108
109 // If the same fragment f1 (== f2) contains both h1 and h2, replace this
110 // one fragment f1 with all connected components comprising f1 - (h1 U h2).
111 // Otherwise, replace fragments f1 and f2 with all connected components
112 // comprising f1 - h1 and f2 - h2, respectively.
113 if i1 == i2 {
114 let mut union = h1.clone();
115 union.union_with(h2);
116 let mut difference = f1.clone();
117 difference.difference_with(&union);
118 let c = connected_components_under_edges(mol.graph(), &difference);
119 fragments.extend(c);
120 fragments.swap_remove(i1);
121 } else {
122 let mut diff1 = f1.clone();
123 diff1.difference_with(h1);
124 let c1 = connected_components_under_edges(mol.graph(), &diff1);
125 fragments.extend(c1);
126
127 let mut diff2 = f2.clone();
128 diff2.difference_with(h2);
129 let c2 = connected_components_under_edges(mol.graph(), &diff2);
130 fragments.extend(c2);
131
132 fragments.swap_remove(i1.max(i2));
133 fragments.swap_remove(i1.min(i2));
134 }
135
136 // Drop any singleton fragments, add h1 as a fragment, and return.
137 fragments.retain(|i| i.len() > 1);
138 fragments.push(h1.clone());
139 Some(fragments)
140}
141
142/// Recursive helper for [`index_search`], only public for benchmarking.
143///
144/// Inputs:
145/// - `mol`: The molecule whose assembly index is being calculated.
146/// - `matches`: Structural information about the molecule's matched fragments.
147/// - `state`: The current assembly state.
148/// - `best_index`: The smallest assembly index for all assembly states so far.
149/// - `bounds`: The list of bounding strategies to apply.
150/// - `cache`: Memoization cache storing previously searched assembly states.
151/// - `parallel_mode`: The parallelism mode for this state's match iteration.
152///
153/// Returns, from this assembly state and any of its descendents:
154/// - `usize`: An updated upper bound on the assembly index. (Note: If this
155/// state is pruned by bounds or deemed redundant by memoization, then the
156/// upper bound returned is unchanged.)
157/// - `usize`: The number of assembly states searched.
158pub fn recurse_index_search(
159 mol: &Molecule,
160 matches: &Matches,
161 state: &State,
162 best_index: Arc<AtomicUsize>,
163 bounds: &[Bound],
164 cache: &mut Cache,
165 parallel_mode: ParallelMode,
166) -> (usize, usize) {
167 // If any bounds would prune this assembly state or if memoization is
168 // enabled and this assembly state is preempted by the cached state, halt.
169 if state_bounds(mol, state, best_index.load(Relaxed), bounds) || cache.memoize_state(mol, state)
170 {
171 return (state.index(), 1);
172 }
173
174 // Generate a list of matches (i.e., pairs of edge-disjoint, isomorphic
175 // fragments) to remove from this state.
176 let (intermediate_frags, matches_to_remove): (Vec<BitSet>, Vec<usize>) =
177 matches.matches_to_remove(mol, state, best_index.load(Relaxed), bounds);
178
179 // Keep track of the best assembly index found in any of this assembly
180 // state's children and the number of states searched, including this one.
181 let best_child_index = AtomicUsize::from(state.index());
182 let states_searched = AtomicUsize::from(1);
183
184 // Define a closure that handles recursing to a new assembly state based on
185 // the given match.
186 let recurse_on_match = |i: usize, match_ix: usize| {
187 let (h1, h2) = matches.match_fragments(match_ix);
188
189 if let Some(fragments) = fragments(mol, &intermediate_frags, h1, h2) {
190 // If using depth-one parallelism, all descendant states should be
191 // computed serially.
192 let new_parallel = if parallel_mode == ParallelMode::DepthOne {
193 ParallelMode::None
194 } else {
195 parallel_mode
196 };
197
198 // Recurse using the remaining matches and updated fragments.
199 let (child_index, child_states_searched) = recurse_index_search(
200 mol,
201 matches,
202 &state.update(fragments, i, match_ix, h1.len()),
203 best_index.clone(),
204 bounds,
205 &mut cache.clone(),
206 new_parallel,
207 );
208
209 // Update the best assembly indices (across children states and
210 // the entire search) and the number of descendant states searched.
211 best_child_index.fetch_min(child_index, Relaxed);
212 best_index.fetch_min(best_child_index.load(Relaxed), Relaxed);
213 states_searched.fetch_add(child_states_searched, Relaxed);
214 }
215 };
216
217 // Use the iterator type corresponding to the specified parallelism mode.
218 if parallel_mode == ParallelMode::None {
219 matches_to_remove
220 .iter()
221 .enumerate()
222 .for_each(|(i, match_ix)| recurse_on_match(i, *match_ix));
223 } else {
224 matches_to_remove
225 .iter()
226 .enumerate()
227 .par_bridge()
228 .for_each(|(i, match_ix)| recurse_on_match(i, *match_ix));
229 }
230
231 (
232 best_child_index.load(Relaxed),
233 states_searched.load(Relaxed),
234 )
235}
236
237/// Compute a molecule's assembly index and related information using a
238/// top-down recursive algorithm, parameterized by the specified options.
239///
240/// See [`CanonizeMode`], [`ParallelMode`], [`KernelMode`], and [`Bound`] for
241/// details on how to customize the algorithm. Notably, bounds are applied in
242/// the order they appear in the `bounds` slice. It is generally better to
243/// provide bounds that are quick to compute first.
244///
245/// The results returned are:
246/// - The molecule's `u32` assembly index.
247/// - The molecule's `u32` number of edge-disjoint isomorphic subgraph pairs.
248/// - The `usize` total number of assembly states searched, where an assembly
249/// state is a collection of fragments. Note that, depending on the algorithm
250/// parameters used, some states may be searched/counted multiple times.
251///
252/// # Example
253/// ```
254/// # use std::{fs, path::PathBuf};
255/// use assembly_theory::{
256/// assembly::{index_search, ParallelMode},
257/// bounds::Bound,
258/// canonize::CanonizeMode,
259/// kernels::KernelMode,
260/// loader::parse_molfile_str,
261/// memoize::MemoizeMode,
262/// };
263///
264/// # fn main() -> Result<(), std::io::Error> {
265/// // Load a molecule from a .mol file.
266/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
267/// let molfile = fs::read_to_string(path)?;
268/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
269///
270/// // Compute the molecule's assembly index without parallelism, memoization,
271/// // kernelization, or bounds.
272/// let (slow_index, _, _) = index_search(
273/// &anthracene,
274/// CanonizeMode::TreeNauty,
275/// ParallelMode::None,
276/// MemoizeMode::None,
277/// KernelMode::None,
278/// &[],
279/// );
280///
281/// // Compute the molecule's assembly index with parallelism, memoization, and
282/// // some bounds.
283/// let (fast_index, _, _) = index_search(
284/// &anthracene,
285/// CanonizeMode::TreeNauty,
286/// ParallelMode::DepthOne,
287/// MemoizeMode::CanonIndex,
288/// KernelMode::None,
289/// &[Bound::Log, Bound::Int],
290/// );
291///
292/// assert_eq!(slow_index, 6);
293/// assert_eq!(fast_index, 6);
294/// # Ok(())
295/// # }
296/// ```
297pub fn index_search(
298 mol: &Molecule,
299 canonize_mode: CanonizeMode,
300 parallel_mode: ParallelMode,
301 memoize_mode: MemoizeMode,
302 kernel_mode: KernelMode,
303 bounds: &[Bound],
304) -> (u32, u32, usize) {
305 // Catch not-yet-implemented modes.
306 if kernel_mode != KernelMode::None {
307 panic!("The chosen --kernel mode is not implemented yet!")
308 }
309
310 // Enumerate matches (i.e., pairs of edge-disjoint isomorphic fragments).
311 let matches = Matches::new(mol, canonize_mode);
312
313 // Create memoization cache.
314 let mut cache = Cache::new(memoize_mode, canonize_mode);
315
316 // Create the initial assembly state.
317 let state = State::new(mol);
318
319 // Search for the shortest assembly pathway recursively.
320 let edge_count = mol.graph().edge_count();
321 let best_index = Arc::new(AtomicUsize::from(edge_count - 1));
322 let (index, states_searched) = recurse_index_search(
323 mol,
324 &matches,
325 &state,
326 best_index,
327 bounds,
328 &mut cache,
329 parallel_mode,
330 );
331
332 (index as u32, matches.len() as u32, states_searched)
333}
334
335/// Compute a molecule's assembly index using an efficient default strategy.
336///
337/// To customize assembly index calculation beyond the default strategy, see
338/// [`index_search`].
339///
340/// # Example
341/// ```
342/// # use std::{fs, path::PathBuf};
343/// use assembly_theory::assembly::index;
344/// use assembly_theory::loader::parse_molfile_str;
345///
346/// # fn main() -> Result<(), std::io::Error> {
347/// // Load a molecule from a .mol file.
348/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
349/// let molfile = fs::read_to_string(path)?;
350/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
351///
352/// // Compute the molecule's assembly index.
353/// assert_eq!(index(&anthracene), 6);
354/// # Ok(())
355/// # }
356/// ```
357pub fn index(mol: &Molecule) -> u32 {
358 index_search(
359 mol,
360 CanonizeMode::TreeNauty,
361 ParallelMode::DepthOne,
362 MemoizeMode::CanonIndex,
363 KernelMode::None,
364 &[Bound::Int, Bound::MatchableEdges],
365 )
366 .0
367}