Skip to main content

assembly_theory/
assembly.rs

1//! Compute assembly indices of molecules.
2//!
3//! # Example
4//! ```
5//! # use std::{fs, path::PathBuf};
6//! use assembly_theory::assembly::index;
7//! use assembly_theory::loader::parse_molfile_str;
8//!
9//! # fn main() -> Result<(), std::io::Error> {
10//! // Load a molecule from a .mol file.
11//! let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
12//! let molfile = fs::read_to_string(path)?;
13//! let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
14//!
15//! // Compute the molecule's assembly index.
16//! assert_eq!(index(&anthracene), 6);
17//! # Ok(())
18//! # }
19//! ```
20
21use std::{
22    sync::{
23        atomic::{AtomicUsize, Ordering::Relaxed},
24        Arc,
25    },
26    time::Duration,
27};
28
29use bit_set::BitSet;
30use clap::ValueEnum;
31use rayon::iter::{ParallelBridge, ParallelIterator};
32use tokio::{runtime::Runtime, sync::oneshot, time::timeout as tktimeout};
33
34use crate::{
35    bounds::{state_bounds, Bound},
36    canonize::CanonizeMode,
37    kernels::KernelMode,
38    matches::Matches,
39    memoize::{Cache, MemoizeMode},
40    molecule::Molecule,
41    state::State,
42    utils::connected_components_under_edges,
43};
44
45/// Parallelization strategy for the recursive search phase.
46#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
47pub enum ParallelMode {
48    /// No parallelism.
49    None,
50    /// Create a task pool from the recursion's first level only.
51    DepthOne,
52    /// Spawn a new thread at every recursive call.
53    Always,
54}
55
56/// Compute assembly depth; see
57/// [Pagel et al. (2024)](https://arxiv.org/abs/2409.05993).
58///
59/// Note: This function is currently very (unusably) slow. It primarily exists
60/// in this crate as a proof of concept.
61///
62/// # Example
63/// ```
64/// # use std::{fs, path::PathBuf};
65/// use assembly_theory::assembly::depth;
66/// use assembly_theory::loader::parse_molfile_str;
67///
68/// # fn main() -> Result<(), std::io::Error> {
69/// // Load a molecule from a .mol file.
70/// let path = PathBuf::from(format!("./data/checks/benzene.mol"));
71/// let molfile = fs::read_to_string(path)?;
72/// let benzene = parse_molfile_str(&molfile).expect("Parsing failure.");
73///
74/// // Compute the molecule's assembly depth.
75/// assert_eq!(depth(&benzene), 3);
76/// # Ok(())
77/// # }
78/// ```
79pub fn depth(mol: &Molecule) -> u32 {
80    let mut ix = u32::MAX;
81    for (left, right) in mol.partitions().unwrap() {
82        let l = if left.is_basic_unit() {
83            0
84        } else {
85            depth(&left)
86        };
87
88        let r = if right.is_basic_unit() {
89            0
90        } else {
91            depth(&right)
92        };
93
94        ix = ix.min(l.max(r) + 1)
95    }
96    ix
97}
98
99/// Determine the fragments produced from the given assembly state by removing
100/// the given pair of edge-disjoint, isomorphic subgraphs and then adding one
101/// back; return `None` if not possible.
102fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Option<Vec<BitSet>> {
103    // Attempt to find fragments f1 and f2 containing h1 and h2, respectively;
104    // if either do not exist, exit without further fragmentation.
105    let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c));
106    let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c));
107    let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else {
108        return None;
109    };
110
111    let mut fragments = state.to_owned();
112
113    // If the same fragment f1 (== f2) contains both h1 and h2, replace this
114    // one fragment f1 with all connected components comprising f1 - (h1 U h2).
115    // Otherwise, replace fragments f1 and f2 with all connected components
116    // comprising f1 - h1 and f2 - h2, respectively.
117    if i1 == i2 {
118        let mut union = h1.clone();
119        union.union_with(h2);
120        let mut difference = f1.clone();
121        difference.difference_with(&union);
122        let c = connected_components_under_edges(mol.graph(), &difference);
123        fragments.extend(c);
124        fragments.swap_remove(i1);
125    } else {
126        let mut diff1 = f1.clone();
127        diff1.difference_with(h1);
128        let c1 = connected_components_under_edges(mol.graph(), &diff1);
129        fragments.extend(c1);
130
131        let mut diff2 = f2.clone();
132        diff2.difference_with(h2);
133        let c2 = connected_components_under_edges(mol.graph(), &diff2);
134        fragments.extend(c2);
135
136        fragments.swap_remove(i1.max(i2));
137        fragments.swap_remove(i1.min(i2));
138    }
139
140    // Drop any singleton fragments, add h1 as a fragment, and return.
141    fragments.retain(|i| i.len() > 1);
142    fragments.push(h1.clone());
143    Some(fragments)
144}
145
146/// Recursive helper for [`index_search`], only public for benchmarking.
147///
148/// Inputs:
149/// - `mol`: The molecule whose assembly index is being calculated.
150/// - `matches`: Structural information about the molecule's matched fragments.
151/// - `state`: The current assembly state.
152/// - `best_index`: The smallest assembly index for all assembly states so far.
153/// - `bounds`: The list of bounding strategies to apply.
154/// - `cache`: Memoization cache storing previously searched assembly states.
155/// - `parallel_mode`: The parallelism mode for this state's match iteration.
156///
157/// Returns, from this assembly state and any of its descendents:
158/// - `usize`: An updated upper bound on the assembly index. (Note: If this
159///   state is pruned by bounds or deemed redundant by memoization, then the
160///   upper bound returned is unchanged.)
161/// - `usize`: The number of assembly states searched.
162pub fn recurse_index_search(
163    mol: &Molecule,
164    matches: &Matches,
165    state: &State,
166    best_index: Arc<AtomicUsize>,
167    bounds: &[Bound],
168    cache: &mut Cache,
169    parallel_mode: ParallelMode,
170) -> (usize, usize) {
171    // If any bounds would prune this assembly state or if memoization is
172    // enabled and this assembly state is preempted by the cached state, halt.
173    if state_bounds(mol, state, best_index.load(Relaxed), bounds) || cache.memoize_state(mol, state)
174    {
175        return (state.index(), 1);
176    }
177
178    // Generate a list of matches (i.e., pairs of edge-disjoint, isomorphic
179    // fragments) to remove from this state.
180    let (intermediate_frags, matches_to_remove): (Vec<BitSet>, Vec<usize>) =
181        matches.matches_to_remove(mol, state, best_index.load(Relaxed), bounds);
182
183    // Keep track of the best assembly index found in any of this assembly
184    // state's children and the number of states searched, including this one.
185    let best_child_index = AtomicUsize::from(state.index());
186    let states_searched = AtomicUsize::from(1);
187
188    // Define a closure that handles recursing to a new assembly state based on
189    // the given match.
190    let recurse_on_match = |i: usize, match_ix: usize| {
191        let (h1, h2) = matches.match_fragments(match_ix);
192
193        if let Some(fragments) = fragments(mol, &intermediate_frags, h1, h2) {
194            // If using depth-one parallelism, all descendant states should be
195            // computed serially.
196            let new_parallel = if parallel_mode == ParallelMode::DepthOne {
197                ParallelMode::None
198            } else {
199                parallel_mode
200            };
201
202            // Recurse using the remaining matches and updated fragments.
203            let (child_index, child_states_searched) = recurse_index_search(
204                mol,
205                matches,
206                &state.update(fragments, i, match_ix, h1.len()),
207                best_index.clone(),
208                bounds,
209                &mut cache.clone(),
210                new_parallel,
211            );
212
213            // Update the best assembly indices (across children states and the
214            // entire search) and the number of descendant states searched.
215            best_child_index.fetch_min(child_index, Relaxed);
216            best_index.fetch_min(best_child_index.load(Relaxed), Relaxed);
217            states_searched.fetch_add(child_states_searched, Relaxed);
218        }
219    };
220
221    // Use the iterator type corresponding to the specified parallelism mode.
222    if parallel_mode == ParallelMode::None {
223        matches_to_remove
224            .iter()
225            .enumerate()
226            .for_each(|(i, match_ix)| recurse_on_match(i, *match_ix));
227    } else {
228        matches_to_remove
229            .iter()
230            .enumerate()
231            .par_bridge()
232            .for_each(|(i, match_ix)| recurse_on_match(i, *match_ix));
233    }
234
235    (
236        best_child_index.load(Relaxed),
237        states_searched.load(Relaxed),
238    )
239}
240
241/// Compute a molecule's assembly index and related information using a
242/// top-down recursive algorithm, parameterized by the specified options.
243///
244/// If `timeout` is `None`, run until the assembly index is found. Otherwise,
245/// stop after `timeout` milliseconds and return the best upper bound on the
246/// assembly index found so far.
247///
248/// See [`CanonizeMode`], [`ParallelMode`], [`KernelMode`], and [`Bound`] for
249/// details on how to customize the algorithm. Notably, bounds are applied in
250/// the order they appear in the `bounds` slice. It is generally better to
251/// provide bounds that are quick to compute first.
252///
253/// The results returned are:
254/// - The molecule's `u32` assembly index (or an upper bound if timed out).
255/// - The molecule's `u32` number of edge-disjoint isomorphic subgraph pairs.
256/// - The `usize` total number of assembly [`State`]s searched if search
257///   completes, and `None` otherwise (i.e., if search timed out).
258///
259/// # Example
260/// ```
261/// # use std::{fs, path::PathBuf};
262/// use assembly_theory::{
263///     assembly::{index_search, ParallelMode},
264///     bounds::Bound,
265///     canonize::CanonizeMode,
266///     kernels::KernelMode,
267///     loader::parse_molfile_str,
268///     memoize::MemoizeMode,
269/// };
270///
271/// # fn main() -> Result<(), std::io::Error> {
272/// // Load a molecule from a .mol file.
273/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
274/// let molfile = fs::read_to_string(path)?;
275/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
276///
277/// // Compute the molecule's assembly index without parallelism, memoization,
278/// // kernelization, or bounds.
279/// let (slow_index, _, _) = index_search(
280///     &anthracene,
281///     None,
282///     CanonizeMode::TreeNauty,
283///     ParallelMode::None,
284///     MemoizeMode::None,
285///     KernelMode::None,
286///     &[],
287/// );
288///
289/// // Compute the molecule's assembly index with parallelism, memoization, and
290/// // some bounds.
291/// let (fast_index, _, _) = index_search(
292///     &anthracene,
293///     None,
294///     CanonizeMode::TreeNauty,
295///     ParallelMode::DepthOne,
296///     MemoizeMode::CanonIndex,
297///     KernelMode::None,
298///     &[Bound::Log, Bound::Int],
299/// );
300///
301/// // Limit search to 1 ms, which should time out.
302/// let (index_bound, _, states_searched) = index_search(
303///     &anthracene,
304///     Some(1),
305///     CanonizeMode::TreeNauty,
306///     ParallelMode::None,
307///     MemoizeMode::None,
308///     KernelMode::None,
309///     &[],
310/// );
311///
312/// assert_eq!(slow_index, 6);
313/// assert_eq!(fast_index, 6);
314/// assert!(index_bound >= fast_index && states_searched == None);
315/// # Ok(())
316/// # }
317/// ```
318pub fn index_search(
319    mol: &Molecule,
320    timeout: Option<u64>,
321    canonize_mode: CanonizeMode,
322    parallel_mode: ParallelMode,
323    memoize_mode: MemoizeMode,
324    kernel_mode: KernelMode,
325    bounds: &[Bound],
326) -> (u32, u32, Option<usize>) {
327    // Catch not-yet-implemented modes.
328    if kernel_mode != KernelMode::None {
329        panic!("The chosen --kernel mode is not implemented yet!")
330    }
331
332    // Create the initial assembly state and memoization cache.
333    let state = State::new(mol);
334    let mut cache = Cache::new(memoize_mode, canonize_mode);
335
336    // Enumerate matches (i.e., pairs of edge-disjoint isomorphic fragments).
337    let matches = Matches::new(mol, canonize_mode);
338
339    // Use an `Arc` to track the best assembly index across parallel threads.
340    let best_index = Arc::new(AtomicUsize::from(mol.graph().edge_count() - 1));
341
342    // Search for the shortest assembly pathway recursively.
343    if let Some(timeout) = timeout {
344        // If a timeout is provided, we will search within an asynchronous task
345        // that can be interrupted after the specified duration (see below). To
346        // avoid subsequent scope issues, make copies of various variables.
347        let best_index_copy = best_index.clone();
348        let mol = mol.clone();
349        let bounds = bounds.to_vec();
350        let num_matches = matches.len();
351
352        // Search within a dedicated asynchronous runtime.
353        let rt = Runtime::new().unwrap();
354        let result = rt.block_on(async {
355            let (send, recv) = oneshot::channel();
356            rayon::spawn(move || {
357                let _ = send.send(recurse_index_search(
358                    &mol,
359                    &matches,
360                    &state,
361                    best_index_copy,
362                    &bounds,
363                    &mut cache,
364                    parallel_mode,
365                ));
366            });
367            tktimeout(Duration::from_millis(timeout), recv).await
368        });
369
370        // If the search completes before the timeout, return the true assembly
371        // index. Otherwise, return the best upper bound on the assembly index
372        // found before timing out.
373        let (index, states_searched) = match result {
374            Ok(Ok((index, states_searched))) => (index, Some(states_searched)),
375            Err(_) => (best_index.load(Relaxed), None),
376            _ => panic!("An unexpected error occurred in async index_search"),
377        };
378        (index as u32, num_matches as u32, states_searched)
379    } else {
380        // Otherwise, if no timeout is provided, run the search normally.
381        let (index, states_searched) = recurse_index_search(
382            mol,
383            &matches,
384            &state,
385            best_index,
386            bounds,
387            &mut cache,
388            parallel_mode,
389        );
390        (index as u32, matches.len() as u32, Some(states_searched))
391    }
392}
393
394/// Compute a molecule's assembly index using an efficient default strategy.
395///
396/// To customize assembly index calculation beyond the default strategy, see
397/// [`index_search`].
398///
399/// # Example
400/// ```
401/// # use std::{fs, path::PathBuf};
402/// use assembly_theory::assembly::index;
403/// use assembly_theory::loader::parse_molfile_str;
404///
405/// # fn main() -> Result<(), std::io::Error> {
406/// // Load a molecule from a .mol file.
407/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
408/// let molfile = fs::read_to_string(path)?;
409/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
410///
411/// // Compute the molecule's assembly index.
412/// assert_eq!(index(&anthracene), 6);
413/// # Ok(())
414/// # }
415/// ```
416pub fn index(mol: &Molecule) -> u32 {
417    index_search(
418        mol,
419        None,
420        CanonizeMode::TreeNauty,
421        ParallelMode::DepthOne,
422        MemoizeMode::CanonIndex,
423        KernelMode::None,
424        &[Bound::Int, Bound::MatchableEdges],
425    )
426    .0
427}