assembly_theory/
assembly.rs

1//! Compute assembly indices of molecules.
2//!
3//! # Example
4//! ```
5//! # use std::{fs, path::PathBuf};
6//! use assembly_theory::assembly::index;
7//! use assembly_theory::loader::parse_molfile_str;
8//!
9//! # fn main() -> Result<(), std::io::Error> {
10//! // Load a molecule from a .mol file.
11//! let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
12//! let molfile = fs::read_to_string(path)?;
13//! let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
14//!
15//! // Compute the molecule's assembly index.
16//! assert_eq!(index(&anthracene), 6);
17//! # Ok(())
18//! # }
19//! ```
20
21use std::sync::{
22    atomic::{AtomicUsize, Ordering::Relaxed},
23    Arc,
24};
25
26use bit_set::BitSet;
27use clap::ValueEnum;
28use rayon::iter::{ParallelBridge, ParallelIterator};
29
30use crate::{
31    bounds::{state_bounds, Bound},
32    canonize::CanonizeMode,
33    kernels::KernelMode,
34    matches::Matches,
35    memoize::{Cache, MemoizeMode},
36    molecule::Molecule,
37    state::State,
38    utils::connected_components_under_edges,
39};
40
41/// Parallelization strategy for the recursive search phase.
42#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
43pub enum ParallelMode {
44    /// No parallelism.
45    None,
46    /// Create a task pool from the recursion's first level only.
47    DepthOne,
48    /// Spawn a new thread at every recursive call.
49    Always,
50}
51
52/// Compute assembly depth; see
53/// [Pagel et al. (2024)](https://arxiv.org/abs/2409.05993).
54///
55/// Note: This function is currently very (unusably) slow. It primarily exists
56/// in this crate as a proof of concept.
57///
58/// # Example
59/// ```
60/// # use std::{fs, path::PathBuf};
61/// use assembly_theory::assembly::depth;
62/// use assembly_theory::loader::parse_molfile_str;
63///
64/// # fn main() -> Result<(), std::io::Error> {
65/// // Load a molecule from a .mol file.
66/// let path = PathBuf::from(format!("./data/checks/benzene.mol"));
67/// let molfile = fs::read_to_string(path)?;
68/// let benzene = parse_molfile_str(&molfile).expect("Parsing failure.");
69///
70/// // Compute the molecule's assembly depth.
71/// assert_eq!(depth(&benzene), 3);
72/// # Ok(())
73/// # }
74/// ```
75pub fn depth(mol: &Molecule) -> u32 {
76    let mut ix = u32::MAX;
77    for (left, right) in mol.partitions().unwrap() {
78        let l = if left.is_basic_unit() {
79            0
80        } else {
81            depth(&left)
82        };
83
84        let r = if right.is_basic_unit() {
85            0
86        } else {
87            depth(&right)
88        };
89
90        ix = ix.min(l.max(r) + 1)
91    }
92    ix
93}
94
95/// Determine the fragments produced from the given assembly state by removing
96/// the given pair of edge-disjoint, isomorphic subgraphs and then adding one
97/// back; return `None` if not possible.
98fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Option<Vec<BitSet>> {
99    // Attempt to find fragments f1 and f2 containing h1 and h2, respectively;
100    // if either do not exist, exit without further fragmentation.
101    let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c));
102    let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c));
103    let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else {
104        return None;
105    };
106
107    let mut fragments = state.to_owned();
108
109    // If the same fragment f1 (== f2) contains both h1 and h2, replace this
110    // one fragment f1 with all connected components comprising f1 - (h1 U h2).
111    // Otherwise, replace fragments f1 and f2 with all connected components
112    // comprising f1 - h1 and f2 - h2, respectively.
113    if i1 == i2 {
114        let mut union = h1.clone();
115        union.union_with(h2);
116        let mut difference = f1.clone();
117        difference.difference_with(&union);
118        let c = connected_components_under_edges(mol.graph(), &difference);
119        fragments.extend(c);
120        fragments.swap_remove(i1);
121    } else {
122        let mut diff1 = f1.clone();
123        diff1.difference_with(h1);
124        let c1 = connected_components_under_edges(mol.graph(), &diff1);
125        fragments.extend(c1);
126
127        let mut diff2 = f2.clone();
128        diff2.difference_with(h2);
129        let c2 = connected_components_under_edges(mol.graph(), &diff2);
130        fragments.extend(c2);
131
132        fragments.swap_remove(i1.max(i2));
133        fragments.swap_remove(i1.min(i2));
134    }
135
136    // Drop any singleton fragments, add h1 as a fragment, and return.
137    fragments.retain(|i| i.len() > 1);
138    fragments.push(h1.clone());
139    Some(fragments)
140}
141
142/// Recursive helper for [`index_search`], only public for benchmarking.
143///
144/// Inputs:
145/// - `mol`: The molecule whose assembly index is being calculated.
146/// - `matches`: Structural information about the molecule's matched fragments.
147/// - `state`: The current assembly state.
148/// - `best_index`: The smallest assembly index for all assembly states so far.
149/// - `bounds`: The list of bounding strategies to apply.
150/// - `cache`: Memoization cache storing previously searched assembly states.
151/// - `parallel_mode`: The parallelism mode for this state's match iteration.
152///
153/// Returns, from this assembly state and any of its descendents:
154/// - `usize`: An updated upper bound on the assembly index. (Note: If this
155///   state is pruned by bounds or deemed redundant by memoization, then the
156///   upper bound returned is unchanged.)
157/// - `usize`: The number of assembly states searched.
158pub fn recurse_index_search(
159    mol: &Molecule,
160    matches: &Matches,
161    state: &State,
162    best_index: Arc<AtomicUsize>,
163    bounds: &[Bound],
164    cache: &mut Cache,
165    parallel_mode: ParallelMode,
166) -> (usize, usize) {
167    // If any bounds would prune this assembly state or if memoization is
168    // enabled and this assembly state is preempted by the cached state, halt.
169    if state_bounds(mol, state, best_index.load(Relaxed), bounds) || cache.memoize_state(mol, state)
170    {
171        return (state.index(), 1);
172    }
173
174    // Generate a list of matches (i.e., pairs of edge-disjoint, isomorphic
175    // fragments) to remove from this state.
176    let (intermediate_frags, matches_to_remove): (Vec<BitSet>, Vec<usize>) =
177        matches.matches_to_remove(mol, state, best_index.load(Relaxed), bounds);
178
179    // Keep track of the best assembly index found in any of this assembly
180    // state's children and the number of states searched, including this one.
181    let best_child_index = AtomicUsize::from(state.index());
182    let states_searched = AtomicUsize::from(1);
183
184    // Define a closure that handles recursing to a new assembly state based on
185    // the given match.
186    let recurse_on_match = |i: usize, match_ix: usize| {
187        let (h1, h2) = matches.match_fragments(match_ix);
188
189        if let Some(fragments) = fragments(mol, &intermediate_frags, h1, h2) {
190            // If using depth-one parallelism, all descendant states should be
191            // computed serially.
192            let new_parallel = if parallel_mode == ParallelMode::DepthOne {
193                ParallelMode::None
194            } else {
195                parallel_mode
196            };
197
198            // Recurse using the remaining matches and updated fragments.
199            let (child_index, child_states_searched) = recurse_index_search(
200                mol,
201                matches,
202                &state.update(fragments, i, match_ix, h1.len()),
203                best_index.clone(),
204                bounds,
205                &mut cache.clone(),
206                new_parallel,
207            );
208
209            // Update the best assembly indices (across children states and
210            // the entire search) and the number of descendant states searched.
211            best_child_index.fetch_min(child_index, Relaxed);
212            best_index.fetch_min(best_child_index.load(Relaxed), Relaxed);
213            states_searched.fetch_add(child_states_searched, Relaxed);
214        }
215    };
216
217    // Use the iterator type corresponding to the specified parallelism mode.
218    if parallel_mode == ParallelMode::None {
219        matches_to_remove
220            .iter()
221            .enumerate()
222            .for_each(|(i, match_ix)| recurse_on_match(i, *match_ix));
223    } else {
224        matches_to_remove
225            .iter()
226            .enumerate()
227            .par_bridge()
228            .for_each(|(i, match_ix)| recurse_on_match(i, *match_ix));
229    }
230
231    (
232        best_child_index.load(Relaxed),
233        states_searched.load(Relaxed),
234    )
235}
236
237/// Compute a molecule's assembly index and related information using a
238/// top-down recursive algorithm, parameterized by the specified options.
239///
240/// See [`CanonizeMode`], [`ParallelMode`], [`KernelMode`], and [`Bound`] for
241/// details on how to customize the algorithm. Notably, bounds are applied in
242/// the order they appear in the `bounds` slice. It is generally better to
243/// provide bounds that are quick to compute first.
244///
245/// The results returned are:
246/// - The molecule's `u32` assembly index.
247/// - The molecule's `u32` number of edge-disjoint isomorphic subgraph pairs.
248/// - The `usize` total number of assembly states searched, where an assembly
249///   state is a collection of fragments. Note that, depending on the algorithm
250///   parameters used, some states may be searched/counted multiple times.
251///
252/// # Example
253/// ```
254/// # use std::{fs, path::PathBuf};
255/// use assembly_theory::{
256///     assembly::{index_search, ParallelMode},
257///     bounds::Bound,
258///     canonize::CanonizeMode,
259///     kernels::KernelMode,
260///     loader::parse_molfile_str,
261///     memoize::MemoizeMode,
262/// };
263///
264/// # fn main() -> Result<(), std::io::Error> {
265/// // Load a molecule from a .mol file.
266/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
267/// let molfile = fs::read_to_string(path)?;
268/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
269///
270/// // Compute the molecule's assembly index without parallelism, memoization,
271/// // kernelization, or bounds.
272/// let (slow_index, _, _) = index_search(
273///     &anthracene,
274///     CanonizeMode::TreeNauty,
275///     ParallelMode::None,
276///     MemoizeMode::None,
277///     KernelMode::None,
278///     &[],
279/// );
280///
281/// // Compute the molecule's assembly index with parallelism, memoization, and
282/// // some bounds.
283/// let (fast_index, _, _) = index_search(
284///     &anthracene,
285///     CanonizeMode::TreeNauty,
286///     ParallelMode::DepthOne,
287///     MemoizeMode::CanonIndex,
288///     KernelMode::None,
289///     &[Bound::Log, Bound::Int],
290/// );
291///
292/// assert_eq!(slow_index, 6);
293/// assert_eq!(fast_index, 6);
294/// # Ok(())
295/// # }
296/// ```
297pub fn index_search(
298    mol: &Molecule,
299    canonize_mode: CanonizeMode,
300    parallel_mode: ParallelMode,
301    memoize_mode: MemoizeMode,
302    kernel_mode: KernelMode,
303    bounds: &[Bound],
304) -> (u32, u32, usize) {
305    // Catch not-yet-implemented modes.
306    if kernel_mode != KernelMode::None {
307        panic!("The chosen --kernel mode is not implemented yet!")
308    }
309
310    // Enumerate matches (i.e., pairs of edge-disjoint isomorphic fragments).
311    let matches = Matches::new(mol, canonize_mode);
312
313    // Create memoization cache.
314    let mut cache = Cache::new(memoize_mode, canonize_mode);
315
316    // Create the initial assembly state.
317    let state = State::new(mol);
318
319    // Search for the shortest assembly pathway recursively.
320    let edge_count = mol.graph().edge_count();
321    let best_index = Arc::new(AtomicUsize::from(edge_count - 1));
322    let (index, states_searched) = recurse_index_search(
323        mol,
324        &matches,
325        &state,
326        best_index,
327        bounds,
328        &mut cache,
329        parallel_mode,
330    );
331
332    (index as u32, matches.len() as u32, states_searched)
333}
334
335/// Compute a molecule's assembly index using an efficient default strategy.
336///
337/// To customize assembly index calculation beyond the default strategy, see
338/// [`index_search`].
339///
340/// # Example
341/// ```
342/// # use std::{fs, path::PathBuf};
343/// use assembly_theory::assembly::index;
344/// use assembly_theory::loader::parse_molfile_str;
345///
346/// # fn main() -> Result<(), std::io::Error> {
347/// // Load a molecule from a .mol file.
348/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
349/// let molfile = fs::read_to_string(path)?;
350/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
351///
352/// // Compute the molecule's assembly index.
353/// assert_eq!(index(&anthracene), 6);
354/// # Ok(())
355/// # }
356/// ```
357pub fn index(mol: &Molecule) -> u32 {
358    index_search(
359        mol,
360        CanonizeMode::TreeNauty,
361        ParallelMode::DepthOne,
362        MemoizeMode::CanonIndex,
363        KernelMode::None,
364        &[Bound::Int, Bound::MatchableEdges],
365    )
366    .0
367}