assembly_theory/
assembly.rs

1//! Compute assembly indices of molecules.
2//!
3//! # Example
4//! ```
5//! # use std::{fs, path::PathBuf};
6//! use assembly_theory::assembly::index;
7//! use assembly_theory::loader::parse_molfile_str;
8//!
9//! # fn main() -> Result<(), std::io::Error> {
10//! // Load a molecule from a .mol file.
11//! let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
12//! let molfile = fs::read_to_string(path)?;
13//! let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
14//!
15//! // Compute the molecule's assembly index.
16//! assert_eq!(index(&anthracene), 6);
17//! # Ok(())
18//! # }
19//! ```
20
21use std::sync::{
22    atomic::{AtomicUsize, Ordering::Relaxed},
23    Arc,
24};
25
26use bit_set::BitSet;
27use clap::ValueEnum;
28use dashmap::DashMap;
29use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
30
31use crate::{
32    bounds::{bound_exceeded, Bound},
33    canonize::{canonize, CanonizeMode, Labeling},
34    enumerate::{enumerate_subgraphs, EnumerateMode},
35    kernels::KernelMode,
36    memoize::{Cache, MemoizeMode},
37    molecule::Molecule,
38    utils::connected_components_under_edges,
39};
40
41/// Parallelization strategy for the recursive search phase.
42#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
43pub enum ParallelMode {
44    /// No parallelism.
45    None,
46    /// Create a task pool from the recursion's first level only.
47    DepthOne,
48    /// Spawn a new thread at every recursive call.
49    Always,
50}
51
52/// Compute assembly depth; see
53/// [Pagel et al. (2024)](https://arxiv.org/abs/2409.05993).
54///
55/// Note: This function is currently very (unusably) slow. It primarily exists
56/// in this crate as a proof of concept.
57///
58/// # Example
59/// ```
60/// # use std::{fs, path::PathBuf};
61/// use assembly_theory::assembly::depth;
62/// use assembly_theory::loader::parse_molfile_str;
63///
64/// # fn main() -> Result<(), std::io::Error> {
65/// // Load a molecule from a .mol file.
66/// let path = PathBuf::from(format!("./data/checks/benzene.mol"));
67/// let molfile = fs::read_to_string(path)?;
68/// let benzene = parse_molfile_str(&molfile).expect("Parsing failure.");
69///
70/// // Compute the molecule's assembly depth.
71/// assert_eq!(depth(&benzene), 3);
72/// # Ok(())
73/// # }
74/// ```
75pub fn depth(mol: &Molecule) -> u32 {
76    let mut ix = u32::MAX;
77    for (left, right) in mol.partitions().unwrap() {
78        let l = if left.is_basic_unit() {
79            0
80        } else {
81            depth(&left)
82        };
83
84        let r = if right.is_basic_unit() {
85            0
86        } else {
87            depth(&right)
88        };
89
90        ix = ix.min(l.max(r) + 1)
91    }
92    ix
93}
94
95/// Helper function for [`index_search`]; only public for benchmarking.
96///
97/// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule,
98/// sorted to guarantee deterministic iteration.
99pub fn matches(
100    mol: &Molecule,
101    enumerate_mode: EnumerateMode,
102    canonize_mode: CanonizeMode,
103    parallel_mode: ParallelMode,
104) -> Vec<(BitSet, BitSet)> {
105    // Enumerate all connected, non-induced subgraphs with at most |E|/2 edges
106    // and bin them into isomorphism classes using canonization.
107    let isomorphism_classes = DashMap::<Labeling, Vec<BitSet>>::new();
108    let bin_subgraph = |subgraph: &BitSet| {
109        isomorphism_classes
110            .entry(canonize(mol, subgraph, canonize_mode))
111            .and_modify(|bucket| bucket.push(subgraph.clone()))
112            .or_insert(vec![subgraph.clone()]);
113    };
114    if parallel_mode == ParallelMode::None {
115        enumerate_subgraphs(mol, enumerate_mode)
116            .iter()
117            .for_each(bin_subgraph);
118    } else {
119        enumerate_subgraphs(mol, enumerate_mode)
120            .par_iter()
121            .for_each(bin_subgraph);
122    }
123
124    // In each isomorphism class, collect non-overlapping pairs of subgraphs.
125    let mut matches = Vec::new();
126    for bucket in isomorphism_classes.iter() {
127        for (i, first) in bucket.iter().enumerate() {
128            for second in &bucket[i..] {
129                if first.is_disjoint(second) {
130                    if first > second {
131                        matches.push((first.clone(), second.clone()));
132                    } else {
133                        matches.push((second.clone(), first.clone()));
134                    }
135                }
136            }
137        }
138    }
139
140    // Sort pairs in a deterministic order and return.
141    matches.sort_by(|e1, e2| {
142        let ord = [
143            e2.0.len().cmp(&e1.0.len()), // Decreasing subgraph size.
144            e1.0.cmp(&e2.0),             // First subgraph lexicographical.
145            e1.1.cmp(&e2.1),             // Second subgraph lexicographical.
146        ];
147        let mut i = 0;
148        while ord[i] == std::cmp::Ordering::Equal {
149            i += 1;
150        }
151        ord[i]
152    });
153
154    matches
155}
156
157/// Determine the fragments produced from the given assembly state by removing
158/// the given pair of non-overlapping, isomorphic subgraphs and then adding one
159/// back; return `None` if not possible.
160fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Option<Vec<BitSet>> {
161    // Attempt to find fragments f1 and f2 containing h1 and h2, respectively;
162    // if either do not exist, exit without further fragmentation.
163    let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c));
164    let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c));
165    let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else {
166        return None;
167    };
168
169    let mut fragments = state.to_owned();
170
171    // If the same fragment f1 (== f2) contains both h1 and h2, replace this
172    // one fragment f1 with all connected components comprising f1 - (h1 U h2).
173    // Otherwise, replace fragments f1 and f2 with all connected components
174    // comprising f1 - h1 and f2 - h2, respectively.
175    if i1 == i2 {
176        let mut union = h1.clone();
177        union.union_with(h2);
178        let mut difference = f1.clone();
179        difference.difference_with(&union);
180        let c = connected_components_under_edges(mol.graph(), &difference);
181        fragments.extend(c);
182        fragments.swap_remove(i1);
183    } else {
184        let mut diff1 = f1.clone();
185        diff1.difference_with(h1);
186        let c1 = connected_components_under_edges(mol.graph(), &diff1);
187        fragments.extend(c1);
188
189        let mut diff2 = f2.clone();
190        diff2.difference_with(h2);
191        let c2 = connected_components_under_edges(mol.graph(), &diff2);
192        fragments.extend(c2);
193
194        fragments.swap_remove(i1.max(i2));
195        fragments.swap_remove(i1.min(i2));
196    }
197
198    // Drop any singleton fragments, add h1 as a fragment, and return.
199    fragments.retain(|i| i.len() > 1);
200    fragments.push(h1.clone());
201    Some(fragments)
202}
203
204/// Recursive helper for [`index_search`], only public for benchmarking.
205///
206/// Inputs:
207/// - `mol`: The molecule whose assembly index is being calculated.
208/// - `matches`: The remaining non-overlapping isomorphic subgraph pairs.
209/// - `removal_order`: TODO
210/// - `state`: The current assembly state, i.e., a list of fragments.
211/// - `state_index`: This assembly state's upper bound on the assembly index,
212///   i.e., edges(mol) - 1 - [edges(subgraphs removed) - #(subgraphs removed)].
213/// - `best_index`: The smallest assembly index for all assembly states so far.
214/// - `largest_remove`: An upper bound on the size of fragments that can be
215///   removed from this or any descendant assembly state.
216/// - `bounds`: The list of bounding strategies to apply.
217/// - `cache`: TODO
218/// - `parallel_mode`: The parallelism mode for this state's match iteration.
219///
220/// Returns, from this assembly state and any of its descendents:
221/// - `usize`: An updated upper bound on the assembly index. (Note: If this
222///   state is pruned by bounds or deemed redundant by memoization, then the
223///   upper bound returned is unchanged.)
224/// - `usize`: The number of assembly states searched.
225#[allow(clippy::too_many_arguments)]
226pub fn recurse_index_search(
227    mol: &Molecule,
228    matches: &[(BitSet, BitSet)],
229    removal_order: Vec<usize>,
230    state: &[BitSet],
231    state_index: usize,
232    best_index: Arc<AtomicUsize>,
233    largest_remove: usize,
234    bounds: &[Bound],
235    cache: &mut Cache,
236    parallel_mode: ParallelMode,
237) -> (usize, usize) {
238    // If any bounds would prune this assembly state or if memoization is
239    // enabled and this assembly state is preempted by the cached state, halt.
240    if bound_exceeded(
241        mol,
242        state,
243        state_index,
244        best_index.load(Relaxed),
245        largest_remove,
246        bounds,
247    ) || cache.memoize_state(mol, state, state_index, &removal_order)
248    {
249        return (state_index, 1);
250    }
251
252    // Keep track of the best assembly index found in any of this assembly
253    // state's children and the number of states searched, including this one.
254    let best_child_index = AtomicUsize::from(state_index);
255    let states_searched = AtomicUsize::from(1);
256
257    // Define a closure that handles recursing to a new assembly state based on
258    // the given (enumerated) pair of non-overlapping isomorphic subgraphs.
259    let recurse_on_match = |i: usize, h1: &BitSet, h2: &BitSet| {
260        if let Some(fragments) = fragments(mol, state, h1, h2) {
261            // If using depth-one parallelism, all descendant states should be
262            // computed serially.
263            let new_parallel = if parallel_mode == ParallelMode::DepthOne {
264                ParallelMode::None
265            } else {
266                parallel_mode
267            };
268
269            // Recurse using the remaining matches and updated fragments.
270            let (child_index, child_states_searched) = recurse_index_search(
271                mol,
272                &matches[i + 1..],
273                {
274                    let mut clone = removal_order.clone();
275                    clone.push(i);
276                    clone
277                },
278                &fragments,
279                state_index - h1.len() + 1,
280                best_index.clone(),
281                h1.len(),
282                bounds,
283                &mut cache.clone(),
284                new_parallel,
285            );
286
287            // Update the best assembly indices (across children states and
288            // the entire search) and the number of descendant states searched.
289            best_child_index.fetch_min(child_index, Relaxed);
290            best_index.fetch_min(best_child_index.load(Relaxed), Relaxed);
291            states_searched.fetch_add(child_states_searched, Relaxed);
292        }
293    };
294
295    // Use the iterator type corresponding to the specified parallelism mode.
296    if parallel_mode == ParallelMode::None {
297        matches
298            .iter()
299            .enumerate()
300            .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2));
301    } else {
302        matches
303            .par_iter()
304            .enumerate()
305            .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2));
306    }
307
308    (
309        best_child_index.load(Relaxed),
310        states_searched.load(Relaxed),
311    )
312}
313
314/// Compute a molecule's assembly index and related information using a
315/// top-down recursive algorithm, parameterized by the specified options.
316///
317/// See [`EnumerateMode`], [`CanonizeMode`], [`ParallelMode`], [`KernelMode`],
318/// and [`Bound`] for details on how to customize the algorithm. Notably,
319/// bounds are applied in the order they appear in the `bounds` slice. It is
320/// generally better to provide bounds that are quick to compute first.
321///
322/// The results returned are:
323/// - The molecule's `u32` assembly index.
324/// - The molecule's `u32` number of non-overlapping isomorphic subgraph pairs.
325/// - The `usize` total number of assembly states searched, where an assembly
326///   state is a collection of fragments. Note that, depending on the algorithm
327///   parameters used, some states may be searched/counted multiple times.
328///
329/// # Example
330/// ```
331/// # use std::{fs, path::PathBuf};
332/// use assembly_theory::{
333///     assembly::{index_search, ParallelMode},
334///     bounds::Bound,
335///     canonize::CanonizeMode,
336///     enumerate::EnumerateMode,
337///     kernels::KernelMode,
338///     loader::parse_molfile_str,
339///     memoize::MemoizeMode,
340/// };
341///
342/// # fn main() -> Result<(), std::io::Error> {
343/// // Load a molecule from a .mol file.
344/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
345/// let molfile = fs::read_to_string(path)?;
346/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
347///
348/// // Compute the molecule's assembly index without parallelism, memoization,
349/// // kernelization, or bounds.
350/// let (slow_index, _, _) = index_search(
351///     &anthracene,
352///     EnumerateMode::GrowErode,
353///     CanonizeMode::TreeNauty,
354///     ParallelMode::None,
355///     MemoizeMode::None,
356///     KernelMode::None,
357///     &[],
358/// );
359///
360/// // Compute the molecule's assembly index with parallelism, memoization, and
361/// // some bounds.
362/// let (fast_index, _, _) = index_search(
363///     &anthracene,
364///     EnumerateMode::GrowErode,
365///     CanonizeMode::TreeNauty,
366///     ParallelMode::DepthOne,
367///     MemoizeMode::CanonIndex,
368///     KernelMode::None,
369///     &[Bound::Log, Bound::Int],
370/// );
371///
372/// assert_eq!(slow_index, 6);
373/// assert_eq!(fast_index, 6);
374/// # Ok(())
375/// # }
376/// ```
377pub fn index_search(
378    mol: &Molecule,
379    enumerate_mode: EnumerateMode,
380    canonize_mode: CanonizeMode,
381    parallel_mode: ParallelMode,
382    memoize_mode: MemoizeMode,
383    kernel_mode: KernelMode,
384    bounds: &[Bound],
385) -> (u32, u32, usize) {
386    // Catch not-yet-implemented modes.
387    if kernel_mode != KernelMode::None {
388        panic!("The chosen --kernel mode is not implemented yet!")
389    }
390
391    // Enumerate non-overlapping isomorphic subgraph pairs.
392    let matches = matches(mol, enumerate_mode, canonize_mode, parallel_mode);
393
394    // Create memoization cache.
395    let mut cache = Cache::new(memoize_mode, canonize_mode);
396
397    // Initialize the first fragment as the entire graph.
398    let mut init = BitSet::new();
399    init.extend(mol.graph().edge_indices().map(|ix| ix.index()));
400
401    // Search for the shortest assembly pathway recursively.
402    let edge_count = mol.graph().edge_count();
403    let best_index = Arc::new(AtomicUsize::from(edge_count - 1));
404    let (index, states_searched) = recurse_index_search(
405        mol,
406        &matches,
407        Vec::new(),
408        &[init],
409        edge_count - 1,
410        best_index,
411        edge_count,
412        bounds,
413        &mut cache,
414        parallel_mode,
415    );
416
417    (index as u32, matches.len() as u32, states_searched)
418}
419
420/// Compute a molecule's assembly index using an efficient default strategy.
421///
422/// # Example
423/// ```
424/// # use std::{fs, path::PathBuf};
425/// use assembly_theory::assembly::index;
426/// use assembly_theory::loader::parse_molfile_str;
427///
428/// # fn main() -> Result<(), std::io::Error> {
429/// // Load a molecule from a .mol file.
430/// let path = PathBuf::from(format!("./data/checks/anthracene.mol"));
431/// let molfile = fs::read_to_string(path)?;
432/// let anthracene = parse_molfile_str(&molfile).expect("Parsing failure.");
433///
434/// // Compute the molecule's assembly index.
435/// assert_eq!(index(&anthracene), 6);
436/// # Ok(())
437/// # }
438/// ```
439pub fn index(mol: &Molecule) -> u32 {
440    index_search(
441        mol,
442        EnumerateMode::GrowErode,
443        CanonizeMode::TreeNauty,
444        ParallelMode::DepthOne,
445        MemoizeMode::CanonIndex,
446        KernelMode::None,
447        &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags],
448    )
449    .0
450}