assembly_theory/
bounds.rs

1//! Prune assembly states from which the assembly index cannot improve.
2//!
3//! Each bound takes information about the current assembly state (i.e., set of
4//! fragments) and computes an upper bound on the "savings" (in terms of number
5//! of joining operations) that can possibly be obtained when constructing the
6//! molecule using this state's fragments and subfragments thereof. Let
7//! `state_index` be this assembly state's assembly index, `best_index` be the
8//! smallest assembly index found across any assembly state so far, and `bound`
9//! be the upper bound on this assembly state's possible savings. If ever
10//! `state_index` - `bound` >= `best_index`, then no descendant of this
11//! assembly state can possibly yield an assembly index better than
12//! `best_index` and thus this assembly state can be pruned.
13
14use bit_set::BitSet;
15use clap::ValueEnum;
16
17use crate::molecule::{Bond, Element, Molecule};
18
19/// Type of upper bound on the "savings" possible from an assembly state.
20#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
21pub enum Bound {
22    /// The shortest number of joining operations to create a molecule with |E|
23    /// bonds is log_2(|E|), i.e., if it is possible to always join the largest
24    /// fragment with itself to produce the molecule. Thus, an upper bound on
25    /// a state's savings is [#fragment bonds] - log_2([#fragment bonds]); see
26    /// [Jirasek et al. (2024)](https://doi.org/10.1021/acscentsci.4c00120).
27    Log,
28    /// An improvement over `Log` that also uses the size of the "largest
29    /// duplicatable subgraph" for this state in an integer addition chain; see
30    /// [Seet et al. (2024)](https://arxiv.org/abs/2410.09100).
31    Int,
32    /// Uses the types of bonds in the molecule to bound the number of assembly
33    /// steps remaining. The first time a unique bond type is added to the
34    /// graph, it could not have been part of a duplicate since that bond type
35    /// has not been used yet. Thus the number of unique bond types gives
36    /// information on how many more joins are required.
37    VecSimple,
38    /// Considers the fragments of size two in the current fragmentation. In
39    /// the remaining top-down process, such fragments will require one step to
40    /// remove if there is a duplicate set of two bonds in the graph.
41    /// Otherwise, they will require two steps.
42    VecSmallFrags,
43    /// A weighted independent set cover provides a bound on the size of a max.
44    /// weight clique in the compatibility graph. Uses a greedy algorithm  to
45    /// construct such a cover and obtain a bound. See
46    /// [Lamm et al. (2019)](https://doi.org/10.1137/1.9781611975499.12) for
47    /// the definition of a cover. (Note that they solve the equivalent
48    /// weighted independent set problem and thus use a clique cover instead.)
49    CoverNoSort,
50    /// Like `CoverNoSort`, buts sorts the vertices of the compatibility
51    /// graph by degree before creating the greedy independent set cover.
52    CoverSort,
53    /// Uses the compatibility graph to determine the largest duplicatable
54    /// subraphs remaining in each fragment. Uses this to bound the best
55    /// possible savings obtainable for each fragment.
56    CliqueBudget,
57}
58
59/// Edge information used in vector addition chain bounds.
60#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
61struct EdgeType {
62    bond: Bond,
63    ends: (Element, Element),
64}
65
66/// Returns `true` iff any of the given bounds would prune this assembly state.
67pub fn bound_exceeded(
68    mol: &Molecule,
69    fragments: &[BitSet],
70    state_index: usize,
71    best_index: usize,
72    largest_remove: usize,
73    bounds: &[Bound],
74) -> bool {
75    for bound_type in bounds {
76        let exceeds = match bound_type {
77            Bound::Log => state_index - log_bound(fragments) >= best_index,
78            Bound::Int => state_index - int_bound(fragments, largest_remove) >= best_index,
79            Bound::VecSimple => {
80                state_index - vec_simple_bound(fragments, largest_remove, mol) >= best_index
81            }
82            Bound::VecSmallFrags => {
83                state_index - vec_small_frags_bound(fragments, largest_remove, mol) >= best_index
84            }
85            _ => {
86                panic!("One of the chosen bounds is not implemented yet!")
87            }
88        };
89        if exceeds {
90            return true;
91        }
92    }
93    false
94}
95
96/// TODO
97fn log_bound(fragments: &[BitSet]) -> usize {
98    let mut size = 0;
99    for f in fragments {
100        size += f.len();
101    }
102
103    size - (size as f32).log2().ceil() as usize
104}
105
106/// TODO
107fn int_bound(fragments: &[BitSet], m: usize) -> usize {
108    let mut max_s: usize = 0;
109    let mut frag_sizes: Vec<usize> = Vec::new();
110
111    for f in fragments {
112        frag_sizes.push(f.len());
113    }
114
115    let size_sum: usize = frag_sizes.iter().sum();
116
117    // Test for all sizes m of largest removed duplicate
118    for max in 2..m + 1 {
119        let log = (max as f32).log2().ceil();
120        let mut aux_sum: usize = 0;
121
122        for len in &frag_sizes {
123            aux_sum += (len / max) + (len % max != 0) as usize
124        }
125
126        max_s = max_s.max(size_sum - log as usize - aux_sum);
127    }
128
129    max_s
130}
131
132/// TODO
133// Count number of unique edges in a fragment
134// Helper function for vector bounds
135fn unique_edges(fragment: &BitSet, mol: &Molecule) -> Vec<EdgeType> {
136    let g = mol.graph();
137    let mut nodes: Vec<Element> = Vec::new();
138    for v in g.node_weights() {
139        nodes.push(v.element());
140    }
141    let edges: Vec<petgraph::prelude::EdgeIndex> = g.edge_indices().collect();
142    let weights: Vec<Bond> = g.edge_weights().copied().collect();
143
144    // types will hold an element for every unique edge type in fragment
145    let mut types: Vec<EdgeType> = Vec::new();
146    for idx in fragment.iter() {
147        let bond = weights[idx];
148        let e = edges[idx];
149
150        let (e1, e2) = g.edge_endpoints(e).expect("bad");
151        let e1 = nodes[e1.index()];
152        let e2 = nodes[e2.index()];
153        let ends = if e1 < e2 { (e1, e2) } else { (e2, e1) };
154
155        let edge_type = EdgeType { bond, ends };
156
157        if types.contains(&edge_type) {
158            continue;
159        } else {
160            types.push(edge_type);
161        }
162    }
163
164    types
165}
166
167/// TODO
168fn vec_simple_bound(fragments: &[BitSet], m: usize, mol: &Molecule) -> usize {
169    // Calculate s (total number of edges)
170    // Calculate z (number of unique edges)
171    let mut s = 0;
172    for f in fragments {
173        s += f.len();
174    }
175
176    let mut union_set = BitSet::new();
177    for f in fragments {
178        union_set.union_with(f);
179    }
180    let z = unique_edges(&union_set, mol).len();
181
182    (s - z) - ((s - z) as f32 / m as f32).ceil() as usize
183}
184
185/// TODO
186fn vec_small_frags_bound(fragments: &[BitSet], m: usize, mol: &Molecule) -> usize {
187    let mut size_two_fragments: Vec<BitSet> = Vec::new();
188    let mut large_fragments: Vec<BitSet> = fragments.to_owned();
189    let mut indices_to_remove: Vec<usize> = Vec::new();
190
191    // Find and remove fragments of size 2
192    for (i, frag) in fragments.iter().enumerate() {
193        if frag.len() == 2 {
194            indices_to_remove.push(i);
195        }
196    }
197    for &index in indices_to_remove.iter().rev() {
198        let removed_bitset = large_fragments.remove(index);
199        size_two_fragments.push(removed_bitset);
200    }
201
202    // Compute z = num unique edges of large_fragments NOT also in size_two_fragments
203    let mut fragments_union = BitSet::new();
204    let mut size_two_fragments_union = BitSet::new();
205    for f in fragments {
206        fragments_union.union_with(f);
207    }
208    for f in size_two_fragments.iter() {
209        size_two_fragments_union.union_with(f);
210    }
211    let z = unique_edges(&fragments_union, mol).len()
212        - unique_edges(&size_two_fragments_union, mol).len();
213
214    // Compute s = total number of edges in fragments
215    // Compute sl = total number of edges in large fragments
216    let mut s = 0;
217    let mut sl = 0;
218    for f in fragments {
219        s += f.len();
220    }
221    for f in large_fragments {
222        sl += f.len();
223    }
224
225    // Find number of unique size two fragments
226    let mut size_two_types: Vec<(EdgeType, EdgeType)> = Vec::new();
227    for f in size_two_fragments.iter() {
228        let mut types = unique_edges(f, mol);
229        types.sort();
230        if types.len() == 1 {
231            size_two_types.push((types[0], types[0]));
232        } else {
233            size_two_types.push((types[0], types[1]));
234        }
235    }
236    size_two_types.sort();
237    size_two_types.dedup();
238
239    s - (z + size_two_types.len() + size_two_fragments.len())
240        - ((sl - z) as f32 / m as f32).ceil() as usize
241}