assembly_theory/
molecule.rs

1//! Graph-theoretic representation of a molecule.
2
3use std::{
4    collections::{BTreeSet, HashMap, HashSet},
5    fmt::Display,
6    str::FromStr,
7};
8
9use petgraph::{
10    dot::Dot,
11    graph::{EdgeIndex, Graph, NodeIndex},
12    Undirected,
13};
14
15use crate::utils::{edge_induced_subgraph, is_subset_connected};
16
17pub(crate) type Index = u32;
18pub(crate) type MGraph = Graph<Atom, Bond, Undirected, Index>;
19type EdgeSet = BTreeSet<EdgeIndex<Index>>;
20type NodeSet = BTreeSet<NodeIndex<Index>>;
21
22/// Thrown by [`Element::from_str`] if the string is not a valid chemical
23/// element.
24#[derive(Debug, Copy, Clone, PartialEq, Eq)]
25pub struct ParseElementError;
26
27macro_rules! periodic_table {
28    ( $(($element:ident, $name:literal, $atomicweight:literal),)* ) => {
29        #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
30        /// A chemical element on the periodic table.
31        pub enum Element {
32            $( $element, )*
33        }
34
35        impl Display for Element {
36            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
37                match &self {
38                    $( Element::$element => write!(f, "{}", $name), )*
39                }
40            }
41        }
42
43        impl FromStr for Element {
44            type Err = ParseElementError;
45            fn from_str(s: &str) -> Result<Self, Self::Err> {
46                match s {
47                    $( $name => Ok(Element::$element), )*
48                    _ => Err(ParseElementError),
49                }
50            }
51        }
52
53        impl Element {
54            pub fn repr(self) -> u8 {
55                match &self {
56                    $( Element::$element => $atomicweight, )*
57                }
58            }
59        }
60    };
61}
62
63periodic_table!(
64    (Hydrogen, "H", 1),
65    (Helium, "He", 2),
66    (Lithium, "Li", 3),
67    (Beryllium, "Be", 4),
68    (Boron, "B", 5),
69    (Carbon, "C", 6),
70    (Nitrogen, "N", 7),
71    (Oxygen, "O", 8),
72    (Fluorine, "F", 9),
73    (Neon, "Ne", 10),
74    (Sodium, "Na", 11),
75    (Magnesium, "Mg", 12),
76    (Aluminum, "Al", 13),
77    (Silicon, "Si", 14),
78    (Phosphorus, "P", 15),
79    (Sulfur, "S", 16),
80    (Chlorine, "Cl", 17),
81    (Argon, "Ar", 18),
82    (Potassium, "K", 19),
83    (Calcium, "Ca", 20),
84    (Scandium, "Sc", 21),
85    (Titanium, "Ti", 22),
86    (Vanadium, "V", 23),
87    (Chromium, "Cr", 24),
88    (Manganese, "Mn", 25),
89    (Iron, "Fe", 26),
90    (Cobalt, "Co", 27),
91    (Nickel, "Ni", 28),
92    (Copper, "Cu", 29),
93    (Zinc, "Zn", 30),
94    (Gallium, "Ga", 31),
95    (Germanium, "Ge", 32),
96    (Arsenic, "As", 33),
97    (Selenium, "Se", 34),
98    (Bromine, "Br", 35),
99    (Krypton, "Kr", 36),
100    (Rubidium, "Rb", 37),
101    (Strontium, "Sr", 38),
102    (Yttrium, "Y", 39),
103    (Zirconium, "Zr", 40),
104    (Niobium, "Nb", 41),
105    (Molybdenum, "Mo", 42),
106    (Technetium, "Tc", 43),
107    (Ruthenium, "Ru", 44),
108    (Rhodium, "Rh", 45),
109    (Palladium, "Pd", 46),
110    (Silver, "Ag", 47),
111    (Cadmium, "Cd", 48),
112    (Indium, "In", 49),
113    (Tin, "Sn", 50),
114    (Antimony, "Sb", 51),
115    (Tellurium, "Te", 52),
116    (Iodine, "I", 53),
117    (Xenon, "Xe", 54),
118    (Cesium, "Cs", 55),
119    (Barium, "Ba", 56),
120    (Lanthanum, "La", 57),
121    (Cerium, "Ce", 58),
122    (Praseodymium, "Pr", 59),
123    (Neodymium, "Nd", 60),
124    (Promethium, "Pm", 61),
125    (Samarium, "Sm", 62),
126    (Europium, "Eu", 63),
127    (Gadolinium, "Gd", 64),
128    (Terbium, "Tb", 65),
129    (Dysprosium, "Dy", 66),
130    (Holmium, "Ho", 67),
131    (Erbium, "Er", 68),
132    (Thulium, "Tm", 69),
133    (Ytterbium, "Yb", 70),
134    (Lutetium, "Lu", 71),
135    (Hafnium, "Hf", 72),
136    (Tantalum, "Ta", 73),
137    (Wolfram, "W", 74),
138    (Rhenium, "Re", 75),
139    (Osmium, "Os", 76),
140    (Iridium, "Ir", 77),
141    (Platinum, "Pt", 78),
142    (Gold, "Au", 79),
143    (Mercury, "Hg", 80),
144    (Thallium, "Tl", 81),
145    (Lead, "Pb", 82),
146    (Bismuth, "Bi", 83),
147    (Polonium, "Po", 84),
148    (Astatine, "At", 85),
149    (Radon, "Rn", 86),
150    (Francium, "Fr", 87),
151    (Radium, "Ra", 88),
152    (Actinium, "Ac", 89),
153    (Thorium, "Th", 90),
154    (Protactinium, "Pa", 91),
155    (Uranium, "U", 92),
156    (Neptunium, "Np", 93),
157    (Plutonium, "Pu", 94),
158    (Americium, "Am", 95),
159    (Curium, "Cm", 96),
160    (Berkelium, "Bk", 97),
161    (Californium, "Cf", 98),
162    (Einsteinium, "Es", 99),
163    (Fermium, "Fm", 100),
164    (Mendelevium, "Md", 101),
165    (Nobelium, "No", 102),
166    (Lawrencium, "Lr", 103),
167    (Rutherfordium, "Rf", 104),
168    (Dubnium, "Db", 105),
169    (Seaborgium, "Sg", 106),
170    (Bohrium, "Bh", 107),
171    (Hassium, "Hs", 108),
172    (Meitnerium, "Mt", 109),
173    (Darmstadtium, "Ds", 110),
174    (Roentgenium, "Rg", 111),
175    (Copernicium, "Cn", 112),
176    (Nihonium, "Nh", 113),
177    (Flerovium, "Fl", 114),
178    (Moscovium, "Mc", 115),
179    (Livermorium, "Lv", 116),
180    (Tennessine, "Ts", 117),
181    (Oganesson, "Og", 118),
182);
183
184/// The nodes of a [`Molecule`] graph.
185///
186/// Atoms contain an element and have a (currently unused) `capacity` field.
187#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
188pub struct Atom {
189    element: Element,
190    capacity: u32,
191}
192
193impl Atom {
194    /// Construct an [`Atom`] of type `element` with capacity `capacity`.
195    pub fn new(element: Element, capacity: u32) -> Self {
196        Self { element, capacity }
197    }
198
199    /// Return this [`Atom`]'s element.
200    pub fn element(&self) -> Element {
201        self.element
202    }
203}
204
205/// The edges of a [`Molecule`] graph.
206///
207/// The `.mol` file spec describes seven types of bonds, but the assembly
208/// theory literature only considers single, double, and triple bonds. Notably,
209/// aromatic rings are represented by alternating single and double bonds
210/// instead of the aromatic bond type.
211#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
212pub enum Bond {
213    Single,
214    Double,
215    Triple,
216}
217
218/// Either an [`Atom`] or a [`Bond`].
219#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
220pub enum AtomOrBond {
221    Atom(Atom),
222    Bond(Bond),
223}
224
225/// Thrown by [`Bond::try_from`] when given anything other than a 1, 2, or 3.
226#[derive(Debug, Copy, Clone, PartialEq, Eq)]
227pub struct ParseBondError;
228
229impl TryFrom<usize> for Bond {
230    type Error = ParseBondError;
231    fn try_from(value: usize) -> Result<Self, Self::Error> {
232        match value {
233            1 => Ok(Bond::Single),
234            2 => Ok(Bond::Double),
235            3 => Ok(Bond::Triple),
236            _ => Err(ParseBondError),
237        }
238    }
239}
240
241impl Bond {
242    pub fn repr(self) -> u8 {
243        match self {
244            Bond::Single => 1,
245            Bond::Double => 2,
246            Bond::Triple => 3,
247        }
248    }
249}
250
251/// A simple, loopless graph with [`Atom`]s as nodes and [`Bond`]s as edges.
252///
253/// Assembly theory literature ignores hydrogen atoms by default. Molecules can
254/// have hydrogen atoms inserted into them, but by default are constructed
255/// without hydrogen atoms or bonds to hydrogen atoms.
256#[derive(Debug, Clone)]
257pub struct Molecule {
258    graph: MGraph,
259}
260
261impl Molecule {
262    /// Construct a [`Molecule`] from an existing `MGraph`.
263    pub(crate) fn from_graph(g: MGraph) -> Self {
264        Self { graph: g }
265    }
266
267    /// Return a representation of this molecule as an `MGraph`.
268    ///
269    /// Only public for benchmarking purposes.
270    pub fn graph(&self) -> &MGraph {
271        &self.graph
272    }
273
274    /// Return a pretty-printable representation of this molecule.
275    pub fn info(&self) -> String {
276        let dot = Dot::new(&self.graph);
277        format!("{dot:?}")
278    }
279
280    /// Return `true` iff this molecule contains self-loops or multiple edges
281    /// between any pair of nodes.
282    pub fn is_malformed(&self) -> bool {
283        let mut uniq = HashSet::new();
284        !self.graph.edge_indices().all(|ix| {
285            uniq.insert(ix)
286                && self
287                    .graph
288                    .edge_endpoints(ix)
289                    .is_some_and(|(src, dst)| src != dst)
290        })
291    }
292
293    /// Return `true` iff this molecule comprises only one bond (of any type).
294    pub fn is_basic_unit(&self) -> bool {
295        self.graph.edge_count() == 1 && self.graph.node_count() == 2
296    }
297
298    /// Join this molecule with `other` on edge `on`.
299    pub fn join(
300        &self,
301        other: &Molecule,
302        on: impl IntoIterator<Item = (NodeIndex<Index>, NodeIndex<Index>)>,
303    ) -> Option<Molecule> {
304        let mut output_graph = self.clone();
305
306        let mut v_set = NodeSet::new();
307        let mut io_map = HashMap::<NodeIndex<Index>, NodeIndex<Index>>::new();
308
309        for (u, v) in on.into_iter() {
310            v_set.insert(v);
311            io_map.insert(v, u);
312        }
313
314        for ix in other.graph.node_indices() {
315            if !v_set.contains(&ix) {
316                let w = *other.graph.node_weight(ix)?;
317                let out = output_graph.graph.add_node(w);
318                io_map.insert(ix, out);
319            }
320        }
321
322        for ix in other.graph.edge_indices() {
323            let (u, v) = other.graph.edge_endpoints(ix)?;
324            let um = io_map.get(&u)?;
325            let vm = io_map.get(&v)?;
326            let w = *other.graph.edge_weight(ix)?;
327
328            output_graph.graph.add_edge(*um, *vm, w);
329        }
330
331        Some(output_graph)
332    }
333
334    /// Return an iterator over all ways of partitioning this molecule into two
335    /// submolecules.
336    pub fn partitions(&self) -> Option<impl Iterator<Item = (Molecule, Molecule)> + '_> {
337        let mut solutions = HashSet::new();
338        let remaining_edges = self.graph.edge_indices().collect();
339        self.backtrack(
340            remaining_edges,
341            BTreeSet::new(),
342            BTreeSet::new(),
343            &mut solutions,
344        );
345        Some(solutions.into_iter().map(|(left, right)| {
346            (
347                Molecule {
348                    graph: edge_induced_subgraph(self.graph.clone(), &left),
349                },
350                Molecule {
351                    graph: edge_induced_subgraph(self.graph.clone(), &right),
352                },
353            )
354        }))
355    }
356
357    fn backtrack(
358        &self,
359        mut remaining_edges: Vec<EdgeIndex<Index>>,
360        left: EdgeSet,
361        right: EdgeSet,
362        solutions: &mut HashSet<(EdgeSet, EdgeSet)>,
363    ) {
364        if let Some(suffix) = remaining_edges.pop() {
365            let mut lc = left.clone();
366            lc.insert(suffix);
367
368            let mut rc = right.clone();
369            rc.insert(suffix);
370
371            self.backtrack(remaining_edges.clone(), lc, right, solutions);
372            self.backtrack(remaining_edges, left, rc, solutions);
373        } else if self.is_valid_partition(&left, &right) {
374            solutions.insert((left, right));
375        }
376    }
377
378    fn is_valid_partition(&self, left: &EdgeSet, right: &EdgeSet) -> bool {
379        !left.is_empty()
380            && !right.is_empty()
381            && is_subset_connected(&self.graph, left)
382            && is_subset_connected(&self.graph, right)
383    }
384
385    #[allow(dead_code)]
386    fn print_edgelist(&self, list: &[EdgeIndex], name: &str) {
387        println!(
388            "{name}: {:?}",
389            list.iter()
390                .map(|e| (
391                    e.index(),
392                    self.graph
393                        .edge_endpoints(*e)
394                        .map(|(i, j)| (
395                            i.index(),
396                            self.graph.node_weight(i).unwrap().element(),
397                            j.index(),
398                            self.graph.node_weight(j).unwrap().element(),
399                        ))
400                        .unwrap()
401                ))
402                .collect::<Vec<_>>()
403        );
404    }
405}
406
407mod tests {
408    #![allow(unused_imports)]
409    use super::*;
410
411    #[test]
412    fn element_to_string() {
413        assert!(Element::Hydrogen.to_string() == "H")
414    }
415
416    #[test]
417    fn element_from_string() {
418        assert!(str::parse("H") == Ok(Element::Hydrogen));
419        assert!(str::parse::<Element>("Foo").is_err());
420    }
421}