Skip to main content

molprint_core/smiles/
parser.rs

1use super::lexer::{LexerError, Token};
2use crate::mol::atom::Atom;
3use crate::mol::bond::BondType;
4use crate::mol::graph::{MolGraph, MolGraphExt};
5use crate::ring::assign_ring_info;
6use petgraph::graph::NodeIndex;
7use thiserror::Error;
8
9#[derive(Error, Debug)]
10pub enum ParseError {
11    #[error("lexer error: {0}")]
12    Lexer(#[from] LexerError),
13    #[error("unmatched ring closure {0}")]
14    UnmatchedRing(u8),
15    #[error("unmatched branch parenthesis")]
16    UnmatchedBranch,
17    #[error("empty SMILES string")]
18    Empty,
19    #[error("bond without target atom")]
20    DanglingBond,
21}
22
23/// Parse a SMILES string into a molecular graph.
24pub fn parse(smiles: &str) -> Result<MolGraph, ParseError> {
25    let tokens = super::lexer::tokenize(smiles)?;
26    if tokens.is_empty() {
27        return Err(ParseError::Empty);
28    }
29
30    let mut graph = MolGraph::new_undirected();
31    let mut current: Option<NodeIndex> = None;
32    let mut branch_stack: Vec<NodeIndex> = Vec::new();
33    let mut pending_bond: Option<BondType> = None;
34    // ring_id → (atom_index, bond_type_if_specified)
35    let mut ring_map: std::collections::HashMap<u8, (NodeIndex, Option<BondType>)> =
36        std::collections::HashMap::new();
37
38    for token in &tokens {
39        match token {
40            Token::Atom { element, aromatic } => {
41                let mut atom = Atom::new(*element);
42                atom.aromatic = *aromatic;
43                let idx = graph.add_node(atom);
44
45                if let Some(prev) = current {
46                    let bond =
47                        pending_bond
48                            .take()
49                            .unwrap_or(if *aromatic && graph[prev].aromatic {
50                                BondType::Aromatic
51                            } else {
52                                BondType::Single
53                            });
54                    graph.add_edge(prev, idx, bond);
55                }
56                current = Some(idx);
57            }
58
59            Token::BracketAtom {
60                isotope,
61                element,
62                aromatic,
63                h_count,
64                charge,
65                map_num,
66                chirality: _,
67            } => {
68                let mut atom = Atom::new(*element);
69                atom.isotope = *isotope;
70                atom.aromatic = *aromatic;
71                // Bracket atoms without H specification have 0 implicit H (SMILES spec).
72                // Use Some(0) so compute_implicit_h knows this is a bracket atom.
73                atom.explicit_h = Some(h_count.unwrap_or(0));
74                atom.charge = *charge;
75                atom.map_num = *map_num;
76                let idx = graph.add_node(atom);
77
78                if let Some(prev) = current {
79                    let bond =
80                        pending_bond
81                            .take()
82                            .unwrap_or(if *aromatic && graph[prev].aromatic {
83                                BondType::Aromatic
84                            } else {
85                                BondType::Single
86                            });
87                    graph.add_edge(prev, idx, bond);
88                }
89                current = Some(idx);
90            }
91
92            Token::Bond(bt) => {
93                pending_bond = Some(*bt);
94            }
95
96            Token::BondStereo(_) => {
97                // Stereo info stored separately; treat as single bond direction.
98                // Don't override pending_bond.
99            }
100
101            Token::OpenBranch => {
102                if let Some(cur) = current {
103                    branch_stack.push(cur);
104                }
105            }
106
107            Token::CloseBranch => {
108                current = branch_stack.pop();
109                if current.is_none() {
110                    return Err(ParseError::UnmatchedBranch);
111                }
112                pending_bond = None;
113            }
114
115            Token::RingClosure(ring_id) => {
116                let cur = current.ok_or(ParseError::DanglingBond)?;
117                if let Some((other, ring_bond)) = ring_map.remove(ring_id) {
118                    let bond = pending_bond.take().or(ring_bond).unwrap_or(
119                        if graph[cur].aromatic && graph[other].aromatic {
120                            BondType::Aromatic
121                        } else {
122                            BondType::Single
123                        },
124                    );
125                    graph.add_edge(cur, other, bond);
126                } else {
127                    ring_map.insert(*ring_id, (cur, pending_bond.take()));
128                }
129            }
130
131            Token::Dot => {
132                current = None;
133                pending_bond = None;
134            }
135        }
136    }
137
138    if let Some((&ring_id, _)) = ring_map.iter().next() {
139        return Err(ParseError::UnmatchedRing(ring_id));
140    }
141
142    if !branch_stack.is_empty() {
143        return Err(ParseError::UnmatchedBranch);
144    }
145
146    graph.assign_implicit_hydrogens();
147    assign_ring_info(&mut graph);
148
149    Ok(graph)
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Node handle for the `i`-th atom in the order it was added to the graph.
    fn node(i: usize) -> NodeIndex {
        NodeIndex::new(i)
    }

    /// A lone carbon gets four implicit hydrogens and no bonds.
    #[test]
    fn test_methane() {
        let mol = parse("C").unwrap();
        assert_eq!(mol.node_count(), 1);
        assert_eq!(mol.edge_count(), 0);
        assert_eq!(mol[node(0)].h_count, 4);
    }

    /// Chain atoms are joined by implicit bonds and hydrogens fill the rest.
    #[test]
    fn test_ethanol() {
        let mol = parse("CCO").unwrap();
        assert_eq!(mol.node_count(), 3);
        assert_eq!(mol.edge_count(), 2);
        assert_eq!(mol[node(0)].h_count, 3);
        assert_eq!(mol[node(1)].h_count, 2);
        assert_eq!(mol[node(2)].h_count, 1);
    }

    /// Lowercase atoms are aromatic and the ring closure adds the sixth bond.
    #[test]
    fn test_benzene() {
        let mol = parse("c1ccccc1").unwrap();
        assert_eq!(mol.node_count(), 6);
        assert_eq!(mol.edge_count(), 6);
        for i in 0..6 {
            assert!(mol[node(i)].aromatic);
            assert_eq!(mol[node(i)].h_count, 1);
        }
    }

    /// An explicit bond at the start of a branch applies to the branch atom.
    #[test]
    fn test_acetic_acid() {
        let mol = parse("CC(=O)O").unwrap();
        assert_eq!(mol.node_count(), 4);
        assert_eq!(mol.edge_count(), 3);
        let carbonyl_c = node(1);
        let carbonyl_o = node(2);
        let edge = mol.find_edge(carbonyl_c, carbonyl_o).unwrap();
        assert_eq!(mol[edge], BondType::Double);
    }

    /// A ring closure adds one edge on top of the chain bonds.
    #[test]
    fn test_cyclopropane() {
        let mol = parse("C1CC1").unwrap();
        assert_eq!(mol.node_count(), 3);
        assert_eq!(mol.edge_count(), 3);
    }

    /// Bracket atoms carry their charge and explicit hydrogen count.
    #[test]
    fn test_charged_ammonium() {
        let mol = parse("[NH4+]").unwrap();
        assert_eq!(mol.node_count(), 1);
        assert_eq!(mol[node(0)].charge, 1);
        assert_eq!(mol[node(0)].h_count, 4);
    }

    /// A dot separates components: no edge is created across it.
    #[test]
    fn test_disconnected_fragments() {
        let mol = parse("CC.OO").unwrap();
        assert_eq!(mol.node_count(), 4);
        assert_eq!(mol.edge_count(), 2);
    }

    /// Branches, an aromatic ring and explicit double bonds in one molecule.
    #[test]
    fn test_aspirin() {
        let mol = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
        assert_eq!(mol.node_count(), 13);
        assert_eq!(mol.edge_count(), 13);
    }

    /// An open ring label must surface as `UnmatchedRing`, not just any error.
    #[test]
    fn test_unmatched_ring_error() {
        assert!(matches!(
            parse("C1CC"),
            Err(ParseError::UnmatchedRing(_))
        ));
    }

    /// An unclosed branch must surface as `UnmatchedBranch`, not just any error.
    #[test]
    fn test_unmatched_branch_error() {
        assert!(matches!(parse("CC(O"), Err(ParseError::UnmatchedBranch)));
    }
}