molprint_core/smiles/
parser.rs1use super::lexer::{LexerError, Token};
2use crate::mol::atom::Atom;
3use crate::mol::bond::BondType;
4use crate::mol::graph::{MolGraph, MolGraphExt};
5use crate::ring::assign_ring_info;
6use petgraph::graph::NodeIndex;
7use thiserror::Error;
8
9#[derive(Error, Debug)]
10pub enum ParseError {
11 #[error("lexer error: {0}")]
12 Lexer(#[from] LexerError),
13 #[error("unmatched ring closure {0}")]
14 UnmatchedRing(u8),
15 #[error("unmatched branch parenthesis")]
16 UnmatchedBranch,
17 #[error("empty SMILES string")]
18 Empty,
19 #[error("bond without target atom")]
20 DanglingBond,
21}
22
23pub fn parse(smiles: &str) -> Result<MolGraph, ParseError> {
25 let tokens = super::lexer::tokenize(smiles)?;
26 if tokens.is_empty() {
27 return Err(ParseError::Empty);
28 }
29
30 let mut graph = MolGraph::new_undirected();
31 let mut current: Option<NodeIndex> = None;
32 let mut branch_stack: Vec<NodeIndex> = Vec::new();
33 let mut pending_bond: Option<BondType> = None;
34 let mut ring_map: std::collections::HashMap<u8, (NodeIndex, Option<BondType>)> =
36 std::collections::HashMap::new();
37
38 for token in &tokens {
39 match token {
40 Token::Atom { element, aromatic } => {
41 let mut atom = Atom::new(*element);
42 atom.aromatic = *aromatic;
43 let idx = graph.add_node(atom);
44
45 if let Some(prev) = current {
46 let bond =
47 pending_bond
48 .take()
49 .unwrap_or(if *aromatic && graph[prev].aromatic {
50 BondType::Aromatic
51 } else {
52 BondType::Single
53 });
54 graph.add_edge(prev, idx, bond);
55 }
56 current = Some(idx);
57 }
58
59 Token::BracketAtom {
60 isotope,
61 element,
62 aromatic,
63 h_count,
64 charge,
65 map_num,
66 chirality: _,
67 } => {
68 let mut atom = Atom::new(*element);
69 atom.isotope = *isotope;
70 atom.aromatic = *aromatic;
71 atom.explicit_h = Some(h_count.unwrap_or(0));
74 atom.charge = *charge;
75 atom.map_num = *map_num;
76 let idx = graph.add_node(atom);
77
78 if let Some(prev) = current {
79 let bond =
80 pending_bond
81 .take()
82 .unwrap_or(if *aromatic && graph[prev].aromatic {
83 BondType::Aromatic
84 } else {
85 BondType::Single
86 });
87 graph.add_edge(prev, idx, bond);
88 }
89 current = Some(idx);
90 }
91
92 Token::Bond(bt) => {
93 pending_bond = Some(*bt);
94 }
95
96 Token::BondStereo(_) => {
97 }
100
101 Token::OpenBranch => {
102 if let Some(cur) = current {
103 branch_stack.push(cur);
104 }
105 }
106
107 Token::CloseBranch => {
108 current = branch_stack.pop();
109 if current.is_none() {
110 return Err(ParseError::UnmatchedBranch);
111 }
112 pending_bond = None;
113 }
114
115 Token::RingClosure(ring_id) => {
116 let cur = current.ok_or(ParseError::DanglingBond)?;
117 if let Some((other, ring_bond)) = ring_map.remove(ring_id) {
118 let bond = pending_bond.take().or(ring_bond).unwrap_or(
119 if graph[cur].aromatic && graph[other].aromatic {
120 BondType::Aromatic
121 } else {
122 BondType::Single
123 },
124 );
125 graph.add_edge(cur, other, bond);
126 } else {
127 ring_map.insert(*ring_id, (cur, pending_bond.take()));
128 }
129 }
130
131 Token::Dot => {
132 current = None;
133 pending_bond = None;
134 }
135 }
136 }
137
138 if let Some((&ring_id, _)) = ring_map.iter().next() {
139 return Err(ParseError::UnmatchedRing(ring_id));
140 }
141
142 if !branch_stack.is_empty() {
143 return Err(ParseError::UnmatchedBranch);
144 }
145
146 graph.assign_implicit_hydrogens();
147 assign_ring_info(&mut graph);
148
149 Ok(graph)
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the node index of the `i`-th atom in input order.
    fn node(i: usize) -> NodeIndex {
        NodeIndex::new(i)
    }

    // Single heavy atom: no bonds, four hydrogens.
    #[test]
    fn test_methane() {
        let methane = parse("C").unwrap();
        assert_eq!(methane.node_count(), 1);
        assert_eq!(methane.edge_count(), 0);
        assert_eq!(methane[node(0)].h_count, 4);
    }

    // Linear chain: hydrogen counts checked per atom, in input order.
    #[test]
    fn test_ethanol() {
        let ethanol = parse("CCO").unwrap();
        assert_eq!(ethanol.node_count(), 3);
        assert_eq!(ethanol.edge_count(), 2);

        let expected_h = [3, 2, 1];
        for (i, expected) in expected_h.iter().enumerate() {
            assert_eq!(ethanol[node(i)].h_count, *expected);
        }
    }

    // Aromatic ring: every atom is aromatic and carries one hydrogen.
    #[test]
    fn test_benzene() {
        let benzene = parse("c1ccccc1").unwrap();
        assert_eq!(benzene.node_count(), 6);
        assert_eq!(benzene.edge_count(), 6);

        for ring_atom in (0..6).map(node) {
            assert!(benzene[ring_atom].aromatic);
            assert_eq!(benzene[ring_atom].h_count, 1);
        }
    }

    // Branch with an explicit double bond between the second and third atoms.
    #[test]
    fn test_acetic_acid() {
        let acid = parse("CC(=O)O").unwrap();
        assert_eq!(acid.node_count(), 4);
        assert_eq!(acid.edge_count(), 3);

        let carbonyl = acid
            .find_edge(node(1), node(2))
            .expect("carbonyl carbon and oxygen should be bonded");
        assert_eq!(acid[carbonyl], BondType::Double);
    }

    // Smallest ring: the ring closure adds the third edge.
    #[test]
    fn test_cyclopropane() {
        let ring = parse("C1CC1").unwrap();
        assert_eq!(ring.node_count(), 3);
        assert_eq!(ring.edge_count(), 3);
    }

    // Bracket atom with explicit hydrogens and a positive charge.
    #[test]
    fn test_charged_ammonium() {
        let ammonium = parse("[NH4+]").unwrap();
        assert_eq!(ammonium.node_count(), 1);

        let nitrogen = &ammonium[node(0)];
        assert_eq!(nitrogen.charge, 1);
        assert_eq!(nitrogen.h_count, 4);
    }

    // The dot separates fragments, so no edge joins the two halves.
    #[test]
    fn test_disconnected_fragments() {
        let fragments = parse("CC.OO").unwrap();
        assert_eq!(fragments.node_count(), 4);
        assert_eq!(fragments.edge_count(), 2);
    }

    // Larger molecule combining branches, a ring and aromatic atoms.
    #[test]
    fn test_aspirin() {
        let aspirin = parse("CC(=O)Oc1ccccc1C(=O)O").unwrap();
        assert_eq!(aspirin.node_count(), 13);
        assert_eq!(aspirin.edge_count(), 13);
    }

    // A ring number that is opened but never closed must be rejected.
    #[test]
    fn test_unmatched_ring_error() {
        assert!(matches!(parse("C1CC"), Err(_)));
    }

    // A branch that is opened but never closed must be rejected.
    #[test]
    fn test_unmatched_branch_error() {
        assert!(matches!(parse("CC(O"), Err(_)));
    }
}
234}