bigsmiles/parser.rs
1use crate::{
2 ast::{
3 BigSmiles, BigSmilesSegment, BondDescriptor, BondDescriptorKind, StochasticFragment,
4 StochasticObject,
5 },
6 error::ParseError,
7};
8
9/// Parse a BigSMILES string into a [`BigSmiles`] AST.
10///
11/// BigSMILES extends SMILES with stochastic objects `{...}` for polymer notation.
12///
13/// # Examples
14///
15/// ```rust
16/// use bigsmiles::parse;
17///
18/// let pe = parse("{[]CC[]}").unwrap();
19/// let ps = parse("{[]CC(c1ccccc1)[]}").unwrap();
20/// let copo = parse("{[$]CC[$],[$]CC(C)[$]}").unwrap();
21/// let pe_end = parse("CC{[$]CC[$]}CC").unwrap();
22/// ```
23pub fn parse(input: &str) -> Result<BigSmiles, ParseError> {
24 Parser::new(input).parse_bigsmiles()
25}
26
27// ── Connection-atom helper ────────────────────────────────────────────────────
28
/// Returns the written-order index of the atom bonded to the **right** bond
/// descriptor — i.e., the last atom written on the *main chain* (parenthesis
/// depth 0) of the SMILES string.
///
/// Atoms are counted from zero in the order they appear in `smiles`: bracket
/// atoms `[...]`, the wildcard `*`, and bare element symbols (with `Cl` and
/// `Br` recognised as two-letter symbols). Bond characters, dots, parentheses
/// and ring-closure tokens are not atoms.
///
/// In `CC(C)` the main-chain atoms are the first two Cs (indices 0 and 1);
/// the methyl branch C (index 2) is at depth 1, so this returns `1`.
///
/// The left connection atom is always index `0` (the first atom written).
///
/// This is a lightweight scan, not a validator: it never panics, and returns
/// `0` when no main-chain atom is found. An unmatched `)` is ignored (the
/// depth stays at 0) instead of underflowing the depth counter.
fn right_connection_atom(smiles: &str) -> usize {
    // Returns the first position at or after `from` that is not part of a
    // ring-closure token (a single digit, or `%` plus the two bytes after it).
    // Ring closures appear directly after an atom symbol or `]`.
    fn skip_ring_closures(bytes: &[u8], from: usize) -> usize {
        let mut i = from;
        while i < bytes.len() {
            if bytes[i] == b'%' && i + 2 < bytes.len() {
                i += 3; // %NN
            } else if bytes[i].is_ascii_digit() {
                i += 1;
            } else {
                break;
            }
        }
        i
    }

    let bytes = smiles.as_bytes();
    let mut atom_idx: usize = 0;
    let mut right: usize = 0;
    let mut depth: usize = 0;
    let mut i = 0;

    while i < bytes.len() {
        // Byte length of the atom token starting at `i`, or `None` when the
        // byte at `i` is not the start of an atom.
        let atom_len = match bytes[i] {
            b'(' => {
                depth += 1;
                None
            }
            b')' => {
                // Saturate so an unbalanced `)` cannot underflow `depth`.
                depth = depth.saturating_sub(1);
                None
            }
            b'[' => {
                // Bracket atom: runs through the matching `]`, or to the end
                // of the input when the bracket is never closed.
                let close = bytes[i + 1..].iter().position(|&b| b == b']');
                Some(match close {
                    Some(offset) => offset + 2, // `[` + contents + `]`
                    None => bytes.len() - i,
                })
            }
            b'*' => Some(1),
            c if c.is_ascii_alphabetic() => {
                // Two-letter organic atoms: Cl, Br
                let is_two_letter = matches!(
                    (c, bytes.get(i + 1).copied()),
                    (b'C', Some(b'l')) | (b'B', Some(b'r'))
                );
                Some(if is_two_letter { 2 } else { 1 })
            }
            // Bond characters, dots, stray digits and anything else.
            _ => None,
        };

        match atom_len {
            Some(len) => {
                i = skip_ring_closures(bytes, i + len);
                if depth == 0 {
                    right = atom_idx;
                }
                atom_idx += 1;
            }
            None => i += 1,
        }
    }

    right
}
120
121// ── Internal parser ──────────────────────────────────────────────────────────
122
/// Cursor over a borrowed BigSMILES input string; all parsing methods
/// advance `pos` as they consume input.
struct Parser<'a> {
    /// The complete input string being parsed.
    input: &'a str,
    /// Byte offset into `input` of the next unread character.
    pos: usize,
}
127
128impl<'a> Parser<'a> {
129 fn new(input: &'a str) -> Self {
130 Parser { input, pos: 0 }
131 }
132
133 // ── Primitive helpers ────────────────────────────────────────────────────
134
135 fn peek(&self) -> Option<char> {
136 self.input[self.pos..].chars().next()
137 }
138
139 fn consume(&mut self) -> Option<char> {
140 let c = self.input[self.pos..].chars().next()?;
141 self.pos += c.len_utf8();
142 Some(c)
143 }
144
145 fn expect(&mut self, expected: char) -> Result<(), ParseError> {
146 match self.consume() {
147 Some(c) if c == expected => Ok(()),
148 Some(c) => Err(ParseError::UnexpectedChar(c, self.pos - c.len_utf8())),
149 None => Err(ParseError::UnexpectedEnd(self.pos)),
150 }
151 }
152
153 // ── Bond-descriptor lookahead ────────────────────────────────────────────
154
155 /// Returns true if the byte at `pos` starts a bond descriptor `[$/</>/]]`.
156 ///
157 /// Bond descriptors start with `[` followed by `$`, `<`, `>`, or `]` (empty).
158 /// This is unambiguous because SMILES bracket atoms always start with a letter,
159 /// digit, or `*` after `[`.
160 fn is_bd_at(&self, pos: usize) -> bool {
161 let bytes = self.input.as_bytes();
162 if pos >= bytes.len() || bytes[pos] != b'[' {
163 return false;
164 }
165 let next = pos + 1;
166 next < bytes.len() && matches!(bytes[next], b'$' | b'<' | b'>' | b']')
167 }
168
169 fn is_bd_here(&self) -> bool {
170 self.is_bd_at(self.pos)
171 }
172
173 /// Scans past a bond descriptor at `from_pos` (without mutating `self.pos`).
174 /// Returns the position immediately after the closing `]`, or `None` if no
175 /// valid BD is found at `from_pos`.
176 fn skip_bd_at(&self, from_pos: usize) -> Option<usize> {
177 let bytes = self.input.as_bytes();
178 if !self.is_bd_at(from_pos) {
179 return None;
180 }
181 let mut p = from_pos + 1; // skip `[`
182 // skip descriptor type char (`$`, `<`, `>`) — or nothing for `[]`
183 if p < bytes.len() && matches!(bytes[p], b'$' | b'<' | b'>') {
184 p += 1;
185 }
186 // skip optional digits
187 while p < bytes.len() && bytes[p].is_ascii_digit() {
188 p += 1;
189 }
190 // expect `]`
191 if p < bytes.len() && bytes[p] == b']' {
192 Some(p + 1)
193 } else {
194 None
195 }
196 }
197
198 /// Returns the first char after the bond descriptor starting at `self.pos`,
199 /// or `None` if the BD is malformed or at end-of-input.
200 fn peek_after_current_bd(&self) -> Option<char> {
201 let after = self.skip_bd_at(self.pos)?;
202 self.input[after..].chars().next()
203 }
204
205 /// Returns true if the token immediately following the BD at `self.pos`
206 /// is itself another bond descriptor.
207 fn is_bd_after_current_bd(&self) -> bool {
208 match self.skip_bd_at(self.pos) {
209 Some(after) => self.is_bd_at(after),
210 None => false,
211 }
212 }
213
214 // ── SMILES substring extraction ──────────────────────────────────────────
215
216 /// Extracts a SMILES substring from `self.pos` up to (but not including)
217 /// the next BigSMILES structural delimiter when *outside* a stochastic object.
218 /// Delimiters: `{` (start of stochastic object).
219 ///
220 /// Correctly skips over SMILES bracket atoms `[element...]`.
221 fn extract_outer_smiles(&mut self) -> &'a str {
222 let start = self.pos;
223 loop {
224 match self.peek() {
225 None | Some('{') => break,
226 Some('[') => {
227 // SMILES bracket atom — consume until matching `]`
228 self.pos += 1;
229 loop {
230 match self.consume() {
231 Some(']') | None => break,
232 _ => {}
233 }
234 }
235 }
236 Some(c) => {
237 self.pos += c.len_utf8();
238 }
239 }
240 }
241 &self.input[start..self.pos]
242 }
243
244 /// Extracts a SMILES substring from `self.pos` up to (but not including)
245 /// the next BigSMILES delimiter when *inside* a stochastic object.
246 /// Delimiters: `[bd`, `,`, `;`, `}`.
247 ///
248 /// Correctly skips over SMILES bracket atoms `[element...]`.
249 fn extract_inner_smiles(&mut self) -> &'a str {
250 let start = self.pos;
251 loop {
252 match self.peek() {
253 None | Some('}') | Some(',') | Some(';') => break,
254 Some('[') if self.is_bd_here() => break,
255 Some('[') => {
256 // SMILES bracket atom — consume until matching `]`
257 self.pos += 1;
258 loop {
259 match self.consume() {
260 Some(']') | None => break,
261 _ => {}
262 }
263 }
264 }
265 Some(c) => {
266 self.pos += c.len_utf8();
267 }
268 }
269 }
270 &self.input[start..self.pos]
271 }
272
273 // ── Bond descriptor parser ───────────────────────────────────────────────
274
275 fn parse_bond_descriptor(&mut self) -> Result<BondDescriptor, ParseError> {
276 let pos_before = self.pos;
277 self.expect('[')?;
278 let kind = match self.peek() {
279 Some(']') => BondDescriptorKind::NoBond,
280 Some('$') => {
281 self.consume();
282 BondDescriptorKind::NonDirectional
283 }
284 Some('<') => {
285 self.consume();
286 BondDescriptorKind::Head
287 }
288 Some('>') => {
289 self.consume();
290 BondDescriptorKind::Tail
291 }
292 Some(c) => return Err(ParseError::UnexpectedChar(c, self.pos)),
293 None => return Err(ParseError::UnexpectedEnd(self.pos)),
294 };
295 // Optional numeric index
296 let mut index_digits = String::new();
297 while let Some(c) = self.peek() {
298 if c.is_ascii_digit() {
299 index_digits.push(c);
300 self.consume();
301 } else {
302 break;
303 }
304 }
305 let index = if index_digits.is_empty() {
306 None
307 } else {
308 Some(
309 index_digits
310 .parse::<u32>()
311 .map_err(|_| ParseError::InvalidBondDescriptor(pos_before))?,
312 )
313 };
314 self.expect(']')
315 .map_err(|_| ParseError::InvalidBondDescriptor(pos_before))?;
316 Ok(BondDescriptor { kind, index })
317 }
318
319 // ── Stochastic fragment parser ───────────────────────────────────────────
320
321 fn parse_stochastic_fragment(&mut self) -> Result<StochasticFragment, ParseError> {
322 let left = self.parse_bond_descriptor()?;
323 let smiles_str = self.extract_inner_smiles();
324 if smiles_str.is_empty() {
325 return Err(ParseError::EmptySmiles);
326 }
327 let smiles_raw = smiles_str.to_owned();
328 let right_atom = right_connection_atom(smiles_str);
329 let molecule = opensmiles::parse(smiles_str)?;
330 let right = self.parse_bond_descriptor()?;
331 Ok(StochasticFragment {
332 left,
333 smiles_raw,
334 molecule,
335 left_atom: 0,
336 right_atom,
337 right,
338 })
339 }
340
341 // ── Stochastic object parser ─────────────────────────────────────────────
342
343 fn parse_stochastic_object(&mut self) -> Result<StochasticObject, ParseError> {
344 self.expect('{')?;
345
346 let mut obj = StochasticObject {
347 left_end: None,
348 repeat_units: Vec::new(),
349 end_groups: Vec::new(),
350 right_end: None,
351 };
352
353 // Handle empty stochastic object `{}`
354 if self.peek() == Some('}') {
355 self.consume();
356 return Ok(obj);
357 }
358
359 // Detect `left_end`: if the current BD is immediately followed by another BD,
360 // the first one is the "outer" terminal connecting to the left SMILES fragment.
361 // Example: `{[>][<]CC[>][<]}` → left_end = `[>]`, first fragment = `[<]CC[>]`
362 if self.is_bd_here() && self.is_bd_after_current_bd() {
363 obj.left_end = Some(self.parse_bond_descriptor()?);
364 }
365
366 // Parse repeat units (and optionally end groups after `;`)
367 let mut in_end_groups = false;
368 loop {
369 // Detect `right_end`: a BD immediately before `}` (no SMILES follows it).
370 // This BD is the outer terminal connecting to the right SMILES fragment.
371 if self.is_bd_here() && self.peek_after_current_bd() == Some('}') {
372 obj.right_end = Some(self.parse_bond_descriptor()?);
373 self.expect('}')?;
374 return Ok(obj);
375 }
376
377 // Parse a full stochastic fragment: [bd]SMILES[bd]
378 let frag = self.parse_stochastic_fragment()?;
379 if in_end_groups {
380 obj.end_groups.push(frag);
381 } else {
382 obj.repeat_units.push(frag);
383 }
384
385 // Decide what to do next
386 match self.peek() {
387 Some(',') => {
388 self.consume();
389 }
390 Some(';') => {
391 self.consume();
392 in_end_groups = true;
393 }
394 Some('}') => {
395 self.consume();
396 return Ok(obj);
397 }
398 // A bond descriptor immediately before `}` is the right terminal.
399 Some('[') if self.is_bd_here() && self.peek_after_current_bd() == Some('}') => {
400 obj.right_end = Some(self.parse_bond_descriptor()?);
401 self.expect('}')?;
402 return Ok(obj);
403 }
404 None => return Err(ParseError::UnclosedStochasticObject),
405 Some(c) => return Err(ParseError::UnexpectedChar(c, self.pos)),
406 }
407 }
408 }
409
410 // ── Top-level BigSMILES parser ───────────────────────────────────────────
411
412 fn parse_bigsmiles(mut self) -> Result<BigSmiles, ParseError> {
413 let mut segments = Vec::new();
414
415 while self.pos < self.input.len() {
416 match self.peek() {
417 Some('{') => {
418 let obj = self.parse_stochastic_object()?;
419 segments.push(BigSmilesSegment::Stochastic(obj));
420 }
421 Some(_) => {
422 let smiles_str = self.extract_outer_smiles();
423 if !smiles_str.is_empty() {
424 let mol = opensmiles::parse(smiles_str)?;
425 segments.push(BigSmilesSegment::Smiles(mol));
426 }
427 }
428 None => break,
429 }
430 }
431
432 Ok(BigSmiles { segments })
433 }
434}