Skip to main content

bigsmiles/
parser.rs

1use crate::{
2    ast::{
3        BigSmiles, BigSmilesSegment, BondDescriptor, BondDescriptorKind, StochasticFragment,
4        StochasticObject,
5    },
6    error::ParseError,
7};
8
9/// Parse a BigSMILES string into a [`BigSmiles`] AST.
10///
11/// BigSMILES extends SMILES with stochastic objects `{...}` for polymer notation.
12///
13/// # Examples
14///
15/// ```rust
16/// use bigsmiles::parse;
17///
18/// let pe = parse("{[]CC[]}").unwrap();
19/// let ps = parse("{[]CC(c1ccccc1)[]}").unwrap();
20/// let copo = parse("{[$]CC[$],[$]CC(C)[$]}").unwrap();
21/// let pe_end = parse("CC{[$]CC[$]}CC").unwrap();
22/// ```
23pub fn parse(input: &str) -> Result<BigSmiles, ParseError> {
24    Parser::new(input).parse_bigsmiles()
25}
26
27// ── Connection-atom helper ────────────────────────────────────────────────────
28
/// Returns the index (in the parsed `Molecule` node list) of the atom bonded
/// to the **right** bond descriptor — i.e., the last atom written on the
/// *main chain* (parenthesis depth 0) of the SMILES string.
///
/// In `CC(C)` the main-chain atoms are the first two Cs (indices 0 and 1);
/// the methyl branch C (index 2) is at depth 1, so this returns `1`.
///
/// The left connection atom is always index `0` (the first atom written).
///
/// This is a lightweight byte scan, not a validating parser: atoms are counted
/// in the order they are written, and anything that is not an atom, a
/// parenthesis, or a ring closure is skipped. It never panics, even on
/// malformed input; an unmatched `)` is treated as if the scan were already on
/// the main chain. Returns `0` when the string contains no main-chain atom.
fn right_connection_atom(smiles: &str) -> usize {
    // Returns the position just past any ring-closure tokens (single digit or
    // `%NN`) that start at `i`. Ring closures appear directly after an atom
    // symbol or a closing `]`.
    fn skip_ring_closures(bytes: &[u8], mut i: usize) -> usize {
        while i < bytes.len() {
            let two_digit_closure = bytes[i] == b'%'
                && i + 2 < bytes.len()
                && bytes[i + 1].is_ascii_digit()
                && bytes[i + 2].is_ascii_digit();
            if two_digit_closure {
                i += 3; // %NN
            } else if bytes[i].is_ascii_digit() {
                i += 1;
            } else {
                break;
            }
        }
        i
    }

    let bytes = smiles.as_bytes();
    let mut atom_idx: usize = 0;
    let mut right: usize = 0;
    let mut depth: usize = 0;
    let mut i = 0;

    while i < bytes.len() {
        // `Some(end)` when an atom token starts at `i` and ends just before
        // `end`; `None` for every non-atom byte.
        let atom_end = match bytes[i] {
            b'(' => {
                depth += 1;
                None
            }
            b')' => {
                // Saturate instead of underflowing on an unbalanced `)`: this
                // scan may run on text that has not been validated.
                depth = depth.saturating_sub(1);
                None
            }
            // Bracket atom — runs through the matching `]` (or to the end of
            // the input when the bracket is never closed).
            b'[' => Some(
                bytes[i + 1..]
                    .iter()
                    .position(|&b| b == b']')
                    .map_or(bytes.len(), |offset| i + offset + 2),
            ),
            b'*' => Some(i + 1),
            // Two-letter organic atoms: Cl, Br
            b'C' if bytes.get(i + 1) == Some(&b'l') => Some(i + 2),
            b'B' if bytes.get(i + 1) == Some(&b'r') => Some(i + 2),
            c if c.is_ascii_alphabetic() => Some(i + 1),
            // Bond characters (`-`, `=`, `#`, `$`, `:`, `/`, `\`), the dot,
            // stray digits and anything else — not atoms.
            _ => None,
        };

        match atom_end {
            Some(end) => {
                i = skip_ring_closures(bytes, end);
                if depth == 0 {
                    right = atom_idx;
                }
                atom_idx += 1;
            }
            None => i += 1,
        }
    }

    right
}
120
121// ── Internal parser ──────────────────────────────────────────────────────────
122
123struct Parser<'a> {
124    input: &'a str,
125    pos: usize,
126}
127
128impl<'a> Parser<'a> {
129    fn new(input: &'a str) -> Self {
130        Parser { input, pos: 0 }
131    }
132
133    // ── Primitive helpers ────────────────────────────────────────────────────
134
135    fn peek(&self) -> Option<char> {
136        self.input[self.pos..].chars().next()
137    }
138
139    fn consume(&mut self) -> Option<char> {
140        let c = self.input[self.pos..].chars().next()?;
141        self.pos += c.len_utf8();
142        Some(c)
143    }
144
145    fn expect(&mut self, expected: char) -> Result<(), ParseError> {
146        match self.consume() {
147            Some(c) if c == expected => Ok(()),
148            Some(c) => Err(ParseError::UnexpectedChar(c, self.pos - c.len_utf8())),
149            None => Err(ParseError::UnexpectedEnd(self.pos)),
150        }
151    }
152
153    // ── Bond-descriptor lookahead ────────────────────────────────────────────
154
155    /// Returns true if the byte at `pos` starts a bond descriptor `[$/</>/]]`.
156    ///
157    /// Bond descriptors start with `[` followed by `$`, `<`, `>`, or `]` (empty).
158    /// This is unambiguous because SMILES bracket atoms always start with a letter,
159    /// digit, or `*` after `[`.
160    fn is_bd_at(&self, pos: usize) -> bool {
161        let bytes = self.input.as_bytes();
162        if pos >= bytes.len() || bytes[pos] != b'[' {
163            return false;
164        }
165        let next = pos + 1;
166        next < bytes.len() && matches!(bytes[next], b'$' | b'<' | b'>' | b']')
167    }
168
169    fn is_bd_here(&self) -> bool {
170        self.is_bd_at(self.pos)
171    }
172
173    /// Scans past a bond descriptor at `from_pos` (without mutating `self.pos`).
174    /// Returns the position immediately after the closing `]`, or `None` if no
175    /// valid BD is found at `from_pos`.
176    fn skip_bd_at(&self, from_pos: usize) -> Option<usize> {
177        let bytes = self.input.as_bytes();
178        if !self.is_bd_at(from_pos) {
179            return None;
180        }
181        let mut p = from_pos + 1; // skip `[`
182                                  // skip descriptor type char (`$`, `<`, `>`) — or nothing for `[]`
183        if p < bytes.len() && matches!(bytes[p], b'$' | b'<' | b'>') {
184            p += 1;
185        }
186        // skip optional digits
187        while p < bytes.len() && bytes[p].is_ascii_digit() {
188            p += 1;
189        }
190        // expect `]`
191        if p < bytes.len() && bytes[p] == b']' {
192            Some(p + 1)
193        } else {
194            None
195        }
196    }
197
198    /// Returns the first char after the bond descriptor starting at `self.pos`,
199    /// or `None` if the BD is malformed or at end-of-input.
200    fn peek_after_current_bd(&self) -> Option<char> {
201        let after = self.skip_bd_at(self.pos)?;
202        self.input[after..].chars().next()
203    }
204
205    /// Returns true if the token immediately following the BD at `self.pos`
206    /// is itself another bond descriptor.
207    fn is_bd_after_current_bd(&self) -> bool {
208        match self.skip_bd_at(self.pos) {
209            Some(after) => self.is_bd_at(after),
210            None => false,
211        }
212    }
213
214    // ── SMILES substring extraction ──────────────────────────────────────────
215
216    /// Extracts a SMILES substring from `self.pos` up to (but not including)
217    /// the next BigSMILES structural delimiter when *outside* a stochastic object.
218    /// Delimiters: `{` (start of stochastic object).
219    ///
220    /// Correctly skips over SMILES bracket atoms `[element...]`.
221    fn extract_outer_smiles(&mut self) -> &'a str {
222        let start = self.pos;
223        loop {
224            match self.peek() {
225                None | Some('{') => break,
226                Some('[') => {
227                    // SMILES bracket atom — consume until matching `]`
228                    self.pos += 1;
229                    loop {
230                        match self.consume() {
231                            Some(']') | None => break,
232                            _ => {}
233                        }
234                    }
235                }
236                Some(c) => {
237                    self.pos += c.len_utf8();
238                }
239            }
240        }
241        &self.input[start..self.pos]
242    }
243
244    /// Extracts a SMILES substring from `self.pos` up to (but not including)
245    /// the next BigSMILES delimiter when *inside* a stochastic object.
246    /// Delimiters: `[bd`, `,`, `;`, `}`.
247    ///
248    /// Correctly skips over SMILES bracket atoms `[element...]`.
249    fn extract_inner_smiles(&mut self) -> &'a str {
250        let start = self.pos;
251        loop {
252            match self.peek() {
253                None | Some('}') | Some(',') | Some(';') => break,
254                Some('[') if self.is_bd_here() => break,
255                Some('[') => {
256                    // SMILES bracket atom — consume until matching `]`
257                    self.pos += 1;
258                    loop {
259                        match self.consume() {
260                            Some(']') | None => break,
261                            _ => {}
262                        }
263                    }
264                }
265                Some(c) => {
266                    self.pos += c.len_utf8();
267                }
268            }
269        }
270        &self.input[start..self.pos]
271    }
272
273    // ── Bond descriptor parser ───────────────────────────────────────────────
274
275    fn parse_bond_descriptor(&mut self) -> Result<BondDescriptor, ParseError> {
276        let pos_before = self.pos;
277        self.expect('[')?;
278        let kind = match self.peek() {
279            Some(']') => BondDescriptorKind::NoBond,
280            Some('$') => {
281                self.consume();
282                BondDescriptorKind::NonDirectional
283            }
284            Some('<') => {
285                self.consume();
286                BondDescriptorKind::Head
287            }
288            Some('>') => {
289                self.consume();
290                BondDescriptorKind::Tail
291            }
292            Some(c) => return Err(ParseError::UnexpectedChar(c, self.pos)),
293            None => return Err(ParseError::UnexpectedEnd(self.pos)),
294        };
295        // Optional numeric index
296        let mut index_digits = String::new();
297        while let Some(c) = self.peek() {
298            if c.is_ascii_digit() {
299                index_digits.push(c);
300                self.consume();
301            } else {
302                break;
303            }
304        }
305        let index = if index_digits.is_empty() {
306            None
307        } else {
308            Some(
309                index_digits
310                    .parse::<u32>()
311                    .map_err(|_| ParseError::InvalidBondDescriptor(pos_before))?,
312            )
313        };
314        self.expect(']')
315            .map_err(|_| ParseError::InvalidBondDescriptor(pos_before))?;
316        Ok(BondDescriptor { kind, index })
317    }
318
319    // ── Stochastic fragment parser ───────────────────────────────────────────
320
321    fn parse_stochastic_fragment(&mut self) -> Result<StochasticFragment, ParseError> {
322        let left = self.parse_bond_descriptor()?;
323        let smiles_str = self.extract_inner_smiles();
324        if smiles_str.is_empty() {
325            return Err(ParseError::EmptySmiles);
326        }
327        let smiles_raw = smiles_str.to_owned();
328        let right_atom = right_connection_atom(smiles_str);
329        let molecule = opensmiles::parse(smiles_str)?;
330        let right = self.parse_bond_descriptor()?;
331        Ok(StochasticFragment {
332            left,
333            smiles_raw,
334            molecule,
335            left_atom: 0,
336            right_atom,
337            right,
338        })
339    }
340
341    // ── Stochastic object parser ─────────────────────────────────────────────
342
343    fn parse_stochastic_object(&mut self) -> Result<StochasticObject, ParseError> {
344        self.expect('{')?;
345
346        let mut obj = StochasticObject {
347            left_end: None,
348            repeat_units: Vec::new(),
349            end_groups: Vec::new(),
350            right_end: None,
351        };
352
353        // Handle empty stochastic object `{}`
354        if self.peek() == Some('}') {
355            self.consume();
356            return Ok(obj);
357        }
358
359        // Detect `left_end`: if the current BD is immediately followed by another BD,
360        // the first one is the "outer" terminal connecting to the left SMILES fragment.
361        // Example: `{[>][<]CC[>][<]}` → left_end = `[>]`, first fragment = `[<]CC[>]`
362        if self.is_bd_here() && self.is_bd_after_current_bd() {
363            obj.left_end = Some(self.parse_bond_descriptor()?);
364        }
365
366        // Parse repeat units (and optionally end groups after `;`)
367        let mut in_end_groups = false;
368        loop {
369            // Detect `right_end`: a BD immediately before `}` (no SMILES follows it).
370            // This BD is the outer terminal connecting to the right SMILES fragment.
371            if self.is_bd_here() && self.peek_after_current_bd() == Some('}') {
372                obj.right_end = Some(self.parse_bond_descriptor()?);
373                self.expect('}')?;
374                return Ok(obj);
375            }
376
377            // Parse a full stochastic fragment: [bd]SMILES[bd]
378            let frag = self.parse_stochastic_fragment()?;
379            if in_end_groups {
380                obj.end_groups.push(frag);
381            } else {
382                obj.repeat_units.push(frag);
383            }
384
385            // Decide what to do next
386            match self.peek() {
387                Some(',') => {
388                    self.consume();
389                }
390                Some(';') => {
391                    self.consume();
392                    in_end_groups = true;
393                }
394                Some('}') => {
395                    self.consume();
396                    return Ok(obj);
397                }
398                // A bond descriptor immediately before `}` is the right terminal.
399                Some('[') if self.is_bd_here() && self.peek_after_current_bd() == Some('}') => {
400                    obj.right_end = Some(self.parse_bond_descriptor()?);
401                    self.expect('}')?;
402                    return Ok(obj);
403                }
404                None => return Err(ParseError::UnclosedStochasticObject),
405                Some(c) => return Err(ParseError::UnexpectedChar(c, self.pos)),
406            }
407        }
408    }
409
410    // ── Top-level BigSMILES parser ───────────────────────────────────────────
411
412    fn parse_bigsmiles(mut self) -> Result<BigSmiles, ParseError> {
413        let mut segments = Vec::new();
414
415        while self.pos < self.input.len() {
416            match self.peek() {
417                Some('{') => {
418                    let obj = self.parse_stochastic_object()?;
419                    segments.push(BigSmilesSegment::Stochastic(obj));
420                }
421                Some(_) => {
422                    let smiles_str = self.extract_outer_smiles();
423                    if !smiles_str.is_empty() {
424                        let mol = opensmiles::parse(smiles_str)?;
425                        segments.push(BigSmilesSegment::Smiles(mol));
426                    }
427                }
428                None => break,
429            }
430        }
431
432        Ok(BigSmiles { segments })
433    }
434}