Skip to main content

sci_form/smarts/
parser.rs

1//! SMARTS parser: converts a SMARTS string into a pattern graph.
2
3use crate::graph::ChiralType;
4
5/// A parsed SMARTS pattern as a small graph of atom/bond queries.
6#[derive(Debug, Clone)]
7pub struct SmartsPattern {
8    pub atoms: Vec<SmartsAtom>,
9    pub bonds: Vec<SmartsBond>,
10}
11
12#[derive(Debug, Clone)]
13pub struct SmartsAtom {
14    pub query: AtomQuery,
15    pub map_idx: Option<u8>, // :N atom map number
16}
17
18#[derive(Debug, Clone)]
19pub struct SmartsBond {
20    pub from: usize,
21    pub to: usize,
22    pub query: BondQuery,
23}
24
25#[derive(Debug, Clone)]
26pub enum AtomQuery {
27    True,                  // matches any atom (used for *)
28    Element(u8),           // aliphatic element by atomic number
29    AromaticElem(u8),      // aromatic element (c=6, n=7, o=8, s=16, p=15)
30    AnyAromatic,           // 'a'
31    AnyAliphatic,          // 'A'
32    AtomicNum(u8),         // #N
33    NotAtomicNum(u8),      // !#N
34    TotalH(u8),            // HN
35    TotalDegree(u8),       // XN (total connections including implicit H)
36    HeavyDegree(u8),       // DN (connections to non-H)
37    RingBondCount(u8),     // xN
38    InRing,                // R (in any ring)
39    RingSize(u8),          // rN (in ring of exactly size N)
40    RingSizeRange(u8, u8), // r{N-M}
41    RingSizeMin(u8),       // r{N-}
42    FormalCharge(i8),      // +N or -N
43    Hybridization(u8),     // ^N
44    RingCount(u8),         // RN (number of SSSR rings containing this atom)
45    Chiral(ChiralType),
46    Recursive(Box<SmartsPattern>),
47    And(Vec<AtomQuery>),
48    Or(Vec<AtomQuery>),
49    Not(Box<AtomQuery>),
50}
51
/// Boolean expression tree describing which bonds a pattern bond matches.
#[derive(Debug, Clone)]
pub enum BondQuery {
    /// `-`
    Single,
    /// `=`
    Double,
    /// `#`
    Triple,
    /// `:`
    Aromatic,
    /// `~`
    Any,
    /// `@`
    Ring,
    /// `!@`
    NotRing,
    /// No bond symbol written: the default (single or aromatic).
    Implicit,
    /// Conjunction of the contained queries.
    And(Vec<BondQuery>),
    /// Negation of the contained query.
    Not(Box<BondQuery>),
}
65
66/// Parse a SMARTS string into a SmartsPattern.
67pub fn parse_smarts(smarts: &str) -> Result<SmartsPattern, String> {
68    let mut parser = SmartsParser::new(smarts);
69    parser.parse_chain(None)?;
70    Ok(SmartsPattern {
71        atoms: parser.atoms,
72        bonds: parser.bonds,
73    })
74}
75
76struct SmartsParser<'a> {
77    input: &'a [u8],
78    pos: usize,
79    atoms: Vec<SmartsAtom>,
80    bonds: Vec<SmartsBond>,
81    ring_opens: [Option<usize>; 10], // ring closure digits 0-9
82}
83
84impl<'a> SmartsParser<'a> {
85    fn new(s: &'a str) -> Self {
86        Self {
87            input: s.as_bytes(),
88            pos: 0,
89            atoms: Vec::new(),
90            bonds: Vec::new(),
91            ring_opens: [None; 10],
92        }
93    }
94
95    fn peek(&self) -> Option<u8> {
96        self.input.get(self.pos).copied()
97    }
98
99    fn advance(&mut self) -> Option<u8> {
100        let c = self.input.get(self.pos).copied();
101        if c.is_some() {
102            self.pos += 1;
103        }
104        c
105    }
106
107    fn expect(&mut self, ch: u8) -> Result<(), String> {
108        if self.advance() == Some(ch) {
109            Ok(())
110        } else {
111            Err(format!("expected '{}' at pos {}", ch as char, self.pos - 1))
112        }
113    }
114
115    /// Parse a chain of atoms/bonds, optionally connected to `prev_atom`.
116    fn parse_chain(&mut self, prev_atom: Option<usize>) -> Result<(), String> {
117        let mut prev = prev_atom;
118        while self.pos < self.input.len() {
119            let c = match self.peek() {
120                Some(c) => c,
121                None => break,
122            };
123
124            match c {
125                b')' => break, // end of branch
126                b'(' => {
127                    // Branch
128                    self.advance();
129                    self.parse_chain(prev)?;
130                    self.expect(b')')?;
131                }
132                b'[' | b'*' | b'c' | b'n' | b'o' | b's' | b'p' | b'C' | b'N' | b'O' | b'S'
133                | b'P' | b'F' | b'B' | b'I' | b'a' | b'A' | b'H' => {
134                    // Parse optional bond before atom
135                    let bond_q = self.parse_bond_if_present();
136                    let atom_idx = self.parse_atom()?;
137                    if let Some(p) = prev {
138                        self.bonds.push(SmartsBond {
139                            from: p,
140                            to: atom_idx,
141                            query: bond_q.unwrap_or(BondQuery::Implicit),
142                        });
143                    }
144                    prev = Some(atom_idx);
145                }
146                b'-' | b'=' | b'#' | b'~' | b'/' | b'\\' | b':' | b'!' | b'@' => {
147                    // Bond followed by atom
148                    let bond_q = self.parse_bond()?;
149                    let atom_idx = self.parse_atom()?;
150                    if let Some(p) = prev {
151                        self.bonds.push(SmartsBond {
152                            from: p,
153                            to: atom_idx,
154                            query: bond_q,
155                        });
156                    }
157                    prev = Some(atom_idx);
158                }
159                b'0'..=b'9' => {
160                    // Ring closure
161                    let digit = (self.advance().unwrap() - b'0') as usize;
162                    if let Some(open_atom) = self.ring_opens[digit] {
163                        self.bonds.push(SmartsBond {
164                            from: open_atom,
165                            to: prev.unwrap_or(0),
166                            query: BondQuery::Implicit,
167                        });
168                        self.ring_opens[digit] = None;
169                    } else {
170                        self.ring_opens[digit] = prev;
171                    }
172                }
173                _ => break,
174            }
175        }
176        Ok(())
177    }
178
179    /// Try to parse a bond query if one is present (without consuming atom chars).
180    fn parse_bond_if_present(&mut self) -> Option<BondQuery> {
181        match self.peek() {
182            Some(b'-') | Some(b'=') | Some(b'#') | Some(b'~') | Some(b'!') | Some(b'@')
183            | Some(b':') => self.parse_bond().ok(),
184            _ => None,
185        }
186    }
187
188    /// Parse a bond query.
189    fn parse_bond(&mut self) -> Result<BondQuery, String> {
190        let mut parts = Vec::new();
191        loop {
192            match self.peek() {
193                Some(b'-') => {
194                    self.advance();
195                    parts.push(BondQuery::Single);
196                }
197                Some(b'=') => {
198                    self.advance();
199                    parts.push(BondQuery::Double);
200                }
201                Some(b'#') => {
202                    self.advance();
203                    parts.push(BondQuery::Triple);
204                }
205                Some(b'~') => {
206                    self.advance();
207                    parts.push(BondQuery::Any);
208                }
209                Some(b':') => {
210                    self.advance();
211                    parts.push(BondQuery::Aromatic);
212                }
213                Some(b'@') => {
214                    self.advance();
215                    parts.push(BondQuery::Ring);
216                }
217                Some(b'!') => {
218                    self.advance();
219                    if self.peek() == Some(b'@') {
220                        self.advance();
221                        parts.push(BondQuery::NotRing);
222                    } else {
223                        let inner = self.parse_bond()?;
224                        parts.push(BondQuery::Not(Box::new(inner)));
225                    }
226                }
227                Some(b';') => {
228                    self.advance();
229                } // AND separator, continue
230                Some(b',') => {
231                    // OR — not typically used in torsion SMARTS bonds
232                    self.advance();
233                }
234                _ => break,
235            }
236        }
237        match parts.len() {
238            0 => Ok(BondQuery::Implicit),
239            1 => Ok(parts.pop().unwrap()),
240            _ => Ok(BondQuery::And(parts)),
241        }
242    }
243
244    /// Parse an atom (either bracket or organic subset).
245    fn parse_atom(&mut self) -> Result<usize, String> {
246        let atom = match self.peek() {
247            Some(b'[') => self.parse_bracket_atom()?,
248            Some(b'*') => {
249                self.advance();
250                SmartsAtom {
251                    query: AtomQuery::True,
252                    map_idx: None,
253                }
254            }
255            _ => self.parse_organic_atom()?,
256        };
257        let idx = self.atoms.len();
258        self.atoms.push(atom);
259        Ok(idx)
260    }
261
262    /// Parse an organic subset atom (single uppercase letter, possibly followed by lowercase).
263    fn parse_organic_atom(&mut self) -> Result<SmartsAtom, String> {
264        let c = self.advance().ok_or("unexpected end")?;
265        let query = match c {
266            b'C' if self.peek() == Some(b'l') => {
267                self.advance();
268                AtomQuery::Element(17)
269            }
270            b'B' if self.peek() == Some(b'r') => {
271                self.advance();
272                AtomQuery::Element(35)
273            }
274            b'C' => AtomQuery::Element(6),
275            b'N' => AtomQuery::Element(7),
276            b'O' => AtomQuery::Element(8),
277            b'S' => AtomQuery::Element(16),
278            b'P' => AtomQuery::Element(15),
279            b'F' => AtomQuery::Element(9),
280            b'B' => AtomQuery::Element(5),
281            b'I' => AtomQuery::Element(53),
282            b'H' => AtomQuery::Element(1),
283            b'c' => AtomQuery::AromaticElem(6),
284            b'n' => AtomQuery::AromaticElem(7),
285            b'o' => AtomQuery::AromaticElem(8),
286            b's' => AtomQuery::AromaticElem(16),
287            b'p' => AtomQuery::AromaticElem(15),
288            b'a' => AtomQuery::AnyAromatic,
289            b'A' => AtomQuery::AnyAliphatic,
290            _ => {
291                return Err(format!(
292                    "unexpected atom char '{}' at pos {}",
293                    c as char,
294                    self.pos - 1
295                ))
296            }
297        };
298        Ok(SmartsAtom {
299            query,
300            map_idx: None,
301        })
302    }
303
304    /// Parse a bracket atom [...]
305    fn parse_bracket_atom(&mut self) -> Result<SmartsAtom, String> {
306        self.expect(b'[')?;
307        let query = self.parse_atom_spec()?;
308        // Check for map class :N before closing bracket
309        let map_idx = if self.peek() == Some(b':') {
310            self.advance();
311            Some(self.parse_number()? as u8)
312        } else {
313            None
314        };
315        self.expect(b']')?;
316        Ok(SmartsAtom { query, map_idx })
317    }
318
319    /// Top-level atom spec: semicolon-separated low-priority AND groups.
320    /// Precedence (lowest to highest): ; (low AND) < , (OR) < implicit/& (high AND) < ! (NOT)
321    fn parse_atom_spec(&mut self) -> Result<AtomQuery, String> {
322        let mut parts = vec![self.parse_atom_query_or()?];
323        while self.peek() == Some(b';') {
324            self.advance();
325            parts.push(self.parse_atom_query_or()?);
326        }
327        if parts.len() == 1 {
328            Ok(parts.pop().unwrap())
329        } else {
330            Ok(AtomQuery::And(parts))
331        }
332    }
333
334    /// Parse an atom query with OR (comma-separated).
335    fn parse_atom_query_or(&mut self) -> Result<AtomQuery, String> {
336        let mut parts = vec![self.parse_atom_query_and()?];
337        while self.peek() == Some(b',') {
338            self.advance();
339            parts.push(self.parse_atom_query_and()?);
340        }
341        if parts.len() == 1 {
342            Ok(parts.pop().unwrap())
343        } else {
344            Ok(AtomQuery::Or(parts))
345        }
346    }
347
348    /// Parse an atom query with high-priority AND (implicit juxtaposition or &).
349    fn parse_atom_query_and(&mut self) -> Result<AtomQuery, String> {
350        let mut parts = Vec::new();
351        loop {
352            match self.peek() {
353                Some(b']') | Some(b',') | Some(b':') | Some(b';') | None => break,
354                Some(b'&') => {
355                    self.advance();
356                } // explicit high-priority AND
357                _ => parts.push(self.parse_atom_primitive()?),
358            }
359        }
360        match parts.len() {
361            0 => Ok(AtomQuery::True),
362            1 => Ok(parts.pop().unwrap()),
363            _ => Ok(AtomQuery::And(parts)),
364        }
365    }
366
367    /// Parse a single atom primitive.
368    fn parse_atom_primitive(&mut self) -> Result<AtomQuery, String> {
369        let c = self.peek().ok_or("unexpected end in atom spec")?;
370        match c {
371            b'!' => {
372                self.advance();
373                let inner = self.parse_atom_primitive()?;
374                Ok(AtomQuery::Not(Box::new(inner)))
375            }
376            b'#' => {
377                self.advance();
378                let n = self.parse_number()? as u8;
379                Ok(AtomQuery::AtomicNum(n))
380            }
381            b'@' => {
382                self.advance();
383                let chiral = if self.peek() == Some(b'@') {
384                    self.advance();
385                    ChiralType::TetrahedralCW
386                } else {
387                    ChiralType::TetrahedralCCW
388                };
389                Ok(AtomQuery::Chiral(chiral))
390            }
391            b'$' => {
392                // Recursive SMARTS: $(smarts)
393                self.advance();
394                self.expect(b'(')?;
395                let start = self.pos;
396                // Find matching closing paren, handling nested parens
397                let mut depth = 1;
398                while depth > 0 && self.pos < self.input.len() {
399                    match self.input[self.pos] {
400                        b'(' => depth += 1,
401                        b')' => depth -= 1,
402                        _ => {}
403                    }
404                    if depth > 0 {
405                        self.pos += 1;
406                    }
407                }
408                let inner_str = std::str::from_utf8(&self.input[start..self.pos])
409                    .map_err(|_| "invalid utf8 in recursive SMARTS")?;
410                self.expect(b')')?;
411                let inner = parse_smarts(inner_str)?;
412                Ok(AtomQuery::Recursive(Box::new(inner)))
413            }
414            b'X' => {
415                self.advance();
416                let n = self.parse_number()? as u8;
417                Ok(AtomQuery::TotalDegree(n))
418            }
419            b'x' => {
420                self.advance();
421                let n = self.parse_number()? as u8;
422                Ok(AtomQuery::RingBondCount(n))
423            }
424            b'H' => {
425                self.advance();
426                // H followed by digit = hydrogen count; otherwise H0 is "no H"
427                if self.peek().is_some_and(|c| c.is_ascii_digit()) {
428                    let n = self.parse_number()? as u8;
429                    Ok(AtomQuery::TotalH(n))
430                } else {
431                    // H without number means H >= 1 (at least one hydrogen)
432                    Ok(AtomQuery::TotalH(1))
433                }
434            }
435            b'D' => {
436                self.advance();
437                if self.peek().is_some_and(|c| c.is_ascii_digit()) {
438                    let n = self.parse_number()? as u8;
439                    Ok(AtomQuery::HeavyDegree(n))
440                } else {
441                    Ok(AtomQuery::HeavyDegree(1))
442                }
443            }
444            b'R' => {
445                self.advance();
446                if self.peek().is_some_and(|c| c.is_ascii_digit()) {
447                    let n = self.parse_number()? as u8;
448                    Ok(AtomQuery::RingCount(n))
449                } else {
450                    Ok(AtomQuery::InRing)
451                }
452            }
453            b'r' => {
454                self.advance();
455                if self.peek() == Some(b'{') {
456                    // r{N-M} or r{N-}
457                    self.advance(); // '{'
458                    if self.peek() == Some(b'-') {
459                        // r{-M} → ring size ≤ M
460                        self.advance();
461                        let m = self.parse_number()? as u8;
462                        self.expect(b'}')?;
463                        Ok(AtomQuery::RingSizeRange(3, m))
464                    } else {
465                        let n = self.parse_number()? as u8;
466                        if self.peek() == Some(b'-') {
467                            self.advance();
468                            if self.peek() == Some(b'}') {
469                                self.advance();
470                                Ok(AtomQuery::RingSizeMin(n))
471                            } else {
472                                let m = self.parse_number()? as u8;
473                                self.expect(b'}')?;
474                                Ok(AtomQuery::RingSizeRange(n, m))
475                            }
476                        } else {
477                            self.expect(b'}')?;
478                            Ok(AtomQuery::RingSize(n))
479                        }
480                    }
481                } else if self.peek().is_some_and(|c| c.is_ascii_digit()) {
482                    let n = self.parse_number()? as u8;
483                    Ok(AtomQuery::RingSize(n))
484                } else {
485                    Ok(AtomQuery::InRing)
486                }
487            }
488            b'+' => {
489                self.advance();
490                if self.peek().is_some_and(|c| c.is_ascii_digit()) {
491                    let n = self.parse_number()? as i8;
492                    Ok(AtomQuery::FormalCharge(n))
493                } else {
494                    Ok(AtomQuery::FormalCharge(1))
495                }
496            }
497            b'-' => {
498                // Careful: '-' can also be a bond. Inside bracket atom, it's charge.
499                self.advance();
500                if self.peek().is_some_and(|c| c.is_ascii_digit()) {
501                    let n = self.parse_number()? as i8;
502                    Ok(AtomQuery::FormalCharge(-n))
503                } else {
504                    Ok(AtomQuery::FormalCharge(-1))
505                }
506            }
507            b'^' => {
508                self.advance();
509                let n = self.parse_number()? as u8;
510                Ok(AtomQuery::Hybridization(n))
511            }
512            b'*' => {
513                self.advance();
514                Ok(AtomQuery::True)
515            }
516            b'a' => {
517                self.advance();
518                Ok(AtomQuery::AnyAromatic)
519            }
520            b'A' => {
521                self.advance();
522                // Check if followed by more letters (element symbol like Al, As, etc.)
523                // For CSD patterns, 'A' alone means aliphatic
524                Ok(AtomQuery::AnyAliphatic)
525            }
526            b'C' => {
527                self.advance();
528                if self.peek() == Some(b'l') {
529                    self.advance();
530                    Ok(AtomQuery::Element(17))
531                } else {
532                    Ok(AtomQuery::Element(6))
533                }
534            }
535            b'N' => {
536                self.advance();
537                Ok(AtomQuery::Element(7))
538            }
539            b'O' => {
540                self.advance();
541                Ok(AtomQuery::Element(8))
542            }
543            b'S' => {
544                self.advance();
545                Ok(AtomQuery::Element(16))
546            }
547            b'P' => {
548                self.advance();
549                Ok(AtomQuery::Element(15))
550            }
551            b'F' => {
552                self.advance();
553                Ok(AtomQuery::Element(9))
554            }
555            b'B' => {
556                self.advance();
557                if self.peek() == Some(b'r') {
558                    self.advance();
559                    Ok(AtomQuery::Element(35))
560                } else {
561                    Ok(AtomQuery::Element(5))
562                }
563            }
564            b'I' => {
565                self.advance();
566                Ok(AtomQuery::Element(53))
567            }
568            b'c' => {
569                self.advance();
570                Ok(AtomQuery::AromaticElem(6))
571            }
572            b'n' => {
573                self.advance();
574                Ok(AtomQuery::AromaticElem(7))
575            }
576            b'o' => {
577                self.advance();
578                Ok(AtomQuery::AromaticElem(8))
579            }
580            b's' => {
581                self.advance();
582                Ok(AtomQuery::AromaticElem(16))
583            }
584            b'p' => {
585                self.advance();
586                Ok(AtomQuery::AromaticElem(15))
587            }
588            _ => Err(format!(
589                "unexpected '{}' at pos {} in atom spec",
590                c as char, self.pos
591            )),
592        }
593    }
594
595    fn parse_number(&mut self) -> Result<i32, String> {
596        let start = self.pos;
597        while self.pos < self.input.len() && self.input[self.pos].is_ascii_digit() {
598            self.pos += 1;
599        }
600        if self.pos == start {
601            return Err(format!("expected number at pos {}", self.pos));
602        }
603        let s = std::str::from_utf8(&self.input[start..self.pos]).map_err(|_| "invalid utf8")?;
604        s.parse::<i32>().map_err(|e| e.to_string())
605    }
606}
607
#[cfg(test)]
mod tests {
    use super::*;

    /// Linear pattern: four mapped atoms, three bonds, one compound bond.
    #[test]
    fn test_simple_pattern() {
        let p = parse_smarts("[O:1]=[C:2]!@;-[O:3]~[CH0:4]").unwrap();
        assert_eq!(p.atoms.len(), 4);
        assert_eq!(p.bonds.len(), 3);
        assert_eq!(p.atoms[0].map_idx, Some(1));
        assert_eq!(p.atoms[3].map_idx, Some(4));
    }

    /// `$(...)` is parsed into its own nested pattern.
    #[test]
    fn test_recursive_smarts() {
        let p = parse_smarts("[$([CX3]=O):1][NX3H1:2]!@;-[c:3][cH1:4]").unwrap();
        assert_eq!(p.atoms.len(), 4);
        match p.atoms[0].query {
            AtomQuery::Recursive(ref inner) => {
                assert_eq!(inner.atoms.len(), 2); // CX3 and O
            }
            _ => panic!("expected recursive"),
        }
    }

    /// A parenthesised branch adds an atom and a bond without moving the chain.
    #[test]
    fn test_branch() {
        let p = parse_smarts("[a:1][c:2]([a])!@;-[O:3][C:4]").unwrap();
        assert_eq!(p.atoms.len(), 5); // a, c, a_branch, O, C
        assert_eq!(p.bonds.len(), 4);
    }

    /// `@@` and a bare `H` both end up inside the AND query of the atom.
    #[test]
    fn test_chiral_atom_query() {
        let p = parse_smarts("[C@@H:1]").unwrap();
        assert_eq!(p.atoms.len(), 1);
        assert_eq!(p.atoms[0].map_idx, Some(1));
        match p.atoms[0].query {
            AtomQuery::And(ref parts) => {
                assert!(parts
                    .iter()
                    .any(|q| matches!(q, AtomQuery::Chiral(ChiralType::TetrahedralCW))));
                assert!(parts.iter().any(|q| matches!(q, AtomQuery::TotalH(1))));
            }
            _ => panic!("expected AND query for chiral atom"),
        }
    }

    /// `r{9-}` yields an open-ended minimum ring size.
    #[test]
    fn test_ring_size_range() {
        let p = parse_smarts("[c;r{9-}:2]").unwrap();
        assert_eq!(p.atoms.len(), 1);
        assert_eq!(p.atoms[0].map_idx, Some(2));
        match p.atoms[0].query {
            AtomQuery::And(ref parts) => {
                assert!(parts.iter().any(|q| matches!(q, AtomQuery::RingSizeMin(9))));
            }
            // Without this arm the test would pass vacuously on any
            // non-AND query.
            ref other => panic!("expected AND query for ring-size atom, got {:?}", other),
        }
    }

    /// Every pattern in the fixture file must parse; patterns that do not
    /// carry exactly four mapped atoms are only reported, not failed.
    #[test]
    fn test_parse_all_csd_patterns() {
        let data = include_str!("../../tests/fixtures/smarts_patterns.txt");
        let mut ok = 0;
        let mut fail = 0;
        let mut failures = Vec::new();
        // The SMARTS is the first tab-separated column of each line.
        let patterns = data
            .lines()
            .map(|line| line.split('\t').next().unwrap().trim())
            .filter(|smarts| !smarts.is_empty());
        for smarts in patterns {
            match parse_smarts(smarts) {
                Ok(p) => {
                    ok += 1;
                    let mapped = p.atoms.iter().filter(|a| a.map_idx.is_some()).count();
                    if mapped != 4 {
                        failures.push(format!("WARN mapped={}: {}", mapped, smarts));
                    }
                }
                Err(e) => {
                    fail += 1;
                    failures.push(format!("FAIL: {} → {}", smarts, e));
                }
            }
        }
        for f in &failures {
            eprintln!("{}", f);
        }
        eprintln!("\nParsed: {} ok, {} failed out of {}", ok, fail, ok + fail);
        assert_eq!(fail, 0, "{} patterns failed to parse", fail);
    }
}
694}