1use crate::graph::ChiralType;
4
5#[derive(Debug, Clone)]
7pub struct SmartsPattern {
8 pub atoms: Vec<SmartsAtom>,
9 pub bonds: Vec<SmartsBond>,
10}
11
12#[derive(Debug, Clone)]
13pub struct SmartsAtom {
14 pub query: AtomQuery,
15 pub map_idx: Option<u8>, }
17
18#[derive(Debug, Clone)]
19pub struct SmartsBond {
20 pub from: usize,
21 pub to: usize,
22 pub query: BondQuery,
23}
24
25#[derive(Debug, Clone)]
26pub enum AtomQuery {
27 True, Element(u8), AromaticElem(u8), AnyAromatic, AnyAliphatic, AtomicNum(u8), NotAtomicNum(u8), TotalH(u8), TotalDegree(u8), HeavyDegree(u8), RingBondCount(u8), InRing, RingSize(u8), RingSizeRange(u8, u8), RingSizeMin(u8), FormalCharge(i8), Hybridization(u8), RingCount(u8), Chiral(ChiralType),
46 Recursive(Box<SmartsPattern>),
47 And(Vec<AtomQuery>),
48 Or(Vec<AtomQuery>),
49 Not(Box<AtomQuery>),
50}
51
/// Predicate for a single pattern bond.
///
/// The per-variant notes state which SMARTS bond symbol the parser in this
/// file turns into that variant.
#[derive(Debug, Clone)]
pub enum BondQuery {
    /// `-`
    Single,
    /// `=`
    Double,
    /// `#`
    Triple,
    /// `:`
    Aromatic,
    /// `~`
    Any,
    /// `@`
    Ring,
    /// `!@`
    NotRing,
    /// No bond symbol was written between two atoms (also used for ring
    /// closures written without a bond symbol).
    Implicit,
    /// Several bond primitives that were written together.
    And(Vec<BondQuery>),
    /// Negated bond query (`!`).
    Not(Box<BondQuery>),
}
65
66pub fn parse_smarts(smarts: &str) -> Result<SmartsPattern, String> {
68 let mut parser = SmartsParser::new(smarts);
69 parser.parse_chain(None)?;
70 Ok(SmartsPattern {
71 atoms: parser.atoms,
72 bonds: parser.bonds,
73 })
74}
75
76struct SmartsParser<'a> {
77 input: &'a [u8],
78 pos: usize,
79 atoms: Vec<SmartsAtom>,
80 bonds: Vec<SmartsBond>,
81 ring_opens: [Option<usize>; 10], }
83
84impl<'a> SmartsParser<'a> {
85 fn new(s: &'a str) -> Self {
86 Self {
87 input: s.as_bytes(),
88 pos: 0,
89 atoms: Vec::new(),
90 bonds: Vec::new(),
91 ring_opens: [None; 10],
92 }
93 }
94
95 fn peek(&self) -> Option<u8> {
96 self.input.get(self.pos).copied()
97 }
98
99 fn advance(&mut self) -> Option<u8> {
100 let c = self.input.get(self.pos).copied();
101 if c.is_some() {
102 self.pos += 1;
103 }
104 c
105 }
106
107 fn expect(&mut self, ch: u8) -> Result<(), String> {
108 if self.advance() == Some(ch) {
109 Ok(())
110 } else {
111 Err(format!("expected '{}' at pos {}", ch as char, self.pos - 1))
112 }
113 }
114
115 fn parse_chain(&mut self, prev_atom: Option<usize>) -> Result<(), String> {
117 let mut prev = prev_atom;
118 while self.pos < self.input.len() {
119 let c = match self.peek() {
120 Some(c) => c,
121 None => break,
122 };
123
124 match c {
125 b')' => break, b'(' => {
127 self.advance();
129 self.parse_chain(prev)?;
130 self.expect(b')')?;
131 }
132 b'[' | b'*' | b'c' | b'n' | b'o' | b's' | b'p' | b'C' | b'N' | b'O' | b'S'
133 | b'P' | b'F' | b'B' | b'I' | b'a' | b'A' | b'H' => {
134 let bond_q = self.parse_bond_if_present();
136 let atom_idx = self.parse_atom()?;
137 if let Some(p) = prev {
138 self.bonds.push(SmartsBond {
139 from: p,
140 to: atom_idx,
141 query: bond_q.unwrap_or(BondQuery::Implicit),
142 });
143 }
144 prev = Some(atom_idx);
145 }
146 b'-' | b'=' | b'#' | b'~' | b'/' | b'\\' | b':' | b'!' | b'@' => {
147 let bond_q = self.parse_bond()?;
149 let atom_idx = self.parse_atom()?;
150 if let Some(p) = prev {
151 self.bonds.push(SmartsBond {
152 from: p,
153 to: atom_idx,
154 query: bond_q,
155 });
156 }
157 prev = Some(atom_idx);
158 }
159 b'0'..=b'9' => {
160 let digit = (self.advance().unwrap() - b'0') as usize;
162 if let Some(open_atom) = self.ring_opens[digit] {
163 self.bonds.push(SmartsBond {
164 from: open_atom,
165 to: prev.unwrap_or(0),
166 query: BondQuery::Implicit,
167 });
168 self.ring_opens[digit] = None;
169 } else {
170 self.ring_opens[digit] = prev;
171 }
172 }
173 _ => break,
174 }
175 }
176 Ok(())
177 }
178
179 fn parse_bond_if_present(&mut self) -> Option<BondQuery> {
181 match self.peek() {
182 Some(b'-') | Some(b'=') | Some(b'#') | Some(b'~') | Some(b'!') | Some(b'@')
183 | Some(b':') => self.parse_bond().ok(),
184 _ => None,
185 }
186 }
187
188 fn parse_bond(&mut self) -> Result<BondQuery, String> {
190 let mut parts = Vec::new();
191 loop {
192 match self.peek() {
193 Some(b'-') => {
194 self.advance();
195 parts.push(BondQuery::Single);
196 }
197 Some(b'=') => {
198 self.advance();
199 parts.push(BondQuery::Double);
200 }
201 Some(b'#') => {
202 self.advance();
203 parts.push(BondQuery::Triple);
204 }
205 Some(b'~') => {
206 self.advance();
207 parts.push(BondQuery::Any);
208 }
209 Some(b':') => {
210 self.advance();
211 parts.push(BondQuery::Aromatic);
212 }
213 Some(b'@') => {
214 self.advance();
215 parts.push(BondQuery::Ring);
216 }
217 Some(b'!') => {
218 self.advance();
219 if self.peek() == Some(b'@') {
220 self.advance();
221 parts.push(BondQuery::NotRing);
222 } else {
223 let inner = self.parse_bond()?;
224 parts.push(BondQuery::Not(Box::new(inner)));
225 }
226 }
227 Some(b';') => {
228 self.advance();
229 } Some(b',') => {
231 self.advance();
233 }
234 _ => break,
235 }
236 }
237 match parts.len() {
238 0 => Ok(BondQuery::Implicit),
239 1 => Ok(parts.pop().unwrap()),
240 _ => Ok(BondQuery::And(parts)),
241 }
242 }
243
244 fn parse_atom(&mut self) -> Result<usize, String> {
246 let atom = match self.peek() {
247 Some(b'[') => self.parse_bracket_atom()?,
248 Some(b'*') => {
249 self.advance();
250 SmartsAtom {
251 query: AtomQuery::True,
252 map_idx: None,
253 }
254 }
255 _ => self.parse_organic_atom()?,
256 };
257 let idx = self.atoms.len();
258 self.atoms.push(atom);
259 Ok(idx)
260 }
261
262 fn parse_organic_atom(&mut self) -> Result<SmartsAtom, String> {
264 let c = self.advance().ok_or("unexpected end")?;
265 let query = match c {
266 b'C' if self.peek() == Some(b'l') => {
267 self.advance();
268 AtomQuery::Element(17)
269 }
270 b'B' if self.peek() == Some(b'r') => {
271 self.advance();
272 AtomQuery::Element(35)
273 }
274 b'C' => AtomQuery::Element(6),
275 b'N' => AtomQuery::Element(7),
276 b'O' => AtomQuery::Element(8),
277 b'S' => AtomQuery::Element(16),
278 b'P' => AtomQuery::Element(15),
279 b'F' => AtomQuery::Element(9),
280 b'B' => AtomQuery::Element(5),
281 b'I' => AtomQuery::Element(53),
282 b'H' => AtomQuery::Element(1),
283 b'c' => AtomQuery::AromaticElem(6),
284 b'n' => AtomQuery::AromaticElem(7),
285 b'o' => AtomQuery::AromaticElem(8),
286 b's' => AtomQuery::AromaticElem(16),
287 b'p' => AtomQuery::AromaticElem(15),
288 b'a' => AtomQuery::AnyAromatic,
289 b'A' => AtomQuery::AnyAliphatic,
290 _ => {
291 return Err(format!(
292 "unexpected atom char '{}' at pos {}",
293 c as char,
294 self.pos - 1
295 ))
296 }
297 };
298 Ok(SmartsAtom {
299 query,
300 map_idx: None,
301 })
302 }
303
304 fn parse_bracket_atom(&mut self) -> Result<SmartsAtom, String> {
306 self.expect(b'[')?;
307 let query = self.parse_atom_spec()?;
308 let map_idx = if self.peek() == Some(b':') {
310 self.advance();
311 Some(self.parse_number()? as u8)
312 } else {
313 None
314 };
315 self.expect(b']')?;
316 Ok(SmartsAtom { query, map_idx })
317 }
318
319 fn parse_atom_spec(&mut self) -> Result<AtomQuery, String> {
322 let mut parts = vec![self.parse_atom_query_or()?];
323 while self.peek() == Some(b';') {
324 self.advance();
325 parts.push(self.parse_atom_query_or()?);
326 }
327 if parts.len() == 1 {
328 Ok(parts.pop().unwrap())
329 } else {
330 Ok(AtomQuery::And(parts))
331 }
332 }
333
334 fn parse_atom_query_or(&mut self) -> Result<AtomQuery, String> {
336 let mut parts = vec![self.parse_atom_query_and()?];
337 while self.peek() == Some(b',') {
338 self.advance();
339 parts.push(self.parse_atom_query_and()?);
340 }
341 if parts.len() == 1 {
342 Ok(parts.pop().unwrap())
343 } else {
344 Ok(AtomQuery::Or(parts))
345 }
346 }
347
348 fn parse_atom_query_and(&mut self) -> Result<AtomQuery, String> {
350 let mut parts = Vec::new();
351 loop {
352 match self.peek() {
353 Some(b']') | Some(b',') | Some(b':') | Some(b';') | None => break,
354 Some(b'&') => {
355 self.advance();
356 } _ => parts.push(self.parse_atom_primitive()?),
358 }
359 }
360 match parts.len() {
361 0 => Ok(AtomQuery::True),
362 1 => Ok(parts.pop().unwrap()),
363 _ => Ok(AtomQuery::And(parts)),
364 }
365 }
366
367 fn parse_atom_primitive(&mut self) -> Result<AtomQuery, String> {
369 let c = self.peek().ok_or("unexpected end in atom spec")?;
370 match c {
371 b'!' => {
372 self.advance();
373 let inner = self.parse_atom_primitive()?;
374 Ok(AtomQuery::Not(Box::new(inner)))
375 }
376 b'#' => {
377 self.advance();
378 let n = self.parse_number()? as u8;
379 Ok(AtomQuery::AtomicNum(n))
380 }
381 b'@' => {
382 self.advance();
383 let chiral = if self.peek() == Some(b'@') {
384 self.advance();
385 ChiralType::TetrahedralCW
386 } else {
387 ChiralType::TetrahedralCCW
388 };
389 Ok(AtomQuery::Chiral(chiral))
390 }
391 b'$' => {
392 self.advance();
394 self.expect(b'(')?;
395 let start = self.pos;
396 let mut depth = 1;
398 while depth > 0 && self.pos < self.input.len() {
399 match self.input[self.pos] {
400 b'(' => depth += 1,
401 b')' => depth -= 1,
402 _ => {}
403 }
404 if depth > 0 {
405 self.pos += 1;
406 }
407 }
408 let inner_str = std::str::from_utf8(&self.input[start..self.pos])
409 .map_err(|_| "invalid utf8 in recursive SMARTS")?;
410 self.expect(b')')?;
411 let inner = parse_smarts(inner_str)?;
412 Ok(AtomQuery::Recursive(Box::new(inner)))
413 }
414 b'X' => {
415 self.advance();
416 let n = self.parse_number()? as u8;
417 Ok(AtomQuery::TotalDegree(n))
418 }
419 b'x' => {
420 self.advance();
421 let n = self.parse_number()? as u8;
422 Ok(AtomQuery::RingBondCount(n))
423 }
424 b'H' => {
425 self.advance();
426 if self.peek().is_some_and(|c| c.is_ascii_digit()) {
428 let n = self.parse_number()? as u8;
429 Ok(AtomQuery::TotalH(n))
430 } else {
431 Ok(AtomQuery::TotalH(1))
433 }
434 }
435 b'D' => {
436 self.advance();
437 if self.peek().is_some_and(|c| c.is_ascii_digit()) {
438 let n = self.parse_number()? as u8;
439 Ok(AtomQuery::HeavyDegree(n))
440 } else {
441 Ok(AtomQuery::HeavyDegree(1))
442 }
443 }
444 b'R' => {
445 self.advance();
446 if self.peek().is_some_and(|c| c.is_ascii_digit()) {
447 let n = self.parse_number()? as u8;
448 Ok(AtomQuery::RingCount(n))
449 } else {
450 Ok(AtomQuery::InRing)
451 }
452 }
453 b'r' => {
454 self.advance();
455 if self.peek() == Some(b'{') {
456 self.advance(); if self.peek() == Some(b'-') {
459 self.advance();
461 let m = self.parse_number()? as u8;
462 self.expect(b'}')?;
463 Ok(AtomQuery::RingSizeRange(3, m))
464 } else {
465 let n = self.parse_number()? as u8;
466 if self.peek() == Some(b'-') {
467 self.advance();
468 if self.peek() == Some(b'}') {
469 self.advance();
470 Ok(AtomQuery::RingSizeMin(n))
471 } else {
472 let m = self.parse_number()? as u8;
473 self.expect(b'}')?;
474 Ok(AtomQuery::RingSizeRange(n, m))
475 }
476 } else {
477 self.expect(b'}')?;
478 Ok(AtomQuery::RingSize(n))
479 }
480 }
481 } else if self.peek().is_some_and(|c| c.is_ascii_digit()) {
482 let n = self.parse_number()? as u8;
483 Ok(AtomQuery::RingSize(n))
484 } else {
485 Ok(AtomQuery::InRing)
486 }
487 }
488 b'+' => {
489 self.advance();
490 if self.peek().is_some_and(|c| c.is_ascii_digit()) {
491 let n = self.parse_number()? as i8;
492 Ok(AtomQuery::FormalCharge(n))
493 } else {
494 Ok(AtomQuery::FormalCharge(1))
495 }
496 }
497 b'-' => {
498 self.advance();
500 if self.peek().is_some_and(|c| c.is_ascii_digit()) {
501 let n = self.parse_number()? as i8;
502 Ok(AtomQuery::FormalCharge(-n))
503 } else {
504 Ok(AtomQuery::FormalCharge(-1))
505 }
506 }
507 b'^' => {
508 self.advance();
509 let n = self.parse_number()? as u8;
510 Ok(AtomQuery::Hybridization(n))
511 }
512 b'*' => {
513 self.advance();
514 Ok(AtomQuery::True)
515 }
516 b'a' => {
517 self.advance();
518 Ok(AtomQuery::AnyAromatic)
519 }
520 b'A' => {
521 self.advance();
522 Ok(AtomQuery::AnyAliphatic)
525 }
526 b'C' => {
527 self.advance();
528 if self.peek() == Some(b'l') {
529 self.advance();
530 Ok(AtomQuery::Element(17))
531 } else {
532 Ok(AtomQuery::Element(6))
533 }
534 }
535 b'N' => {
536 self.advance();
537 Ok(AtomQuery::Element(7))
538 }
539 b'O' => {
540 self.advance();
541 Ok(AtomQuery::Element(8))
542 }
543 b'S' => {
544 self.advance();
545 Ok(AtomQuery::Element(16))
546 }
547 b'P' => {
548 self.advance();
549 Ok(AtomQuery::Element(15))
550 }
551 b'F' => {
552 self.advance();
553 Ok(AtomQuery::Element(9))
554 }
555 b'B' => {
556 self.advance();
557 if self.peek() == Some(b'r') {
558 self.advance();
559 Ok(AtomQuery::Element(35))
560 } else {
561 Ok(AtomQuery::Element(5))
562 }
563 }
564 b'I' => {
565 self.advance();
566 Ok(AtomQuery::Element(53))
567 }
568 b'c' => {
569 self.advance();
570 Ok(AtomQuery::AromaticElem(6))
571 }
572 b'n' => {
573 self.advance();
574 Ok(AtomQuery::AromaticElem(7))
575 }
576 b'o' => {
577 self.advance();
578 Ok(AtomQuery::AromaticElem(8))
579 }
580 b's' => {
581 self.advance();
582 Ok(AtomQuery::AromaticElem(16))
583 }
584 b'p' => {
585 self.advance();
586 Ok(AtomQuery::AromaticElem(15))
587 }
588 _ => Err(format!(
589 "unexpected '{}' at pos {} in atom spec",
590 c as char, self.pos
591 )),
592 }
593 }
594
595 fn parse_number(&mut self) -> Result<i32, String> {
596 let start = self.pos;
597 while self.pos < self.input.len() && self.input[self.pos].is_ascii_digit() {
598 self.pos += 1;
599 }
600 if self.pos == start {
601 return Err(format!("expected number at pos {}", self.pos));
602 }
603 let s = std::str::from_utf8(&self.input[start..self.pos]).map_err(|_| "invalid utf8")?;
604 s.parse::<i32>().map_err(|e| e.to_string())
605 }
606}
607
#[cfg(test)]
mod tests {
    use super::*;

    /// A linear four-atom pattern with mapped bracket atoms and a compound
    /// bond expression.
    #[test]
    fn test_simple_pattern() {
        let p = parse_smarts("[O:1]=[C:2]!@;-[O:3]~[CH0:4]").unwrap();
        assert_eq!(p.atoms.len(), 4);
        assert_eq!(p.bonds.len(), 3);
        assert_eq!(p.atoms[0].map_idx, Some(1));
        assert_eq!(p.atoms[3].map_idx, Some(4));
    }

    /// `$(...)` must produce a nested pattern holding the inner atoms.
    #[test]
    fn test_recursive_smarts() {
        let p = parse_smarts("[$([CX3]=O):1][NX3H1:2]!@;-[c:3][cH1:4]").unwrap();
        assert_eq!(p.atoms.len(), 4);
        if let AtomQuery::Recursive(ref inner) = p.atoms[0].query {
            assert_eq!(inner.atoms.len(), 2);
        } else {
            panic!("expected recursive");
        }
    }

    /// A parenthesised branch adds its atom and bond without moving the
    /// attachment point of the main chain.
    #[test]
    fn test_branch() {
        let p = parse_smarts("[a:1][c:2]([a])!@;-[O:3][C:4]").unwrap();
        assert_eq!(p.atoms.len(), 5);
        assert_eq!(p.bonds.len(), 4);
    }

    /// `@@` followed by a bare `H` yields a clockwise chirality query ANDed
    /// with a hydrogen count of one.
    #[test]
    fn test_chiral_atom_query() {
        let p = parse_smarts("[C@@H:1]").unwrap();
        assert_eq!(p.atoms.len(), 1);
        assert_eq!(p.atoms[0].map_idx, Some(1));
        if let AtomQuery::And(ref parts) = p.atoms[0].query {
            assert!(parts
                .iter()
                .any(|q| matches!(q, AtomQuery::Chiral(ChiralType::TetrahedralCW))));
            assert!(parts.iter().any(|q| matches!(q, AtomQuery::TotalH(1))));
        } else {
            panic!("expected AND query for chiral atom");
        }
    }

    /// `r{9-}` is an open-ended ring-size query.
    #[test]
    fn test_ring_size_range() {
        let p = parse_smarts("[c;r{9-}:2]").unwrap();
        assert_eq!(p.atoms.len(), 1);
        if let AtomQuery::And(ref parts) = p.atoms[0].query {
            assert!(parts.iter().any(|q| matches!(q, AtomQuery::RingSizeMin(9))));
        } else {
            // Without this branch the test would pass vacuously whenever the
            // query had any other shape.
            panic!("expected AND query for ring-size atom");
        }
    }

    /// Every pattern in the fixture file (first tab-separated column of each
    /// non-empty line) must parse. Patterns that do not carry exactly four
    /// mapped atoms are only reported, not failed.
    #[test]
    fn test_parse_all_csd_patterns() {
        let data = include_str!("../../tests/fixtures/smarts_patterns.txt");
        let mut ok = 0;
        let mut fail = 0;
        let mut failures = Vec::new();
        for line in data.lines() {
            let smarts = line.split('\t').next().unwrap().trim();
            if smarts.is_empty() {
                continue;
            }
            match parse_smarts(smarts) {
                Ok(p) => {
                    ok += 1;
                    let mapped: Vec<_> = p.atoms.iter().filter(|a| a.map_idx.is_some()).collect();
                    if mapped.len() != 4 {
                        failures.push(format!("WARN mapped={}: {}", mapped.len(), smarts));
                    }
                }
                Err(e) => {
                    fail += 1;
                    failures.push(format!("FAIL: {} → {}", smarts, e));
                }
            }
        }
        for f in &failures {
            eprintln!("{}", f);
        }
        eprintln!("\nParsed: {} ok, {} failed out of {}", ok, fail, ok + fail);
        assert_eq!(fail, 0, "{} patterns failed to parse", fail);
    }
}