Skip to main content

sci_form/
smiles.rs

1use crate::graph::{Atom, Bond, BondOrder, BondStereo, ChiralType, Hybridization, Molecule};
2use nalgebra::Vector3;
3use petgraph::graph::NodeIndex;
4use std::collections::HashMap;
5
6/// Recursive-descent parser that converts a SMILES string into a [`Molecule`] graph.
7///
8/// Handles:
9/// - Aliphatic & aromatic atoms, brackets, isotopes, formal charges
10/// - Ring closure numbers (single and double-digit)
11/// - Branching via `(` / `)`
12/// - Stereo: `@`/`@@` for chiral centers, `/`/`\` for double-bond E/Z
13/// - Implicit hydrogen addition and hybridization assignment
14pub struct SmilesParser<'a> {
15    input: &'a [u8],
16    pos: usize,
17    rings: HashMap<u32, (NodeIndex, BondOrder, BondStereo)>,
18    mol: &'a mut Molecule,
19}
20
21impl<'a> SmilesParser<'a> {
22    /// Create a new parser for `smiles`, writing atoms and bonds into `mol`.
23    pub fn new(smiles: &'a str, mol: &'a mut Molecule) -> Self {
24        SmilesParser {
25            input: smiles.as_bytes(),
26            pos: 0,
27            rings: HashMap::new(),
28            mol,
29        }
30    }
31
32    /// Parse the SMILES string, populating the molecule.
33    ///
34    /// Returns `Err` with a human-readable message on invalid input.
35    pub fn parse(&mut self) -> Result<(), String> {
36        self.parse_chain(None)?;
37        // After parsing, add implicit hydrogens
38        self.add_implicit_hydrogens();
39        Ok(())
40    }
41
42    fn peek(&self) -> Option<u8> {
43        self.input.get(self.pos).copied()
44    }
45
46    fn advance(&mut self) {
47        self.pos += 1;
48    }
49
50    fn parse_chain(&mut self, mut prev_node: Option<NodeIndex>) -> Result<(), String> {
51        let mut prev_bond_order = BondOrder::Single;
52        let mut prev_bond_stereo = BondStereo::None;
53
54        let get_bond_order =
55            |m: &Molecule, prev: NodeIndex, curr: NodeIndex, order: BondOrder| -> BondOrder {
56                if order == BondOrder::Single
57                    && m.graph[prev].hybridization == Hybridization::SP2
58                    && m.graph[curr].hybridization == Hybridization::SP2
59                {
60                    BondOrder::Aromatic
61                } else {
62                    order
63                }
64            };
65
66        while self.pos < self.input.len() {
67            let c = self.peek().unwrap();
68            match c {
69                b'(' => {
70                    self.advance();
71                    self.parse_chain(prev_node)?;
72                    if self.peek() == Some(b')') {
73                        self.advance();
74                    } else {
75                        return Err("Expected ')'".to_string());
76                    }
77                }
78                b')' => {
79                    break;
80                }
81                b'-' => {
82                    self.advance();
83                    prev_bond_order = BondOrder::Single;
84                }
85                b'=' => {
86                    self.advance();
87                    prev_bond_order = BondOrder::Double;
88                }
89                b'#' => {
90                    self.advance();
91                    prev_bond_order = BondOrder::Triple;
92                }
93                b':' => {
94                    self.advance();
95                    prev_bond_order = BondOrder::Aromatic;
96                }
97                b'.' => {
98                    // Disconnected fragment separator — no bond between previous and next atom
99                    self.advance();
100                    prev_node = None;
101                    prev_bond_order = BondOrder::Single;
102                    prev_bond_stereo = BondStereo::None;
103                }
104                b'/' => {
105                    self.advance();
106                    prev_bond_order = BondOrder::Single;
107                    prev_bond_stereo = BondStereo::E;
108                } // Simplify E/Z for now
109                b'\\' => {
110                    self.advance();
111                    prev_bond_order = BondOrder::Single;
112                    prev_bond_stereo = BondStereo::Z;
113                }
114                b'%' | b'0'..=b'9' => {
115                    let mut ring_num = 0;
116                    if c == b'%' {
117                        self.advance();
118                        if let (Some(d1), Some(d2)) = (self.peek(), self.input.get(self.pos + 1)) {
119                            if d1.is_ascii_digit() && d2.is_ascii_digit() {
120                                ring_num = ((d1 - b'0') * 10 + (d2 - b'0')) as u32;
121                                self.advance();
122                                self.advance();
123                            }
124                        }
125                    } else {
126                        ring_num = (c - b'0') as u32;
127                        self.advance();
128                    }
129                    if ring_num > 0 {
130                        if let Some(prev) = prev_node {
131                            if let Some((target, order, stereo)) = self.rings.remove(&ring_num) {
132                                let mut final_order = if prev_bond_order != BondOrder::Single {
133                                    prev_bond_order
134                                } else {
135                                    order
136                                };
137                                final_order = get_bond_order(self.mol, prev, target, final_order);
138                                let final_stereo = if prev_bond_stereo != BondStereo::None {
139                                    prev_bond_stereo
140                                } else {
141                                    stereo
142                                };
143                                self.mol.add_bond(
144                                    prev,
145                                    target,
146                                    Bond {
147                                        order: final_order,
148                                        stereo: final_stereo,
149                                    },
150                                );
151                            } else {
152                                self.rings
153                                    .insert(ring_num, (prev, prev_bond_order, prev_bond_stereo));
154                            }
155                            // Reset bond state after closing/opening ring
156                            prev_bond_order = BondOrder::Single;
157                            prev_bond_stereo = BondStereo::None;
158                        }
159                    }
160                }
161                b'[' => {
162                    self.advance();
163                    let node = self.parse_bracket_atom()?;
164                    if let Some(prev) = prev_node {
165                        let order = get_bond_order(self.mol, prev, node, prev_bond_order);
166                        self.mol.add_bond(
167                            prev,
168                            node,
169                            Bond {
170                                order,
171                                stereo: prev_bond_stereo,
172                            },
173                        );
174                    }
175                    prev_node = Some(node);
176                    prev_bond_order = BondOrder::Single;
177                    prev_bond_stereo = BondStereo::None;
178                }
179                _ if c.is_ascii_alphabetic() || c == b'*' => {
180                    let node = self.parse_organic_subset()?;
181                    if let Some(prev) = prev_node {
182                        let order = get_bond_order(self.mol, prev, node, prev_bond_order);
183                        self.mol.add_bond(
184                            prev,
185                            node,
186                            Bond {
187                                order,
188                                stereo: prev_bond_stereo,
189                            },
190                        );
191                    }
192                    prev_node = Some(node);
193                    prev_bond_order = BondOrder::Single;
194                    prev_bond_stereo = BondStereo::None;
195                }
196                _ => return Err(format!("Unexpected character: {}", c as char)),
197            }
198        }
199        Ok(())
200    }
201
202    fn parse_organic_subset(&mut self) -> Result<NodeIndex, String> {
203        let mut elem_str = String::new();
204        let c = self.peek().unwrap();
205        self.advance();
206        elem_str.push(c as char);
207        if let Some(nc) = self.peek() {
208            if nc.is_ascii_lowercase()
209                && (c == b'C' && (nc == b'l' || nc == b'u' || nc == b'o' || nc == b'r')
210                    || c == b'B' && nc == b'r')
211            {
212                elem_str.push(nc as char);
213                self.advance();
214            }
215        }
216
217        // Handle aromatic
218        let mut aromatic = false;
219        if elem_str.chars().next().unwrap().is_lowercase() {
220            aromatic = true;
221            elem_str = elem_str.to_uppercase();
222        }
223
224        let element = match elem_str.as_str() {
225            "C" => 6,
226            "N" => 7,
227            "O" => 8,
228            "P" => 15,
229            "S" => 16,
230            "F" => 9,
231            "CL" | "Cl" => 17,
232            "BR" | "Br" => 35,
233            "I" => 53,
234            "B" => 5,
235            "*" => 0,
236            &_ => return Err(format!("Unknown element: {}", elem_str)),
237        };
238
239        let atom = Atom {
240            element,
241            position: Vector3::zeros(),
242            charge: 0.0,
243            formal_charge: 0,
244            hybridization: if aromatic {
245                Hybridization::SP2
246            } else {
247                Hybridization::SP3
248            },
249            chiral_tag: ChiralType::Unspecified,
250            explicit_h: 0,
251        };
252
253        Ok(self.mol.add_atom(atom))
254    }
255
256    fn parse_bracket_atom(&mut self) -> Result<NodeIndex, String> {
257        let mut isotope = 0u32;
258        let mut aromatic = false;
259        let mut chiral = ChiralType::Unspecified;
260        let mut h_count = 0;
261        let mut charge = 0;
262
263        // Isotope
264        while let Some(c) = self.peek() {
265            if c.is_ascii_digit() {
266                isotope = isotope * 10 + (c - b'0') as u32;
267                self.advance();
268            } else {
269                break;
270            }
271        }
272
273        // Element
274        let mut elem_str = String::new();
275        if let Some(c) = self.peek() {
276            if c.is_ascii_alphabetic() {
277                elem_str.push(c as char);
278                self.advance();
279                if let Some(nc) = self.peek() {
280                    if nc.is_ascii_lowercase() && c.is_ascii_uppercase() {
281                        elem_str.push(nc as char);
282                        self.advance();
283                    }
284                }
285            } else if c == b'*' {
286                elem_str.push('*');
287                self.advance();
288            }
289        }
290
291        if elem_str.chars().next().unwrap().is_lowercase() {
292            aromatic = true;
293            elem_str = elem_str.to_uppercase();
294        }
295
296        let element: u8 = match elem_str.as_str() {
297            "H" => 1,
298            "HE" | "He" => 2,
299            "LI" | "Li" => 3,
300            "BE" | "Be" => 4,
301            "B" => 5,
302            "C" => 6,
303            "N" => 7,
304            "O" => 8,
305            "F" => 9,
306            "NE" | "Ne" => 10,
307            "NA" | "Na" => 11,
308            "MG" | "Mg" => 12,
309            "AL" | "Al" => 13,
310            "SI" | "Si" => 14,
311            "P" => 15,
312            "S" => 16,
313            "CL" | "Cl" => 17,
314            "AR" | "Ar" => 18,
315            "K" => 19,
316            "CA" | "Ca" => 20,
317            "SC" | "Sc" => 21,
318            "TI" | "Ti" => 22,
319            "V" => 23,
320            "CR" | "Cr" => 24,
321            "MN" | "Mn" => 25,
322            "FE" | "Fe" => 26,
323            "CO" | "Co" => 27,
324            "NI" | "Ni" => 28,
325            "CU" | "Cu" => 29,
326            "ZN" | "Zn" => 30,
327            "GA" | "Ga" => 31,
328            "GE" | "Ge" => 32,
329            "AS" | "As" => 33,
330            "SE" | "Se" => 34,
331            "BR" | "Br" => 35,
332            "KR" | "Kr" => 36,
333            "RB" | "Rb" => 37,
334            "SR" | "Sr" => 38,
335            "Y" => 39,
336            "ZR" | "Zr" => 40,
337            "MO" | "Mo" => 42,
338            "RU" | "Ru" => 44,
339            "RH" | "Rh" => 45,
340            "PD" | "Pd" => 46,
341            "AG" | "Ag" => 47,
342            "CD" | "Cd" => 48,
343            "IN" | "In" => 49,
344            "SN" | "Sn" => 50,
345            "SB" | "Sb" => 51,
346            "TE" | "Te" => 52,
347            "I" => 53,
348            "XE" | "Xe" => 54,
349            "CS" | "Cs" => 55,
350            "BA" | "Ba" => 56,
351            "LA" | "La" => 57,
352            "W" => 74,
353            "RE" | "Re" => 75,
354            "OS" | "Os" => 76,
355            "IR" | "Ir" => 77,
356            "PT" | "Pt" => 78,
357            "AU" | "Au" => 79,
358            "HG" | "Hg" => 80,
359            "TL" | "Tl" => 81,
360            "PB" | "Pb" => 82,
361            "BI" | "Bi" => 83,
362            "*" => 0,
363            _ => return Err(format!("Unknown bracket element: {}", elem_str)),
364        };
365
366        // Chirality
367        if let Some(c) = self.peek() {
368            if c == b'@' {
369                self.advance();
370                chiral = ChiralType::TetrahedralCCW;
371                if self.peek() == Some(b'@') {
372                    self.advance();
373                    chiral = ChiralType::TetrahedralCW;
374                }
375            }
376        }
377
378        // Hydrogens
379        if let Some(c) = self.peek() {
380            if c == b'H' {
381                self.advance();
382                h_count = 1;
383                if let Some(hc) = self.peek() {
384                    if hc.is_ascii_digit() {
385                        h_count = hc - b'0';
386                        self.advance();
387                    }
388                }
389            }
390        }
391
392        // Charge
393        if let Some(c) = self.peek() {
394            if c == b'+' {
395                self.advance();
396                charge = 1;
397                if let Some(cc) = self.peek() {
398                    if cc.is_ascii_digit() {
399                        charge = (cc - b'0') as i8;
400                        self.advance();
401                    } else if cc == b'+' {
402                        charge = 2;
403                        self.advance();
404                        if self.peek() == Some(b'+') {
405                            charge = 3;
406                            self.advance();
407                        }
408                    }
409                }
410            } else if c == b'-' {
411                self.advance();
412                charge = -1;
413                if let Some(cc) = self.peek() {
414                    if cc.is_ascii_digit() {
415                        charge = -((cc - b'0') as i8);
416                        self.advance();
417                    } else if cc == b'-' {
418                        charge = -2;
419                        self.advance();
420                        if self.peek() == Some(b'-') {
421                            charge = -3;
422                            self.advance();
423                        }
424                    }
425                }
426            }
427        }
428
429        if self.peek() == Some(b']') {
430            self.advance();
431        } else {
432            return Err("Expected ']'".to_string());
433        }
434
435        let atom = Atom {
436            element,
437            position: Vector3::zeros(),
438            charge: charge as f32,
439            formal_charge: charge,
440            hybridization: if aromatic {
441                Hybridization::SP2
442            } else {
443                Hybridization::SP3
444            }, // Simplification
445            chiral_tag: chiral,
446            explicit_h: h_count,
447        };
448
449        let node = self.mol.add_atom(atom);
450
451        Ok(node)
452    }
453
454    fn add_implicit_hydrogens(&mut self) {
455        let n = self.mol.graph.node_count();
456        let mut to_add = Vec::new();
457
458        for i in 0..n {
459            let ni = NodeIndex::new(i);
460            let atom = &self.mol.graph[ni];
461            if atom.element == 1 || atom.element == 0 {
462                continue;
463            } // H or dummy
464
465            let mut current_valents = 0;
466            let mut double_bonds = 0;
467            let mut triple_bonds = 0;
468            let mut aromatic_bonds = 0;
469
470            for edge in self.mol.graph.edges(ni) {
471                match edge.weight().order {
472                    BondOrder::Single => {
473                        current_valents += 1;
474                    }
475                    BondOrder::Double => {
476                        current_valents += 2;
477                        double_bonds += 1;
478                    }
479                    BondOrder::Triple => {
480                        current_valents += 3;
481                        triple_bonds += 1;
482                    }
483                    BondOrder::Aromatic => {
484                        current_valents += 1;
485                        aromatic_bonds += 1;
486                    }
487                    _ => {
488                        current_valents += 1;
489                    }
490                };
491            }
492
493            // Fix hybridization for non-aromatics based on exact bond types
494            if aromatic_bonds == 0 {
495                let actual_hybridization = if triple_bonds > 0 || double_bonds > 1 {
496                    Hybridization::SP
497                } else if double_bonds == 1 {
498                    Hybridization::SP2
499                } else {
500                    // Check for conjugation: N or O adjacent to double/aromatic bond → SP2
501                    // Matches RDKit: norbs==4, degree<=3, atomHasConjugatedBond → SP2
502                    let elem = atom.element;
503                    let is_conjugated = if (elem == 7 || elem == 8) && double_bonds == 0 {
504                        // Check if any neighbor has a double or aromatic bond (conjugation)
505                        self.mol.graph.neighbors(ni).any(|nb| {
506                            self.mol.graph.edges(nb).any(|e| {
507                                matches!(e.weight().order, BondOrder::Double | BondOrder::Aromatic)
508                            })
509                        })
510                    } else {
511                        false
512                    };
513                    if is_conjugated {
514                        Hybridization::SP2
515                    } else {
516                        Hybridization::SP3
517                    }
518                };
519
520                if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
521                    atom_mut.hybridization = actual_hybridization;
522                }
523            } else {
524                if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
525                    atom_mut.hybridization = Hybridization::SP2;
526                }
527            }
528
529            let atom = &self.mol.graph[ni];
530            // If it's aromatic, an uncharged Carbon expects 3 bonds normally (2 from ring, 1 external or 1 H).
531            let target_val = match atom.element {
532                5 => 3, // B
533                6 => 4, // C
534                7 => {
535                    if atom.formal_charge == 1 {
536                        4
537                    } else {
538                        3
539                    }
540                } // N
541                8 => {
542                    if atom.formal_charge == 1 {
543                        3
544                    } else {
545                        2
546                    }
547                } // O
548                9 | 17 | 35 | 53 => 1, // Halogens
549                14 => 4, // Si
550                15 => 3, // P
551                16 => 2, // S
552                32 => 4, // Ge
553                33 => 3, // As
554                34 => 2, // Se
555                50 => 4, // Sn
556                51 => 3, // Sb
557                52 => 2, // Te
558                _ => 0,
559            };
560
561            let mut h_needed = target_val - current_valents - atom.formal_charge.abs() as i32;
562
563            if atom.hybridization == Hybridization::SP2 && atom.element == 6 {
564                // simple aromatic C checks
565                let aromatic_bonds = self
566                    .mol
567                    .graph
568                    .edges(ni)
569                    .filter(|e| e.weight().order == BondOrder::Aromatic)
570                    .count();
571                if aromatic_bonds == 2 {
572                    h_needed = 1 - (current_valents - 2); // 1 H max on a standard aromatic carbon in a ring
573                } else if aromatic_bonds == 3 {
574                    h_needed = 0; // Bridgehead aromatic carbon
575                }
576            }
577
578            // Bare aromatic nitrogen (lowercase 'n', no bracket/explicit H) is pyridine-type:
579            // lone pair NOT in pi system, only 2 bonds needed → 0 implicit H.
580            // [nH] form has explicit_h > 0 and keeps its H (pyrrole-type).
581            if aromatic_bonds > 0 && atom.element == 7 && atom.explicit_h == 0 {
582                h_needed = 0;
583            }
584
585            // Calculate total hydrogens to add for this atom in THIS iteration
586            // to maintain exact topological index order!
587            let total_h_for_this_atom = atom.explicit_h as i32
588                + if h_needed - (atom.explicit_h as i32) > 0 {
589                    h_needed - (atom.explicit_h as i32)
590                } else {
591                    0
592                };
593
594            if total_h_for_this_atom > 0 {
595                to_add.push((ni, total_h_for_this_atom as u8));
596            }
597        }
598
599        // Add all hydrogens sequentially
600        for (parent, count) in to_add {
601            for _ in 0..count {
602                let h_node = self.mol.add_atom(Atom {
603                    element: 1,
604                    position: Vector3::zeros(),
605                    charge: 0.0,
606                    formal_charge: 0,
607                    hybridization: Hybridization::Unknown,
608                    chiral_tag: ChiralType::Unspecified,
609                    explicit_h: 0,
610                });
611                self.mol.add_bond(
612                    parent,
613                    h_node,
614                    Bond {
615                        order: BondOrder::Single,
616                        stereo: BondStereo::None,
617                    },
618                );
619            }
620        }
621    }
622}
623
624#[cfg(test)]
625mod tests {
626    use super::*;
627
628    #[test]
629    fn test_parse_benzene_aliphatic() {
630        let mol = Molecule::from_smiles("C1=CC=CC=C1").unwrap();
631        assert_eq!(mol.graph.node_count(), 12); // 6 C, 6 implicit H
632        assert_eq!(mol.graph.edge_count(), 12); // 6 ring bonds, 6 C-H bonds
633    }
634
635    #[test]
636    fn test_parse_benzene_aromatic() {
637        let mol = Molecule::from_smiles("c1ccccc1").unwrap();
638        assert_eq!(mol.graph.node_count(), 12); // 6 C, 6 implicit H
639        assert_eq!(mol.graph.edge_count(), 12); // 6 ring bonds, 6 C-H bonds
640    }
641
642    #[test]
643    fn test_parse_brackets_and_branches() {
644        let mol = Molecule::from_smiles("C(C)(C)C").unwrap(); // Isobutane
645        assert_eq!(mol.graph.node_count(), 14); // 4 C, 10 H
646    }
647}