Skip to main content

sci_form/
smiles.rs

1use crate::graph::{Atom, Bond, BondOrder, BondStereo, ChiralType, Hybridization, Molecule};
2use nalgebra::Vector3;
3use petgraph::graph::NodeIndex;
4use std::collections::HashMap;
5
6/// Recursive-descent parser that converts a SMILES string into a [`Molecule`] graph.
7///
8/// Handles:
9/// - Aliphatic & aromatic atoms, brackets, isotopes, formal charges
10/// - Ring closure numbers (single and double-digit)
11/// - Branching via `(` / `)`
12/// - Stereo: `@`/`@@` for chiral centers, `/`/`\` for double-bond E/Z
13/// - Implicit hydrogen addition and hybridization assignment
14pub struct SmilesParser<'a> {
15    input: &'a [u8],
16    pos: usize,
17    rings: HashMap<u32, (NodeIndex, BondOrder, BondStereo)>,
18    mol: &'a mut Molecule,
19}
20
21impl<'a> SmilesParser<'a> {
22    /// Create a new parser for `smiles`, writing atoms and bonds into `mol`.
23    pub fn new(smiles: &'a str, mol: &'a mut Molecule) -> Self {
24        SmilesParser {
25            input: smiles.as_bytes(),
26            pos: 0,
27            rings: HashMap::new(),
28            mol,
29        }
30    }
31
32    /// Parse the SMILES string, populating the molecule.
33    ///
34    /// Returns `Err` with a human-readable message on invalid input.
35    pub fn parse(&mut self) -> Result<(), String> {
36        self.parse_chain(None)?;
37        // After parsing, add implicit hydrogens
38        self.add_implicit_hydrogens();
39        Ok(())
40    }
41
42    fn peek(&self) -> Option<u8> {
43        self.input.get(self.pos).copied()
44    }
45
46    fn advance(&mut self) {
47        self.pos += 1;
48    }
49
50    fn parse_chain(&mut self, mut prev_node: Option<NodeIndex>) -> Result<(), String> {
51        let mut prev_bond_order = BondOrder::Single;
52        let mut prev_bond_stereo = BondStereo::None;
53
54        let get_bond_order =
55            |m: &Molecule, prev: NodeIndex, curr: NodeIndex, order: BondOrder| -> BondOrder {
56                if order == BondOrder::Single
57                    && m.graph[prev].hybridization == Hybridization::SP2
58                    && m.graph[curr].hybridization == Hybridization::SP2
59                {
60                    BondOrder::Aromatic
61                } else {
62                    order
63                }
64            };
65
66        while self.pos < self.input.len() {
67            let c = self.peek().unwrap();
68            match c {
69                b'(' => {
70                    self.advance();
71                    self.parse_chain(prev_node)?;
72                    if self.peek() == Some(b')') {
73                        self.advance();
74                    } else {
75                        return Err("Expected ')'".to_string());
76                    }
77                }
78                b')' => {
79                    break;
80                }
81                b'-' => {
82                    self.advance();
83                    prev_bond_order = BondOrder::Single;
84                }
85                b'=' => {
86                    self.advance();
87                    prev_bond_order = BondOrder::Double;
88                }
89                b'#' => {
90                    self.advance();
91                    prev_bond_order = BondOrder::Triple;
92                }
93                b':' => {
94                    self.advance();
95                    prev_bond_order = BondOrder::Aromatic;
96                }
97                b'.' => {
98                    // Disconnected fragment separator — no bond between previous and next atom
99                    self.advance();
100                    prev_node = None;
101                    prev_bond_order = BondOrder::Single;
102                    prev_bond_stereo = BondStereo::None;
103                }
104                b'/' => {
105                    self.advance();
106                    prev_bond_order = BondOrder::Single;
107                    prev_bond_stereo = BondStereo::E;
108                } // Simplify E/Z for now
109                b'\\' => {
110                    self.advance();
111                    prev_bond_order = BondOrder::Single;
112                    prev_bond_stereo = BondStereo::Z;
113                }
114                b'%' | b'0'..=b'9' => {
115                    let mut ring_num = 0;
116                    if c == b'%' {
117                        self.advance();
118                        if let (Some(d1), Some(d2)) = (self.peek(), self.input.get(self.pos + 1)) {
119                            if d1.is_ascii_digit() && d2.is_ascii_digit() {
120                                ring_num = ((d1 - b'0') * 10 + (d2 - b'0')) as u32;
121                                self.advance();
122                                self.advance();
123                            }
124                        }
125                    } else {
126                        ring_num = (c - b'0') as u32;
127                        self.advance();
128                    }
129                    if ring_num > 0 {
130                        if let Some(prev) = prev_node {
131                            if let Some((target, order, stereo)) = self.rings.remove(&ring_num) {
132                                let mut final_order = if prev_bond_order != BondOrder::Single {
133                                    prev_bond_order
134                                } else {
135                                    order
136                                };
137                                final_order = get_bond_order(self.mol, prev, target, final_order);
138                                let final_stereo = if prev_bond_stereo != BondStereo::None {
139                                    prev_bond_stereo
140                                } else {
141                                    stereo
142                                };
143                                self.mol.add_bond(
144                                    prev,
145                                    target,
146                                    Bond {
147                                        order: final_order,
148                                        stereo: final_stereo,
149                                    },
150                                );
151                            } else {
152                                self.rings
153                                    .insert(ring_num, (prev, prev_bond_order, prev_bond_stereo));
154                            }
155                            // Reset bond state after closing/opening ring
156                            prev_bond_order = BondOrder::Single;
157                            prev_bond_stereo = BondStereo::None;
158                        }
159                    }
160                }
161                b'[' => {
162                    self.advance();
163                    let node = self.parse_bracket_atom()?;
164                    if let Some(prev) = prev_node {
165                        let order = get_bond_order(self.mol, prev, node, prev_bond_order);
166                        self.mol.add_bond(
167                            prev,
168                            node,
169                            Bond {
170                                order,
171                                stereo: prev_bond_stereo,
172                            },
173                        );
174                    }
175                    prev_node = Some(node);
176                    prev_bond_order = BondOrder::Single;
177                    prev_bond_stereo = BondStereo::None;
178                }
179                _ if c.is_ascii_alphabetic() || c == b'*' => {
180                    let node = self.parse_organic_subset()?;
181                    if let Some(prev) = prev_node {
182                        let order = get_bond_order(self.mol, prev, node, prev_bond_order);
183                        self.mol.add_bond(
184                            prev,
185                            node,
186                            Bond {
187                                order,
188                                stereo: prev_bond_stereo,
189                            },
190                        );
191                    }
192                    prev_node = Some(node);
193                    prev_bond_order = BondOrder::Single;
194                    prev_bond_stereo = BondStereo::None;
195                }
196                _ => return Err(format!("Unexpected character: {}", c as char)),
197            }
198        }
199        Ok(())
200    }
201
202    fn parse_organic_subset(&mut self) -> Result<NodeIndex, String> {
203        let mut elem_str = String::new();
204        let c = self.peek().unwrap();
205        self.advance();
206        elem_str.push(c as char);
207        if let Some(nc) = self.peek() {
208            if nc.is_ascii_lowercase()
209                && (c == b'C' && (nc == b'l' || nc == b'u' || nc == b'o' || nc == b'r')
210                    || c == b'B' && nc == b'r')
211            {
212                elem_str.push(nc as char);
213                self.advance();
214            }
215        }
216
217        // Handle aromatic
218        let mut aromatic = false;
219        if elem_str.chars().next().unwrap().is_lowercase() {
220            aromatic = true;
221            elem_str = elem_str.to_uppercase();
222        }
223
224        let element = match elem_str.as_str() {
225            "C" => 6,
226            "N" => 7,
227            "O" => 8,
228            "P" => 15,
229            "S" => 16,
230            "F" => 9,
231            "CL" | "Cl" => 17,
232            "BR" | "Br" => 35,
233            "I" => 53,
234            "B" => 5,
235            "*" => 0,
236            &_ => return Err(format!("Unknown element: {}", elem_str)),
237        };
238
239        let atom = Atom {
240            element,
241            position: Vector3::zeros(),
242            charge: 0.0,
243            formal_charge: 0,
244            hybridization: if aromatic {
245                Hybridization::SP2
246            } else {
247                Hybridization::SP3
248            },
249            chiral_tag: ChiralType::Unspecified,
250            explicit_h: 0,
251        };
252
253        Ok(self.mol.add_atom(atom))
254    }
255
256    fn parse_bracket_atom(&mut self) -> Result<NodeIndex, String> {
257        let mut isotope = 0u32;
258        let mut aromatic = false;
259        let mut chiral = ChiralType::Unspecified;
260        let mut h_count = 0;
261        let mut charge = 0;
262
263        // Isotope
264        while let Some(c) = self.peek() {
265            if c.is_ascii_digit() {
266                isotope = isotope * 10 + (c - b'0') as u32;
267                self.advance();
268            } else {
269                break;
270            }
271        }
272
273        // Element
274        let mut elem_str = String::new();
275        if let Some(c) = self.peek() {
276            if c.is_ascii_alphabetic() {
277                elem_str.push(c as char);
278                self.advance();
279                if let Some(nc) = self.peek() {
280                    if nc.is_ascii_lowercase() && c.is_ascii_uppercase() {
281                        elem_str.push(nc as char);
282                        self.advance();
283                    }
284                }
285            } else if c == b'*' {
286                elem_str.push('*');
287                self.advance();
288            }
289        }
290
291        if elem_str.chars().next().unwrap().is_lowercase() {
292            aromatic = true;
293            elem_str = elem_str.to_uppercase();
294        }
295
296        let element: u8 = match elem_str.as_str() {
297            "H" => 1,
298            "HE" | "He" => 2,
299            "LI" | "Li" => 3,
300            "BE" | "Be" => 4,
301            "B" => 5,
302            "C" => 6,
303            "N" => 7,
304            "O" => 8,
305            "F" => 9,
306            "NE" | "Ne" => 10,
307            "NA" | "Na" => 11,
308            "MG" | "Mg" => 12,
309            "AL" | "Al" => 13,
310            "SI" | "Si" => 14,
311            "P" => 15,
312            "S" => 16,
313            "CL" | "Cl" => 17,
314            "AR" | "Ar" => 18,
315            "K" => 19,
316            "CA" | "Ca" => 20,
317            "SC" | "Sc" => 21,
318            "TI" | "Ti" => 22,
319            "V" => 23,
320            "CR" | "Cr" => 24,
321            "MN" | "Mn" => 25,
322            "FE" | "Fe" => 26,
323            "CO" | "Co" => 27,
324            "NI" | "Ni" => 28,
325            "CU" | "Cu" => 29,
326            "ZN" | "Zn" => 30,
327            "GA" | "Ga" => 31,
328            "GE" | "Ge" => 32,
329            "AS" | "As" => 33,
330            "SE" | "Se" => 34,
331            "BR" | "Br" => 35,
332            "KR" | "Kr" => 36,
333            "RB" | "Rb" => 37,
334            "SR" | "Sr" => 38,
335            "Y" => 39,
336            "ZR" | "Zr" => 40,
337            "NB" | "Nb" => 41,
338            "MO" | "Mo" => 42,
339            "RU" | "Ru" => 44,
340            "RH" | "Rh" => 45,
341            "PD" | "Pd" => 46,
342            "AG" | "Ag" => 47,
343            "CD" | "Cd" => 48,
344            "IN" | "In" => 49,
345            "SN" | "Sn" => 50,
346            "SB" | "Sb" => 51,
347            "TE" | "Te" => 52,
348            "I" => 53,
349            "XE" | "Xe" => 54,
350            "CS" | "Cs" => 55,
351            "BA" | "Ba" => 56,
352            "LA" | "La" => 57,
353            "W" => 74,
354            "RE" | "Re" => 75,
355            "OS" | "Os" => 76,
356            "IR" | "Ir" => 77,
357            "PT" | "Pt" => 78,
358            "AU" | "Au" => 79,
359            "HG" | "Hg" => 80,
360            "TL" | "Tl" => 81,
361            "PB" | "Pb" => 82,
362            "BI" | "Bi" => 83,
363            "*" => 0,
364            _ => return Err(format!("Unknown bracket element: {}", elem_str)),
365        };
366
367        // Chirality
368        if let Some(c) = self.peek() {
369            if c == b'@' {
370                self.advance();
371                chiral = ChiralType::TetrahedralCCW;
372                if self.peek() == Some(b'@') {
373                    self.advance();
374                    chiral = ChiralType::TetrahedralCW;
375                }
376            }
377        }
378
379        // Hydrogens
380        if let Some(c) = self.peek() {
381            if c == b'H' {
382                self.advance();
383                h_count = 1;
384                if let Some(hc) = self.peek() {
385                    if hc.is_ascii_digit() {
386                        h_count = hc - b'0';
387                        self.advance();
388                    }
389                }
390            }
391        }
392
393        // Charge
394        if let Some(c) = self.peek() {
395            if c == b'+' {
396                self.advance();
397                charge = 1;
398                if let Some(cc) = self.peek() {
399                    if cc.is_ascii_digit() {
400                        charge = (cc - b'0') as i8;
401                        self.advance();
402                    } else if cc == b'+' {
403                        charge = 2;
404                        self.advance();
405                        if self.peek() == Some(b'+') {
406                            charge = 3;
407                            self.advance();
408                        }
409                    }
410                }
411            } else if c == b'-' {
412                self.advance();
413                charge = -1;
414                if let Some(cc) = self.peek() {
415                    if cc.is_ascii_digit() {
416                        charge = -((cc - b'0') as i8);
417                        self.advance();
418                    } else if cc == b'-' {
419                        charge = -2;
420                        self.advance();
421                        if self.peek() == Some(b'-') {
422                            charge = -3;
423                            self.advance();
424                        }
425                    }
426                }
427            }
428        }
429
430        if self.peek() == Some(b']') {
431            self.advance();
432        } else {
433            return Err("Expected ']'".to_string());
434        }
435
436        let atom = Atom {
437            element,
438            position: Vector3::zeros(),
439            charge: charge as f32,
440            formal_charge: charge,
441            hybridization: if aromatic {
442                Hybridization::SP2
443            } else {
444                Hybridization::SP3
445            }, // Simplification
446            chiral_tag: chiral,
447            explicit_h: h_count,
448        };
449
450        let node = self.mol.add_atom(atom);
451
452        Ok(node)
453    }
454
455    fn add_implicit_hydrogens(&mut self) {
456        let n = self.mol.graph.node_count();
457        let mut to_add = Vec::new();
458
459        for i in 0..n {
460            let ni = NodeIndex::new(i);
461            let atom = &self.mol.graph[ni];
462            if atom.element == 1 || atom.element == 0 {
463                continue;
464            } // H or dummy
465
466            let mut current_valents = 0;
467            let mut double_bonds = 0;
468            let mut triple_bonds = 0;
469            let mut aromatic_bonds = 0;
470
471            for edge in self.mol.graph.edges(ni) {
472                match edge.weight().order {
473                    BondOrder::Single => {
474                        current_valents += 1;
475                    }
476                    BondOrder::Double => {
477                        current_valents += 2;
478                        double_bonds += 1;
479                    }
480                    BondOrder::Triple => {
481                        current_valents += 3;
482                        triple_bonds += 1;
483                    }
484                    BondOrder::Aromatic => {
485                        current_valents += 1;
486                        aromatic_bonds += 1;
487                    }
488                    _ => {
489                        current_valents += 1;
490                    }
491                };
492            }
493
494            // Fix hybridization for non-aromatics based on exact bond types
495            if aromatic_bonds == 0 {
496                let actual_hybridization = if triple_bonds > 0 || double_bonds > 1 {
497                    Hybridization::SP
498                } else if double_bonds == 1 {
499                    Hybridization::SP2
500                } else {
501                    // Check for conjugation: N or O adjacent to double/aromatic bond → SP2
502                    // Matches RDKit: norbs==4, degree<=3, atomHasConjugatedBond → SP2
503                    let elem = atom.element;
504                    let is_conjugated = if (elem == 7 || elem == 8) && double_bonds == 0 {
505                        // Check if any neighbor has a double or aromatic bond (conjugation)
506                        self.mol.graph.neighbors(ni).any(|nb| {
507                            self.mol.graph.edges(nb).any(|e| {
508                                matches!(e.weight().order, BondOrder::Double | BondOrder::Aromatic)
509                            })
510                        })
511                    } else {
512                        false
513                    };
514                    if is_conjugated {
515                        Hybridization::SP2
516                    } else {
517                        Hybridization::SP3
518                    }
519                };
520
521                if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
522                    atom_mut.hybridization = actual_hybridization;
523                }
524            } else if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
525                atom_mut.hybridization = Hybridization::SP2;
526            }
527
528            let atom = &self.mol.graph[ni];
529            // If it's aromatic, an uncharged Carbon expects 3 bonds normally (2 from ring, 1 external or 1 H).
530            let target_val = match atom.element {
531                5 => 3, // B
532                6 => 4, // C
533                7 => {
534                    if atom.formal_charge == 1 {
535                        4
536                    } else {
537                        3
538                    }
539                } // N
540                8 => {
541                    if atom.formal_charge == 1 {
542                        3
543                    } else {
544                        2
545                    }
546                } // O
547                9 | 17 | 35 | 53 => 1, // Halogens
548                14 => 4, // Si
549                15 => 3, // P
550                16 => 2, // S
551                32 => 4, // Ge
552                33 => 3, // As
553                34 => 2, // Se
554                50 => 4, // Sn
555                51 => 3, // Sb
556                52 => 2, // Te
557                _ => 0,
558            };
559
560            let mut h_needed = target_val - current_valents - atom.formal_charge.abs() as i32;
561
562            if atom.hybridization == Hybridization::SP2 && atom.element == 6 {
563                // simple aromatic C checks
564                let aromatic_bonds = self
565                    .mol
566                    .graph
567                    .edges(ni)
568                    .filter(|e| e.weight().order == BondOrder::Aromatic)
569                    .count();
570                if aromatic_bonds == 2 {
571                    h_needed = 1 - (current_valents - 2); // 1 H max on a standard aromatic carbon in a ring
572                } else if aromatic_bonds == 3 {
573                    h_needed = 0; // Bridgehead aromatic carbon
574                }
575            }
576
577            // Bare aromatic nitrogen (lowercase 'n', no bracket/explicit H) is pyridine-type:
578            // lone pair NOT in pi system, only 2 bonds needed → 0 implicit H.
579            // [nH] form has explicit_h > 0 and keeps its H (pyrrole-type).
580            if aromatic_bonds > 0 && atom.element == 7 && atom.explicit_h == 0 {
581                h_needed = 0;
582            }
583
584            // Calculate total hydrogens to add for this atom in THIS iteration
585            // to maintain exact topological index order!
586            let total_h_for_this_atom = atom.explicit_h as i32
587                + if h_needed - (atom.explicit_h as i32) > 0 {
588                    h_needed - (atom.explicit_h as i32)
589                } else {
590                    0
591                };
592
593            if total_h_for_this_atom > 0 {
594                to_add.push((ni, total_h_for_this_atom as u8));
595            }
596        }
597
598        // Add all hydrogens sequentially
599        for (parent, count) in to_add {
600            for _ in 0..count {
601                let h_node = self.mol.add_atom(Atom {
602                    element: 1,
603                    position: Vector3::zeros(),
604                    charge: 0.0,
605                    formal_charge: 0,
606                    hybridization: Hybridization::Unknown,
607                    chiral_tag: ChiralType::Unspecified,
608                    explicit_h: 0,
609                });
610                self.mol.add_bond(
611                    parent,
612                    h_node,
613                    Bond {
614                        order: BondOrder::Single,
615                        stereo: BondStereo::None,
616                    },
617                );
618            }
619        }
620    }
621}
622
623#[cfg(test)]
624mod tests {
625    use super::*;
626
627    #[test]
628    fn test_parse_benzene_aliphatic() {
629        let mol = Molecule::from_smiles("C1=CC=CC=C1").unwrap();
630        assert_eq!(mol.graph.node_count(), 12); // 6 C, 6 implicit H
631        assert_eq!(mol.graph.edge_count(), 12); // 6 ring bonds, 6 C-H bonds
632    }
633
634    #[test]
635    fn test_parse_benzene_aromatic() {
636        let mol = Molecule::from_smiles("c1ccccc1").unwrap();
637        assert_eq!(mol.graph.node_count(), 12); // 6 C, 6 implicit H
638        assert_eq!(mol.graph.edge_count(), 12); // 6 ring bonds, 6 C-H bonds
639    }
640
641    #[test]
642    fn test_parse_brackets_and_branches() {
643        let mol = Molecule::from_smiles("C(C)(C)C").unwrap(); // Isobutane
644        assert_eq!(mol.graph.node_count(), 14); // 4 C, 10 H
645    }
646}