Skip to main content

sci_form/
smiles.rs

1use crate::graph::{Atom, Bond, BondOrder, BondStereo, ChiralType, Hybridization, Molecule};
2use nalgebra::Vector3;
3use petgraph::graph::NodeIndex;
4use std::collections::HashMap;
5
6/// Recursive-descent parser that converts a SMILES string into a [`Molecule`] graph.
7///
8/// Handles:
9/// - Aliphatic & aromatic atoms, brackets, isotopes, formal charges
10/// - Ring closure numbers (single and double-digit)
11/// - Branching via `(` / `)`
12/// - Stereo: `@`/`@@` for chiral centers, `/`/`\` for double-bond E/Z
13/// - Implicit hydrogen addition and hybridization assignment
14pub struct SmilesParser<'a> {
15    input: &'a [u8],
16    pos: usize,
17    rings: HashMap<u32, (NodeIndex, BondOrder, BondStereo)>,
18    mol: &'a mut Molecule,
19}
20
21impl<'a> SmilesParser<'a> {
22    /// Create a new parser for `smiles`, writing atoms and bonds into `mol`.
23    pub fn new(smiles: &'a str, mol: &'a mut Molecule) -> Self {
24        SmilesParser {
25            input: smiles.as_bytes(),
26            pos: 0,
27            rings: HashMap::new(),
28            mol,
29        }
30    }
31
32    /// Parse the SMILES string, populating the molecule.
33    ///
34    /// Returns `Err` with a human-readable message on invalid input.
35    pub fn parse(&mut self) -> Result<(), String> {
36        self.parse_chain(None)?;
37        // After parsing, add implicit hydrogens
38        self.add_implicit_hydrogens();
39        Ok(())
40    }
41
42    fn peek(&self) -> Option<u8> {
43        self.input.get(self.pos).copied()
44    }
45
46    fn advance(&mut self) {
47        self.pos += 1;
48    }
49
50    fn parse_chain(&mut self, mut prev_node: Option<NodeIndex>) -> Result<(), String> {
51        let mut prev_bond_order = BondOrder::Single;
52        let mut prev_bond_stereo = BondStereo::None;
53
54        let get_bond_order =
55            |m: &Molecule, prev: NodeIndex, curr: NodeIndex, order: BondOrder| -> BondOrder {
56                if order == BondOrder::Single
57                    && m.graph[prev].hybridization == Hybridization::SP2
58                    && m.graph[curr].hybridization == Hybridization::SP2
59                {
60                    BondOrder::Aromatic
61                } else {
62                    order
63                }
64            };
65
66        while self.pos < self.input.len() {
67            let c = self.peek().unwrap();
68            match c {
69                b'(' => {
70                    self.advance();
71                    self.parse_chain(prev_node)?;
72                    if self.peek() == Some(b')') {
73                        self.advance();
74                    } else {
75                        return Err("Expected ')'".to_string());
76                    }
77                }
78                b')' => {
79                    break;
80                }
81                b'-' => {
82                    self.advance();
83                    prev_bond_order = BondOrder::Single;
84                }
85                b'=' => {
86                    self.advance();
87                    prev_bond_order = BondOrder::Double;
88                }
89                b'#' => {
90                    self.advance();
91                    prev_bond_order = BondOrder::Triple;
92                }
93                b':' => {
94                    self.advance();
95                    prev_bond_order = BondOrder::Aromatic;
96                }
97                b'.' => {
98                    // Disconnected fragment separator — no bond between previous and next atom
99                    self.advance();
100                    prev_node = None;
101                    prev_bond_order = BondOrder::Single;
102                    prev_bond_stereo = BondStereo::None;
103                }
104                b'/' => {
105                    self.advance();
106                    prev_bond_order = BondOrder::Single;
107                    prev_bond_stereo = BondStereo::E;
108                } // Simplify E/Z for now
109                b'\\' => {
110                    self.advance();
111                    prev_bond_order = BondOrder::Single;
112                    prev_bond_stereo = BondStereo::Z;
113                }
114                b'%' | b'0'..=b'9' => {
115                    let mut ring_num = 0;
116                    if c == b'%' {
117                        self.advance();
118                        if let (Some(d1), Some(d2)) = (self.peek(), self.input.get(self.pos + 1)) {
119                            if d1.is_ascii_digit() && d2.is_ascii_digit() {
120                                ring_num = ((d1 - b'0') * 10 + (d2 - b'0')) as u32;
121                                self.advance();
122                                self.advance();
123                            }
124                        }
125                    } else {
126                        ring_num = (c - b'0') as u32;
127                        self.advance();
128                    }
129                    if ring_num > 0 {
130                        if let Some(prev) = prev_node {
131                            if let Some((target, order, stereo)) = self.rings.remove(&ring_num) {
132                                let mut final_order = if prev_bond_order != BondOrder::Single {
133                                    prev_bond_order
134                                } else {
135                                    order
136                                };
137                                final_order = get_bond_order(self.mol, prev, target, final_order);
138                                let final_stereo = if prev_bond_stereo != BondStereo::None {
139                                    prev_bond_stereo
140                                } else {
141                                    stereo
142                                };
143                                self.mol.add_bond(
144                                    prev,
145                                    target,
146                                    Bond {
147                                        order: final_order,
148                                        stereo: final_stereo,
149                                    },
150                                );
151                            } else {
152                                self.rings
153                                    .insert(ring_num, (prev, prev_bond_order, prev_bond_stereo));
154                            }
155                            // Reset bond state after closing/opening ring
156                            prev_bond_order = BondOrder::Single;
157                            prev_bond_stereo = BondStereo::None;
158                        }
159                    }
160                }
161                b'[' => {
162                    self.advance();
163                    let node = self.parse_bracket_atom()?;
164                    if let Some(prev) = prev_node {
165                        let order = get_bond_order(self.mol, prev, node, prev_bond_order);
166                        self.mol.add_bond(
167                            prev,
168                            node,
169                            Bond {
170                                order,
171                                stereo: prev_bond_stereo,
172                            },
173                        );
174                    }
175                    prev_node = Some(node);
176                    prev_bond_order = BondOrder::Single;
177                    prev_bond_stereo = BondStereo::None;
178                }
179                _ if c.is_ascii_alphabetic() || c == b'*' => {
180                    let node = self.parse_organic_subset()?;
181                    if let Some(prev) = prev_node {
182                        let order = get_bond_order(self.mol, prev, node, prev_bond_order);
183                        self.mol.add_bond(
184                            prev,
185                            node,
186                            Bond {
187                                order,
188                                stereo: prev_bond_stereo,
189                            },
190                        );
191                    }
192                    prev_node = Some(node);
193                    prev_bond_order = BondOrder::Single;
194                    prev_bond_stereo = BondStereo::None;
195                }
196                _ => return Err(format!("Unexpected character: {}", c as char)),
197            }
198        }
199        Ok(())
200    }
201
202    fn parse_organic_subset(&mut self) -> Result<NodeIndex, String> {
203        let mut elem_str = String::new();
204        let c = self.peek().unwrap();
205        self.advance();
206        elem_str.push(c as char);
207        if let Some(nc) = self.peek() {
208            if nc.is_ascii_lowercase()
209                && (c == b'C' && (nc == b'l' || nc == b'u' || nc == b'o' || nc == b'r')
210                    || c == b'B' && nc == b'r')
211            {
212                elem_str.push(nc as char);
213                self.advance();
214            }
215        }
216
217        // Handle aromatic
218        let mut aromatic = false;
219        if elem_str.chars().next().unwrap().is_lowercase() {
220            aromatic = true;
221            elem_str = elem_str.to_uppercase();
222        }
223
224        let element = match elem_str.as_str() {
225            "C" => 6,
226            "N" => 7,
227            "O" => 8,
228            "P" => 15,
229            "S" => 16,
230            "F" => 9,
231            "CL" | "Cl" => 17,
232            "BR" | "Br" => 35,
233            "I" => 53,
234            "B" => 5,
235            "*" => 0,
236            &_ => return Err(format!("Unknown element: {}", elem_str)),
237        };
238
239        let atom = Atom {
240            element,
241            position: Vector3::zeros(),
242            charge: 0.0,
243            formal_charge: 0,
244            hybridization: if aromatic {
245                Hybridization::SP2
246            } else {
247                Hybridization::SP3
248            },
249            chiral_tag: ChiralType::Unspecified,
250            explicit_h: 0,
251        };
252
253        Ok(self.mol.add_atom(atom))
254    }
255
256    fn parse_bracket_atom(&mut self) -> Result<NodeIndex, String> {
257        let mut isotope = 0u32;
258        let mut aromatic = false;
259        let mut chiral = ChiralType::Unspecified;
260        let mut h_count = 0;
261        let mut charge = 0;
262
263        // Isotope
264        while let Some(c) = self.peek() {
265            if c.is_ascii_digit() {
266                isotope = isotope * 10 + (c - b'0') as u32;
267                self.advance();
268            } else {
269                break;
270            }
271        }
272
273        // Element
274        let mut elem_str = String::new();
275        if let Some(c) = self.peek() {
276            if c.is_ascii_alphabetic() {
277                elem_str.push(c as char);
278                self.advance();
279                if let Some(nc) = self.peek() {
280                    if nc.is_ascii_lowercase() && c.is_ascii_uppercase() {
281                        elem_str.push(nc as char);
282                        self.advance();
283                    }
284                }
285            } else if c == b'*' {
286                elem_str.push('*');
287                self.advance();
288            }
289        }
290
291        if elem_str.chars().next().unwrap().is_lowercase() {
292            aromatic = true;
293            elem_str = elem_str.to_uppercase();
294        }
295
296        let element: u8 = match elem_str.as_str() {
297            "H" => 1,
298            "C" => 6,
299            "N" => 7,
300            "O" => 8,
301            "P" => 15,
302            "S" => 16,
303            "F" => 9,
304            "CL" | "Cl" => 17,
305            "BR" | "Br" => 35,
306            "I" => 53,
307            "FE" | "Fe" => 26,
308            "ZN" | "Zn" => 30,
309            "CO" | "Co" => 27,
310            "CU" | "Cu" => 29,
311            "B" => 5,
312            "AG" | "Ag" => 47,
313            "AU" | "Au" => 79,
314            "PT" | "Pt" => 78,
315            "PD" | "Pd" => 46,
316            "NA" | "Na" => 11,
317            "K" => 19,
318            "LI" | "Li" => 3,
319            "CA" | "Ca" => 20,
320            "MG" | "Mg" => 12,
321            "*" => 0,
322            _ => 6, // Default fallback
323        };
324
325        // Chirality
326        if let Some(c) = self.peek() {
327            if c == b'@' {
328                self.advance();
329                chiral = ChiralType::TetrahedralCCW;
330                if self.peek() == Some(b'@') {
331                    self.advance();
332                    chiral = ChiralType::TetrahedralCW;
333                }
334            }
335        }
336
337        // Hydrogens
338        if let Some(c) = self.peek() {
339            if c == b'H' {
340                self.advance();
341                h_count = 1;
342                if let Some(hc) = self.peek() {
343                    if hc.is_ascii_digit() {
344                        h_count = hc - b'0';
345                        self.advance();
346                    }
347                }
348            }
349        }
350
351        // Charge
352        if let Some(c) = self.peek() {
353            if c == b'+' {
354                self.advance();
355                charge = 1;
356                if let Some(cc) = self.peek() {
357                    if cc.is_ascii_digit() {
358                        charge = (cc - b'0') as i8;
359                        self.advance();
360                    } else if cc == b'+' {
361                        charge = 2;
362                        self.advance();
363                        if self.peek() == Some(b'+') {
364                            charge = 3;
365                            self.advance();
366                        }
367                    }
368                }
369            } else if c == b'-' {
370                self.advance();
371                charge = -1;
372                if let Some(cc) = self.peek() {
373                    if cc.is_ascii_digit() {
374                        charge = -((cc - b'0') as i8);
375                        self.advance();
376                    } else if cc == b'-' {
377                        charge = -2;
378                        self.advance();
379                        if self.peek() == Some(b'-') {
380                            charge = -3;
381                            self.advance();
382                        }
383                    }
384                }
385            }
386        }
387
388        if self.peek() == Some(b']') {
389            self.advance();
390        } else {
391            return Err("Expected ']'".to_string());
392        }
393
394        let atom = Atom {
395            element,
396            position: Vector3::zeros(),
397            charge: charge as f32,
398            formal_charge: charge,
399            hybridization: if aromatic {
400                Hybridization::SP2
401            } else {
402                Hybridization::SP3
403            }, // Simplification
404            chiral_tag: chiral,
405            explicit_h: h_count,
406        };
407
408        let node = self.mol.add_atom(atom);
409
410        Ok(node)
411    }
412
413    fn add_implicit_hydrogens(&mut self) {
414        let n = self.mol.graph.node_count();
415        let mut to_add = Vec::new();
416
417        for i in 0..n {
418            let ni = NodeIndex::new(i);
419            let atom = &self.mol.graph[ni];
420            if atom.element == 1 || atom.element == 0 {
421                continue;
422            } // H or dummy
423
424            let mut current_valents = 0;
425            let mut double_bonds = 0;
426            let mut triple_bonds = 0;
427            let mut aromatic_bonds = 0;
428
429            for edge in self.mol.graph.edges(ni) {
430                match edge.weight().order {
431                    BondOrder::Single => {
432                        current_valents += 1;
433                    }
434                    BondOrder::Double => {
435                        current_valents += 2;
436                        double_bonds += 1;
437                    }
438                    BondOrder::Triple => {
439                        current_valents += 3;
440                        triple_bonds += 1;
441                    }
442                    BondOrder::Aromatic => {
443                        current_valents += 1;
444                        aromatic_bonds += 1;
445                    }
446                    _ => {
447                        current_valents += 1;
448                    }
449                };
450            }
451
452            // Fix hybridization for non-aromatics based on exact bond types
453            if aromatic_bonds == 0 {
454                let actual_hybridization = if triple_bonds > 0 || double_bonds > 1 {
455                    Hybridization::SP
456                } else if double_bonds == 1 {
457                    Hybridization::SP2
458                } else {
459                    // Check for conjugation: N or O adjacent to double/aromatic bond → SP2
460                    // Matches RDKit: norbs==4, degree<=3, atomHasConjugatedBond → SP2
461                    let elem = atom.element;
462                    let is_conjugated = if (elem == 7 || elem == 8) && double_bonds == 0 {
463                        // Check if any neighbor has a double or aromatic bond (conjugation)
464                        self.mol.graph.neighbors(ni).any(|nb| {
465                            self.mol.graph.edges(nb).any(|e| {
466                                matches!(e.weight().order, BondOrder::Double | BondOrder::Aromatic)
467                            })
468                        })
469                    } else {
470                        false
471                    };
472                    if is_conjugated {
473                        Hybridization::SP2
474                    } else {
475                        Hybridization::SP3
476                    }
477                };
478
479                if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
480                    atom_mut.hybridization = actual_hybridization;
481                }
482            } else {
483                if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
484                    atom_mut.hybridization = Hybridization::SP2;
485                }
486            }
487
488            let atom = &self.mol.graph[ni];
489            // If it's aromatic, an uncharged Carbon expects 3 bonds normally (2 from ring, 1 external or 1 H).
490            let target_val = match atom.element {
491                6 => 4, // C
492                7 => {
493                    if atom.formal_charge == 1 {
494                        4
495                    } else {
496                        3
497                    }
498                } // N
499                8 => {
500                    if atom.formal_charge == 1 {
501                        3
502                    } else {
503                        2
504                    }
505                } // O
506                9 | 17 | 35 | 53 => 1, // Halogens
507                16 => 2, // S
508                15 => 3, // P
509                _ => 0,
510            };
511
512            let mut h_needed = target_val - current_valents - atom.formal_charge.abs() as i32;
513
514            if atom.hybridization == Hybridization::SP2 && atom.element == 6 {
515                // simple aromatic C checks
516                let aromatic_bonds = self
517                    .mol
518                    .graph
519                    .edges(ni)
520                    .filter(|e| e.weight().order == BondOrder::Aromatic)
521                    .count();
522                if aromatic_bonds == 2 {
523                    h_needed = 1 - (current_valents - 2); // 1 H max on a standard aromatic carbon in a ring
524                } else if aromatic_bonds == 3 {
525                    h_needed = 0; // Bridgehead aromatic carbon
526                }
527            }
528
529            // Bare aromatic nitrogen (lowercase 'n', no bracket/explicit H) is pyridine-type:
530            // lone pair NOT in pi system, only 2 bonds needed → 0 implicit H.
531            // [nH] form has explicit_h > 0 and keeps its H (pyrrole-type).
532            if aromatic_bonds > 0 && atom.element == 7 && atom.explicit_h == 0 {
533                h_needed = 0;
534            }
535
536            // Calculate total hydrogens to add for this atom in THIS iteration
537            // to maintain exact topological index order!
538            let total_h_for_this_atom = atom.explicit_h as i32
539                + if h_needed - (atom.explicit_h as i32) > 0 {
540                    h_needed - (atom.explicit_h as i32)
541                } else {
542                    0
543                };
544
545            if total_h_for_this_atom > 0 {
546                to_add.push((ni, total_h_for_this_atom as u8));
547            }
548        }
549
550        // Add all hydrogens sequentially
551        for (parent, count) in to_add {
552            for _ in 0..count {
553                let h_node = self.mol.add_atom(Atom {
554                    element: 1,
555                    position: Vector3::zeros(),
556                    charge: 0.0,
557                    formal_charge: 0,
558                    hybridization: Hybridization::Unknown,
559                    chiral_tag: ChiralType::Unspecified,
560                    explicit_h: 0,
561                });
562                self.mol.add_bond(
563                    parent,
564                    h_node,
565                    Bond {
566                        order: BondOrder::Single,
567                        stereo: BondStereo::None,
568                    },
569                );
570            }
571        }
572    }
573}
574
575#[cfg(test)]
576mod tests {
577    use super::*;
578
579    #[test]
580    fn test_parse_benzene_aliphatic() {
581        let mol = Molecule::from_smiles("C1=CC=CC=C1").unwrap();
582        assert_eq!(mol.graph.node_count(), 12); // 6 C, 6 implicit H
583        assert_eq!(mol.graph.edge_count(), 12); // 6 ring bonds, 6 C-H bonds
584    }
585
586    #[test]
587    fn test_parse_benzene_aromatic() {
588        let mol = Molecule::from_smiles("c1ccccc1").unwrap();
589        assert_eq!(mol.graph.node_count(), 12); // 6 C, 6 implicit H
590        assert_eq!(mol.graph.edge_count(), 12); // 6 ring bonds, 6 C-H bonds
591    }
592
593    #[test]
594    fn test_parse_brackets_and_branches() {
595        let mol = Molecule::from_smiles("C(C)(C)C").unwrap(); // Isobutane
596        assert_eq!(mol.graph.node_count(), 14); // 4 C, 10 H
597    }
598}