molecules/
io.rs

1use chemistry_consts::ElementProperties; 
2use crate::molecule::{Atom, Bond, BondType, ChiralClass, Molecule};
3use nohash_hasher::IntMap;
4
5#[derive(Debug)]
6pub enum ParseError {
7    ElementNotFound(String),
8    BondNotFound,
9    RingIndexError,
10    InvalidBranch,
11    InvalidAromatic(String),
12    SMILESComplexError,
13    Charge,
14    ChiralClass(String),
15    AtomClassMissing,
16    EOL,
17}
18
19#[derive(Default)]
20pub struct SMILESParser {
21    atoms: Vec<Atom>,
22    bonds: Vec<(usize, usize, BondType)>,
23    current_atom_index: usize,
24    current_atom_charge: Option<i8>,
25    element_buffer: String,
26    isotope: Option<u16>,
27    chiral_class: ChiralClass,
28    current_atom_class: Option<u8>,
29    atom_classes: IntMap<usize, u8>,
30    last_bond_type: BondType,
31    ring_number: Option<usize>,
32    ring_bonds: IntMap<usize, (Option<usize>, Option<usize>)>,
33    hydrogens: IntMap<usize, u8>,
34    branch_stack: Vec<usize>,
35    branch_exits: usize,
36    is_multiple_branch: bool,
37    is_double_digit: bool,
38    is_aromatic: bool,
39}
40
41impl SMILESParser {
42    /// Parses a SMILES string and returns a Molecule
43    /// # Arguments
44    /// * `smiles` - A string slice that holds the SMILES string
45    ///
46    /// # Example
47    /// ```
48    /// use molecules::io::SMILESParser;
49    /// let molecules = SMILESParser::parse_smiles("C(C(C))COcCl").unwrap();
50    /// assert_eq!(molecules[0].atoms.len(), 7);
51    /// ```
52    pub fn parse_smiles(smiles: &str) -> Result<Vec<Molecule>, ParseError> {
53        let mut parser = SMILESParser::default();
54        let mut molecules = Vec::new();
55
56        // SMILES should only be ASCII so we could bypass the UTF-8 checks using bytes()
57        let bytes = smiles.as_bytes();
58        let mut pointer = 0;
59        while pointer < bytes.len() {
60            // This is kind of redundant maybe remove
61            let Some(&byte) = bytes.get(pointer) else {
62                break;
63            };
64            pointer += 1;
65            match byte {
66                b'A'..=b'Z' => {
67                    // Handle the previous element
68                    parser.handle_atom(Some(byte))?
69                }
70                b'a'..=b'z' => {
71                    // According to the SMILES specification, the lowercase letters are used to denote aromatic atoms if they are not complex
72                    match byte {
73                        b'b' | b'c' | b'n' | b'o' | b's' | b'p' => {
74                            parser.handle_atom(Some(byte))?
75                        }
76                        b'r' => {
77                            if parser.element_buffer == "B" {
78                                parser.element_buffer.push('R')
79                            } else {
80                                return Err(ParseError::InvalidAromatic(
81                                    parser.element_buffer.clone(),
82                                ));
83                            }
84                        }
85
86                        b'l' => {
87                            if parser.element_buffer == "C" {
88                                parser.element_buffer.push('L')
89                            } else {
90                                return Err(ParseError::InvalidAromatic(
91                                    parser.element_buffer.clone(),
92                                ));
93                            }
94                        }
95                        anything_else => {
96                            parser.element_buffer.push(anything_else as char);
97                            return Err(ParseError::InvalidAromatic(parser.element_buffer.clone()));
98                        }
99                    }
100                }
101
102                b'.' => {
103                    parser.handle_atom(None)?;
104                    parser.add_all_bonds();
105                    molecules
106                        .push(Molecule::from_atoms(parser.atoms).with_classes(parser.atom_classes));
107                    parser = SMILESParser::default();
108                }
109
110                b'1'..=b'9' => {
111                    parser.handle_number(byte)?;
112                }
113                b'-' => {
114                    parser.last_bond_type = BondType::Single;
115                }
116                b'=' => {
117                    parser.last_bond_type = BondType::Double;
118                }
119                b'#' => {
120                    parser.last_bond_type = BondType::Triple;
121                }
122                b'$' => {
123                    parser.last_bond_type = BondType::Quadruple;
124                }
125                b':' => {
126                    parser.last_bond_type = BondType::Aromatic;
127                }
128                b'(' => {
129                    if parser.branch_exits > 0 {
130                        parser.is_multiple_branch = true;
131                    } else {
132                        parser.branch_stack.push(parser.current_atom_index);
133                    }
134                }
135                b')' => parser.branch_exits += 1,
136                b'[' => {
137                    parser.handle_complex_atom(bytes, &mut pointer)?;
138                }
139
140                b'%' => {
141                    parser.is_double_digit = true;
142                }
143                _ => (),
144            }
145        }
146
147        if !parser.element_buffer.is_empty() {
148            parser.handle_atom(None)?
149        }
150
151        parser.add_all_bonds();
152
153        molecules.push(Molecule::from_atoms(parser.atoms).with_classes(parser.atom_classes));
154        Ok(molecules)
155    }
156
157    fn add_all_bonds(&mut self) {
158        #[cfg(debug_assertions)]
159        println!("Ring bonds: {:?}", self.ring_bonds);
160        for (start, end) in self.ring_bonds.values() {
161            if start.is_some() && end.is_some() {
162                self.bonds
163                    .push((start.unwrap(), end.unwrap(), BondType::Aromatic));
164            }
165        }
166
167        #[cfg(debug_assertions)]
168        println!("Hydrogens: {:?}", self.hydrogens);
169        for (atom, number) in self.hydrogens.iter() {
170            let hydrogen_index = self.current_atom_index;
171            for index in 0..*number {
172                self.atoms.push(Atom::new(1));
173                self.bonds
174                    .push((*atom, hydrogen_index + index as usize, BondType::Single));
175            }
176        }
177
178        // Add bonds to the molecule
179        for bond in self.bonds.iter() {
180            let atom1 = &mut self.atoms[bond.0];
181            atom1.add_bond(Bond::new(bond.1, bond.2));
182            let atom2 = &mut self.atoms[bond.1];
183            atom2.add_bond(Bond::new(bond.0, bond.2));
184        }
185    }
186
187    fn handle_branch(&mut self) -> Result<(), ParseError> {
188        // Pop the stack until we find the last branch atom
189        let mut branch_atom = None;
190        while let Some(branch) = self.branch_stack.pop() {
191            branch_atom = Some(branch);
192            if self.branch_exits > 0 {
193                self.branch_exits -= 1;
194            } else {
195                return Err(ParseError::InvalidBranch);
196            }
197            if self.branch_exits == 0 {
198                break;
199            }
200        }
201
202        if self.branch_exits > 0 {
203            return Err(ParseError::InvalidBranch);
204        }
205
206        if let Some(branch_atom) = branch_atom {
207            self.bonds
208                .push((branch_atom, self.current_atom_index, self.last_bond_type));
209            if self.is_multiple_branch {
210                // If we have multiple branches, we need to push the current atom back on the stack
211                self.branch_stack.push(branch_atom);
212                self.is_multiple_branch = false;
213            }
214        } else {
215            return Err(ParseError::BondNotFound);
216        }
217        Ok(())
218    }
219    fn handle_bond(&mut self) -> Result<(), ParseError> {
220        if self.current_atom_index == 0 {
221            return Ok(());
222        }
223        if self.branch_exits > 0 && !self.branch_stack.is_empty() {
224            self.handle_branch()?
225        } else {
226            self.bonds.push((
227                self.current_atom_index - 1,
228                self.current_atom_index,
229                self.last_bond_type,
230            ));
231        }
232        self.last_bond_type = BondType::Single;
233        Ok(())
234    }
235
236    fn handle_atom(&mut self, byte: Option<u8>) -> Result<(), ParseError> {
237        if !self.element_buffer.is_empty() {
238            let atomic_number = &self.element_buffer.to_uppercase().as_str().atomic_number()
239                .ok_or(ParseError::ElementNotFound(self.element_buffer.to_owned()))?;
240
241            let mut atom = Atom::new(*atomic_number);
242
243            self.current_atom_index += 1;
244
245            // TODO check if this is correct
246            if byte.is_some() {
247                self.handle_bond()?
248            }
249
250            if self.element_buffer.chars().next().unwrap().is_lowercase() || self.is_aromatic {
251                atom = Atom::new(*atomic_number).aromatic();
252            }
253
254            if let Some(isotope) = self.isotope {
255                if is_valid_isotope(*atomic_number, isotope) {
256                    atom = atom.with_isotope(isotope);
257                    self.isotope = None;
258                } else {
259                    println!(
260                        "Isotope {} is not valid for atomic number {}, skipping it",
261                        isotope, *atomic_number
262                    );
263                }
264            }
265
266            if self.chiral_class != ChiralClass::None {
267                atom = atom.with_chiral_class(self.chiral_class);
268                self.chiral_class = ChiralClass::None;
269            }
270
271            if let Some(charge) = self.current_atom_charge {
272                atom = atom.with_charge(charge);
273                self.current_atom_charge = None;
274            }
275
276            if let Some(atom_class) = self.current_atom_class {
277                self.atom_classes
278                    .insert(self.current_atom_index, atom_class);
279                self.current_atom_class = None;
280            }
281
282            self.atoms.push(atom);
283            self.element_buffer.clear();
284        }
285        let Some(character_byte) = byte else {
286            return Ok(());
287        };
288        self.element_buffer.push(character_byte as char);
289        Ok(())
290    }
291
292    // TODO handle invalid ring numbers
293    fn handle_number(&mut self, byte: u8) -> Result<(), ParseError> {
294        if self.is_double_digit {
295            if let Some(ring_number_value) = self.ring_number {
296                // TODO make this generic for any size
297                self.ring_number = Some(ring_number_value * 10 + byte_to_number(byte) as usize);
298                self.is_double_digit = false;
299            } else {
300                self.ring_number = Some(byte_to_number(byte) as usize);
301                return Ok(());
302            }
303        }
304
305        let Some(ring) = self.ring_number else {
306            return Ok(());
307        };
308
309        let (start, end) = self.ring_bonds.entry(ring).or_insert((None, None));
310        // If start is None, then we are at the start of the bond
311        if start.is_none() {
312            *start = Some(self.current_atom_index);
313        } else if end.is_none() {
314            *end = Some(self.current_atom_index);
315        } else {
316            return Err(ParseError::RingIndexError);
317        }
318
319        // This means we have invaid format e.g. C11 instead of C1 or C%11
320        if start == end {
321            return Err(ParseError::RingIndexError);
322        }
323
324        self.ring_number = None;
325        Ok(())
326    }
327
328    pub fn handle_complex_atom(
329        &mut self,
330        bytes: &[u8],
331        position: &mut usize,
332    ) -> Result<(), ParseError> {
333        self.handle_atom(None)?;
334
335        if bytes[*position].is_ascii_digit() {
336            let mut temp_number: u16 = 0;
337            while bytes[*position].is_ascii_digit() {
338                let number = byte_to_number(bytes[*position]);
339                temp_number = temp_number * 10 + number as u16;
340                *position += 1;
341            }
342            self.isotope = Some(temp_number);
343        }
344
345        let mut is_se_or_as = false;
346        let test = [bytes[*position], bytes[*position + 1]];
347        match &test {
348            b"se" => {
349                is_se_or_as = true;
350                self.element_buffer.push_str("SE");
351                self.is_aromatic = true;
352            }
353            b"as" => {
354                is_se_or_as = true;
355                self.element_buffer.push_str("AS");
356                self.is_aromatic = true;
357            }
358            _ => (),
359        }
360
361        if bytes[*position].is_ascii_uppercase() && !is_se_or_as {
362            self.element_buffer.push(bytes[*position] as char);
363            if bytes[*position + 1].is_ascii_lowercase() {
364                *position += 1;
365                self.element_buffer.push(bytes[*position] as char);
366            }
367        }
368
369        *position += 1;
370        let start_position = *position;
371
372        if bytes[*position] == b'@' {
373            let mut temp_counter = 0;
374            *position += 1;
375            while bytes[*position].is_ascii_uppercase() {
376                if bytes[*position] == b'H' && temp_counter == 0 {
377                    break;
378                }
379                *position += 1;
380                temp_counter += 1;
381                if temp_counter > 2 {
382                    return Err(ParseError::ChiralClass(
383                        "Chiral class is too long".to_string(),
384                    ));
385                }
386            }
387            while bytes[*position].is_ascii_digit() {
388                *position += 1;
389            }
390            self.chiral_class = parse_chiral_class(&bytes[start_position..*position])?;
391        }
392
393        if bytes[*position] == b'H' {
394            *position += 1;
395            // Theoretically we could have more than 9 hydrogen in extreme cases but it is not accepted in SMILES
396            if bytes[*position].is_ascii_digit() {
397                self.hydrogens
398                    .insert(self.current_atom_index, byte_to_number(bytes[*position]));
399            } else {
400                self.hydrogens.insert(self.current_atom_index, 1);
401            }
402        }
403
404        if bytes[*position] == b'+' || bytes[*position] == b'-' {
405            let sign = bytes[*position];
406            *position += 1;
407            if bytes[*position].is_ascii_digit() {
408                match sign {
409                    b'+' => self.current_atom_charge = Some(byte_to_number(bytes[*position]) as i8),
410                    b'-' => {
411                        self.current_atom_charge = Some(-(byte_to_number(bytes[*position]) as i8))
412                    }
413                    _ => (), // This should never happen
414                }
415            } else {
416                match sign {
417                    b'+' => self.current_atom_charge = Some(1),
418                    b'-' => self.current_atom_charge = Some(-1),
419                    _ => (), // This should never happen
420                }
421            }
422        }
423
424        if bytes[*position] == b':' {
425            *position += 1;
426            if bytes[*position].is_ascii_digit() {
427                self.current_atom_class = Some(byte_to_number(bytes[*position]));
428            } else {
429                return Err(ParseError::AtomClassMissing);
430            }
431        }
432        self.handle_bond()?;
433        Ok(())
434    }
435}
436
437fn byte_to_number(byte: u8) -> u8 {
438    byte - b'0'
439}
440
441fn parse_chiral_class(slice: &[u8]) -> Result<ChiralClass, ParseError> {
442    println!("Slice: {:?}", slice);
443    match slice {
444        s if s.starts_with(b"@@") => Ok(ChiralClass::R),
445        s if s.starts_with(b"@AL") => {
446            let number = parse_number_on_end_of_chiral_class(s);
447            Ok(ChiralClass::AL(number))
448        }
449        s if s.starts_with(b"@SP") => {
450            let number = parse_number_on_end_of_chiral_class(s);
451            Ok(ChiralClass::SP(number))
452        }
453        s if s.starts_with(b"@TB") => {
454            let number = parse_number_on_end_of_chiral_class(s);
455            Ok(ChiralClass::TB(number))
456        }
457        s if s.starts_with(b"@OH") => {
458            let number = parse_number_on_end_of_chiral_class(s);
459            Ok(ChiralClass::OH(number))
460        }
461        s if s.starts_with(b"@") => Ok(ChiralClass::S),
462        _ => Err(ParseError::ChiralClass(
463            String::from_utf8_lossy(slice).to_string(),
464        )),
465    }
466}
467
468fn parse_number_on_end_of_chiral_class(chiral_class: &[u8]) -> u8 {
469    let mut number = 0;
470    for &byte in chiral_class {
471        if byte.is_ascii_digit() {
472            number = number * 10 + byte_to_number(byte);
473        }
474    }
475    number
476}
477
478fn is_valid_isotope(atomic_number: u8, isotope: u16) -> bool {
479    let Some(isotopes) = atomic_number.isotopes() else {
480        // If the atomic number is not found, we assume that the isotope is not valid
481        // This is a bit of a hack but it should work
482        return false;
483    };
484
485    for iso in isotopes {
486        // TODO check if this is correct
487        if iso.mass.round() as u16 == isotope {
488            return true;
489        }
490    }
491    false
492}
493
494#[cfg(test)]
495mod tests {
496    use super::*;
497
498    #[test]
499    fn test_parse_smiles() {
500        let molecules = SMILESParser::parse_smiles("C(C(C))COcCl").unwrap();
501        assert_eq!(molecules[0].atoms.len(), 7);
502    }
503
504    #[test]
505    fn test_parse_smiles_with_isotope() {
506        let molecules = SMILESParser::parse_smiles("CCCC[13C]").unwrap();
507        assert_eq!(molecules[0].atoms.len(), 5);
508        assert_eq!(molecules[0].atoms[4].isotope.unwrap(), 13);
509    }
510
511    #[test]
512    fn test_parse_smiles_with_chiral_class() {
513        let molecules = SMILESParser::parse_smiles("C[C@](F)(Cl)Br").unwrap();
514        assert_eq!(molecules[0].atoms.len(), 5);
515        assert_eq!(molecules[0].atoms[1].chiral_class, ChiralClass::S);
516    }
517
518    #[test]
519    fn test_parse_smiles_with_complex_atom_and_isotope() {
520        let molecules = SMILESParser::parse_smiles("C[C@](F)(Cl)Br[13C]").unwrap();
521        assert_eq!(molecules[0].atoms.len(), 6);
522        assert_eq!(molecules[0].atoms[5].isotope.unwrap(), 13);
523    }
524
525    #[test]
526    fn test_parse_smiles_with_complex_atom_and_hydrogen() {
527        let molecules = SMILESParser::parse_smiles("C[C@H](Cl)C").unwrap();
528        println!("{:#?}", molecules);
529        assert_eq!(molecules[0].atoms.len(), 5);
530        assert_eq!(molecules[0].atoms[1].bonds().len(), 4);
531    }
532
533    #[test]
534    fn test_bond_parsing() {
535        let molecules = SMILESParser::parse_smiles("C-C=C#C").unwrap();
536        assert_eq!(molecules[0].atoms.len(), 4);
537        assert_eq!(molecules[0].get_edges().len(), 3);
538    }
539
540    #[test]
541    fn parse_test_file() {
542        let smiles = std::fs::read_to_string("tests/smiles.txt").unwrap();
543        for smile in smiles.lines() {
544            println!("Parsing: {}", smile);
545            match SMILESParser::parse_smiles(smile) {
546                Ok(molecules) => {
547                    println!("{:?}", molecules);
548                }
549                Err(e) => {
550                    println!("Error: {:?}", e);
551                    panic!();
552                }
553            }
554        }
555    }
556
557    #[test]
558    fn test_submolecule() {
559        let molecules = SMILESParser::parse_smiles("C(C(C))COcCl.C(C(C))").unwrap();
560        let submolecule = molecules[0].match_submolecule(&molecules[1]).unwrap();
561        assert_eq!(submolecule.len(), 3);
562    }
563}