1use chemistry_consts::ElementProperties;
2use crate::molecule::{Atom, Bond, BondType, ChiralClass, Molecule};
3use nohash_hasher::IntMap;
4
5#[derive(Debug)]
6pub enum ParseError {
7 ElementNotFound(String),
8 BondNotFound,
9 RingIndexError,
10 InvalidBranch,
11 InvalidAromatic(String),
12 SMILESComplexError,
13 Charge,
14 ChiralClass(String),
15 AtomClassMissing,
16 EOL,
17}
18
19#[derive(Default)]
20pub struct SMILESParser {
21 atoms: Vec<Atom>,
22 bonds: Vec<(usize, usize, BondType)>,
23 current_atom_index: usize,
24 current_atom_charge: Option<i8>,
25 element_buffer: String,
26 isotope: Option<u16>,
27 chiral_class: ChiralClass,
28 current_atom_class: Option<u8>,
29 atom_classes: IntMap<usize, u8>,
30 last_bond_type: BondType,
31 ring_number: Option<usize>,
32 ring_bonds: IntMap<usize, (Option<usize>, Option<usize>)>,
33 hydrogens: IntMap<usize, u8>,
34 branch_stack: Vec<usize>,
35 branch_exits: usize,
36 is_multiple_branch: bool,
37 is_double_digit: bool,
38 is_aromatic: bool,
39}
40
41impl SMILESParser {
42 pub fn parse_smiles(smiles: &str) -> Result<Vec<Molecule>, ParseError> {
53 let mut parser = SMILESParser::default();
54 let mut molecules = Vec::new();
55
56 let bytes = smiles.as_bytes();
58 let mut pointer = 0;
59 while pointer < bytes.len() {
60 let Some(&byte) = bytes.get(pointer) else {
62 break;
63 };
64 pointer += 1;
65 match byte {
66 b'A'..=b'Z' => {
67 parser.handle_atom(Some(byte))?
69 }
70 b'a'..=b'z' => {
71 match byte {
73 b'b' | b'c' | b'n' | b'o' | b's' | b'p' => {
74 parser.handle_atom(Some(byte))?
75 }
76 b'r' => {
77 if parser.element_buffer == "B" {
78 parser.element_buffer.push('R')
79 } else {
80 return Err(ParseError::InvalidAromatic(
81 parser.element_buffer.clone(),
82 ));
83 }
84 }
85
86 b'l' => {
87 if parser.element_buffer == "C" {
88 parser.element_buffer.push('L')
89 } else {
90 return Err(ParseError::InvalidAromatic(
91 parser.element_buffer.clone(),
92 ));
93 }
94 }
95 anything_else => {
96 parser.element_buffer.push(anything_else as char);
97 return Err(ParseError::InvalidAromatic(parser.element_buffer.clone()));
98 }
99 }
100 }
101
102 b'.' => {
103 parser.handle_atom(None)?;
104 parser.add_all_bonds();
105 molecules
106 .push(Molecule::from_atoms(parser.atoms).with_classes(parser.atom_classes));
107 parser = SMILESParser::default();
108 }
109
110 b'1'..=b'9' => {
111 parser.handle_number(byte)?;
112 }
113 b'-' => {
114 parser.last_bond_type = BondType::Single;
115 }
116 b'=' => {
117 parser.last_bond_type = BondType::Double;
118 }
119 b'#' => {
120 parser.last_bond_type = BondType::Triple;
121 }
122 b'$' => {
123 parser.last_bond_type = BondType::Quadruple;
124 }
125 b':' => {
126 parser.last_bond_type = BondType::Aromatic;
127 }
128 b'(' => {
129 if parser.branch_exits > 0 {
130 parser.is_multiple_branch = true;
131 } else {
132 parser.branch_stack.push(parser.current_atom_index);
133 }
134 }
135 b')' => parser.branch_exits += 1,
136 b'[' => {
137 parser.handle_complex_atom(bytes, &mut pointer)?;
138 }
139
140 b'%' => {
141 parser.is_double_digit = true;
142 }
143 _ => (),
144 }
145 }
146
147 if !parser.element_buffer.is_empty() {
148 parser.handle_atom(None)?
149 }
150
151 parser.add_all_bonds();
152
153 molecules.push(Molecule::from_atoms(parser.atoms).with_classes(parser.atom_classes));
154 Ok(molecules)
155 }
156
157 fn add_all_bonds(&mut self) {
158 #[cfg(debug_assertions)]
159 println!("Ring bonds: {:?}", self.ring_bonds);
160 for (start, end) in self.ring_bonds.values() {
161 if start.is_some() && end.is_some() {
162 self.bonds
163 .push((start.unwrap(), end.unwrap(), BondType::Aromatic));
164 }
165 }
166
167 #[cfg(debug_assertions)]
168 println!("Hydrogens: {:?}", self.hydrogens);
169 for (atom, number) in self.hydrogens.iter() {
170 let hydrogen_index = self.current_atom_index;
171 for index in 0..*number {
172 self.atoms.push(Atom::new(1));
173 self.bonds
174 .push((*atom, hydrogen_index + index as usize, BondType::Single));
175 }
176 }
177
178 for bond in self.bonds.iter() {
180 let atom1 = &mut self.atoms[bond.0];
181 atom1.add_bond(Bond::new(bond.1, bond.2));
182 let atom2 = &mut self.atoms[bond.1];
183 atom2.add_bond(Bond::new(bond.0, bond.2));
184 }
185 }
186
187 fn handle_branch(&mut self) -> Result<(), ParseError> {
188 let mut branch_atom = None;
190 while let Some(branch) = self.branch_stack.pop() {
191 branch_atom = Some(branch);
192 if self.branch_exits > 0 {
193 self.branch_exits -= 1;
194 } else {
195 return Err(ParseError::InvalidBranch);
196 }
197 if self.branch_exits == 0 {
198 break;
199 }
200 }
201
202 if self.branch_exits > 0 {
203 return Err(ParseError::InvalidBranch);
204 }
205
206 if let Some(branch_atom) = branch_atom {
207 self.bonds
208 .push((branch_atom, self.current_atom_index, self.last_bond_type));
209 if self.is_multiple_branch {
210 self.branch_stack.push(branch_atom);
212 self.is_multiple_branch = false;
213 }
214 } else {
215 return Err(ParseError::BondNotFound);
216 }
217 Ok(())
218 }
219 fn handle_bond(&mut self) -> Result<(), ParseError> {
220 if self.current_atom_index == 0 {
221 return Ok(());
222 }
223 if self.branch_exits > 0 && !self.branch_stack.is_empty() {
224 self.handle_branch()?
225 } else {
226 self.bonds.push((
227 self.current_atom_index - 1,
228 self.current_atom_index,
229 self.last_bond_type,
230 ));
231 }
232 self.last_bond_type = BondType::Single;
233 Ok(())
234 }
235
236 fn handle_atom(&mut self, byte: Option<u8>) -> Result<(), ParseError> {
237 if !self.element_buffer.is_empty() {
238 let atomic_number = &self.element_buffer.to_uppercase().as_str().atomic_number()
239 .ok_or(ParseError::ElementNotFound(self.element_buffer.to_owned()))?;
240
241 let mut atom = Atom::new(*atomic_number);
242
243 self.current_atom_index += 1;
244
245 if byte.is_some() {
247 self.handle_bond()?
248 }
249
250 if self.element_buffer.chars().next().unwrap().is_lowercase() || self.is_aromatic {
251 atom = Atom::new(*atomic_number).aromatic();
252 }
253
254 if let Some(isotope) = self.isotope {
255 if is_valid_isotope(*atomic_number, isotope) {
256 atom = atom.with_isotope(isotope);
257 self.isotope = None;
258 } else {
259 println!(
260 "Isotope {} is not valid for atomic number {}, skipping it",
261 isotope, *atomic_number
262 );
263 }
264 }
265
266 if self.chiral_class != ChiralClass::None {
267 atom = atom.with_chiral_class(self.chiral_class);
268 self.chiral_class = ChiralClass::None;
269 }
270
271 if let Some(charge) = self.current_atom_charge {
272 atom = atom.with_charge(charge);
273 self.current_atom_charge = None;
274 }
275
276 if let Some(atom_class) = self.current_atom_class {
277 self.atom_classes
278 .insert(self.current_atom_index, atom_class);
279 self.current_atom_class = None;
280 }
281
282 self.atoms.push(atom);
283 self.element_buffer.clear();
284 }
285 let Some(character_byte) = byte else {
286 return Ok(());
287 };
288 self.element_buffer.push(character_byte as char);
289 Ok(())
290 }
291
292 fn handle_number(&mut self, byte: u8) -> Result<(), ParseError> {
294 if self.is_double_digit {
295 if let Some(ring_number_value) = self.ring_number {
296 self.ring_number = Some(ring_number_value * 10 + byte_to_number(byte) as usize);
298 self.is_double_digit = false;
299 } else {
300 self.ring_number = Some(byte_to_number(byte) as usize);
301 return Ok(());
302 }
303 }
304
305 let Some(ring) = self.ring_number else {
306 return Ok(());
307 };
308
309 let (start, end) = self.ring_bonds.entry(ring).or_insert((None, None));
310 if start.is_none() {
312 *start = Some(self.current_atom_index);
313 } else if end.is_none() {
314 *end = Some(self.current_atom_index);
315 } else {
316 return Err(ParseError::RingIndexError);
317 }
318
319 if start == end {
321 return Err(ParseError::RingIndexError);
322 }
323
324 self.ring_number = None;
325 Ok(())
326 }
327
328 pub fn handle_complex_atom(
329 &mut self,
330 bytes: &[u8],
331 position: &mut usize,
332 ) -> Result<(), ParseError> {
333 self.handle_atom(None)?;
334
335 if bytes[*position].is_ascii_digit() {
336 let mut temp_number: u16 = 0;
337 while bytes[*position].is_ascii_digit() {
338 let number = byte_to_number(bytes[*position]);
339 temp_number = temp_number * 10 + number as u16;
340 *position += 1;
341 }
342 self.isotope = Some(temp_number);
343 }
344
345 let mut is_se_or_as = false;
346 let test = [bytes[*position], bytes[*position + 1]];
347 match &test {
348 b"se" => {
349 is_se_or_as = true;
350 self.element_buffer.push_str("SE");
351 self.is_aromatic = true;
352 }
353 b"as" => {
354 is_se_or_as = true;
355 self.element_buffer.push_str("AS");
356 self.is_aromatic = true;
357 }
358 _ => (),
359 }
360
361 if bytes[*position].is_ascii_uppercase() && !is_se_or_as {
362 self.element_buffer.push(bytes[*position] as char);
363 if bytes[*position + 1].is_ascii_lowercase() {
364 *position += 1;
365 self.element_buffer.push(bytes[*position] as char);
366 }
367 }
368
369 *position += 1;
370 let start_position = *position;
371
372 if bytes[*position] == b'@' {
373 let mut temp_counter = 0;
374 *position += 1;
375 while bytes[*position].is_ascii_uppercase() {
376 if bytes[*position] == b'H' && temp_counter == 0 {
377 break;
378 }
379 *position += 1;
380 temp_counter += 1;
381 if temp_counter > 2 {
382 return Err(ParseError::ChiralClass(
383 "Chiral class is too long".to_string(),
384 ));
385 }
386 }
387 while bytes[*position].is_ascii_digit() {
388 *position += 1;
389 }
390 self.chiral_class = parse_chiral_class(&bytes[start_position..*position])?;
391 }
392
393 if bytes[*position] == b'H' {
394 *position += 1;
395 if bytes[*position].is_ascii_digit() {
397 self.hydrogens
398 .insert(self.current_atom_index, byte_to_number(bytes[*position]));
399 } else {
400 self.hydrogens.insert(self.current_atom_index, 1);
401 }
402 }
403
404 if bytes[*position] == b'+' || bytes[*position] == b'-' {
405 let sign = bytes[*position];
406 *position += 1;
407 if bytes[*position].is_ascii_digit() {
408 match sign {
409 b'+' => self.current_atom_charge = Some(byte_to_number(bytes[*position]) as i8),
410 b'-' => {
411 self.current_atom_charge = Some(-(byte_to_number(bytes[*position]) as i8))
412 }
413 _ => (), }
415 } else {
416 match sign {
417 b'+' => self.current_atom_charge = Some(1),
418 b'-' => self.current_atom_charge = Some(-1),
419 _ => (), }
421 }
422 }
423
424 if bytes[*position] == b':' {
425 *position += 1;
426 if bytes[*position].is_ascii_digit() {
427 self.current_atom_class = Some(byte_to_number(bytes[*position]));
428 } else {
429 return Err(ParseError::AtomClassMissing);
430 }
431 }
432 self.handle_bond()?;
433 Ok(())
434 }
435}
436
437fn byte_to_number(byte: u8) -> u8 {
438 byte - b'0'
439}
440
441fn parse_chiral_class(slice: &[u8]) -> Result<ChiralClass, ParseError> {
442 println!("Slice: {:?}", slice);
443 match slice {
444 s if s.starts_with(b"@@") => Ok(ChiralClass::R),
445 s if s.starts_with(b"@AL") => {
446 let number = parse_number_on_end_of_chiral_class(s);
447 Ok(ChiralClass::AL(number))
448 }
449 s if s.starts_with(b"@SP") => {
450 let number = parse_number_on_end_of_chiral_class(s);
451 Ok(ChiralClass::SP(number))
452 }
453 s if s.starts_with(b"@TB") => {
454 let number = parse_number_on_end_of_chiral_class(s);
455 Ok(ChiralClass::TB(number))
456 }
457 s if s.starts_with(b"@OH") => {
458 let number = parse_number_on_end_of_chiral_class(s);
459 Ok(ChiralClass::OH(number))
460 }
461 s if s.starts_with(b"@") => Ok(ChiralClass::S),
462 _ => Err(ParseError::ChiralClass(
463 String::from_utf8_lossy(slice).to_string(),
464 )),
465 }
466}
467
468fn parse_number_on_end_of_chiral_class(chiral_class: &[u8]) -> u8 {
469 let mut number = 0;
470 for &byte in chiral_class {
471 if byte.is_ascii_digit() {
472 number = number * 10 + byte_to_number(byte);
473 }
474 }
475 number
476}
477
478fn is_valid_isotope(atomic_number: u8, isotope: u16) -> bool {
479 let Some(isotopes) = atomic_number.isotopes() else {
480 return false;
483 };
484
485 for iso in isotopes {
486 if iso.mass.round() as u16 == isotope {
488 return true;
489 }
490 }
491 false
492}
493
494#[cfg(test)]
495mod tests {
496 use super::*;
497
498 #[test]
499 fn test_parse_smiles() {
500 let molecules = SMILESParser::parse_smiles("C(C(C))COcCl").unwrap();
501 assert_eq!(molecules[0].atoms.len(), 7);
502 }
503
504 #[test]
505 fn test_parse_smiles_with_isotope() {
506 let molecules = SMILESParser::parse_smiles("CCCC[13C]").unwrap();
507 assert_eq!(molecules[0].atoms.len(), 5);
508 assert_eq!(molecules[0].atoms[4].isotope.unwrap(), 13);
509 }
510
511 #[test]
512 fn test_parse_smiles_with_chiral_class() {
513 let molecules = SMILESParser::parse_smiles("C[C@](F)(Cl)Br").unwrap();
514 assert_eq!(molecules[0].atoms.len(), 5);
515 assert_eq!(molecules[0].atoms[1].chiral_class, ChiralClass::S);
516 }
517
518 #[test]
519 fn test_parse_smiles_with_complex_atom_and_isotope() {
520 let molecules = SMILESParser::parse_smiles("C[C@](F)(Cl)Br[13C]").unwrap();
521 assert_eq!(molecules[0].atoms.len(), 6);
522 assert_eq!(molecules[0].atoms[5].isotope.unwrap(), 13);
523 }
524
525 #[test]
526 fn test_parse_smiles_with_complex_atom_and_hydrogen() {
527 let molecules = SMILESParser::parse_smiles("C[C@H](Cl)C").unwrap();
528 println!("{:#?}", molecules);
529 assert_eq!(molecules[0].atoms.len(), 5);
530 assert_eq!(molecules[0].atoms[1].bonds().len(), 4);
531 }
532
533 #[test]
534 fn test_bond_parsing() {
535 let molecules = SMILESParser::parse_smiles("C-C=C#C").unwrap();
536 assert_eq!(molecules[0].atoms.len(), 4);
537 assert_eq!(molecules[0].get_edges().len(), 3);
538 }
539
540 #[test]
541 fn parse_test_file() {
542 let smiles = std::fs::read_to_string("tests/smiles.txt").unwrap();
543 for smile in smiles.lines() {
544 println!("Parsing: {}", smile);
545 match SMILESParser::parse_smiles(smile) {
546 Ok(molecules) => {
547 println!("{:?}", molecules);
548 }
549 Err(e) => {
550 println!("Error: {:?}", e);
551 panic!();
552 }
553 }
554 }
555 }
556
557 #[test]
558 fn test_submolecule() {
559 let molecules = SMILESParser::parse_smiles("C(C(C))COcCl.C(C(C))").unwrap();
560 let submolecule = molecules[0].match_submolecule(&molecules[1]).unwrap();
561 assert_eq!(submolecule.len(), 3);
562 }
563}