1use crate::graph::{Atom, Bond, BondOrder, BondStereo, ChiralType, Hybridization, Molecule};
2use nalgebra::Vector3;
3use petgraph::graph::NodeIndex;
4use std::collections::HashMap;
5
6pub struct SmilesParser<'a> {
15 input: &'a [u8],
16 pos: usize,
17 rings: HashMap<u32, (NodeIndex, BondOrder, BondStereo)>,
18 mol: &'a mut Molecule,
19}
20
21impl<'a> SmilesParser<'a> {
22 pub fn new(smiles: &'a str, mol: &'a mut Molecule) -> Self {
24 SmilesParser {
25 input: smiles.as_bytes(),
26 pos: 0,
27 rings: HashMap::new(),
28 mol,
29 }
30 }
31
32 pub fn parse(&mut self) -> Result<(), String> {
36 self.parse_chain(None)?;
37 self.add_implicit_hydrogens();
39 Ok(())
40 }
41
42 fn peek(&self) -> Option<u8> {
43 self.input.get(self.pos).copied()
44 }
45
46 fn advance(&mut self) {
47 self.pos += 1;
48 }
49
50 fn parse_chain(&mut self, mut prev_node: Option<NodeIndex>) -> Result<(), String> {
51 let mut prev_bond_order = BondOrder::Single;
52 let mut prev_bond_stereo = BondStereo::None;
53
54 let get_bond_order =
55 |m: &Molecule, prev: NodeIndex, curr: NodeIndex, order: BondOrder| -> BondOrder {
56 if order == BondOrder::Single
57 && m.graph[prev].hybridization == Hybridization::SP2
58 && m.graph[curr].hybridization == Hybridization::SP2
59 {
60 BondOrder::Aromatic
61 } else {
62 order
63 }
64 };
65
66 while self.pos < self.input.len() {
67 let c = self.peek().unwrap();
68 match c {
69 b'(' => {
70 self.advance();
71 self.parse_chain(prev_node)?;
72 if self.peek() == Some(b')') {
73 self.advance();
74 } else {
75 return Err("Expected ')'".to_string());
76 }
77 }
78 b')' => {
79 break;
80 }
81 b'-' => {
82 self.advance();
83 prev_bond_order = BondOrder::Single;
84 }
85 b'=' => {
86 self.advance();
87 prev_bond_order = BondOrder::Double;
88 }
89 b'#' => {
90 self.advance();
91 prev_bond_order = BondOrder::Triple;
92 }
93 b':' => {
94 self.advance();
95 prev_bond_order = BondOrder::Aromatic;
96 }
97 b'.' => {
98 self.advance();
100 prev_node = None;
101 prev_bond_order = BondOrder::Single;
102 prev_bond_stereo = BondStereo::None;
103 }
104 b'/' => {
105 self.advance();
106 prev_bond_order = BondOrder::Single;
107 prev_bond_stereo = BondStereo::E;
108 } b'\\' => {
110 self.advance();
111 prev_bond_order = BondOrder::Single;
112 prev_bond_stereo = BondStereo::Z;
113 }
114 b'%' | b'0'..=b'9' => {
115 let mut ring_num = 0;
116 if c == b'%' {
117 self.advance();
118 if let (Some(d1), Some(d2)) = (self.peek(), self.input.get(self.pos + 1)) {
119 if d1.is_ascii_digit() && d2.is_ascii_digit() {
120 ring_num = ((d1 - b'0') * 10 + (d2 - b'0')) as u32;
121 self.advance();
122 self.advance();
123 }
124 }
125 } else {
126 ring_num = (c - b'0') as u32;
127 self.advance();
128 }
129 if ring_num > 0 {
130 if let Some(prev) = prev_node {
131 if let Some((target, order, stereo)) = self.rings.remove(&ring_num) {
132 let mut final_order = if prev_bond_order != BondOrder::Single {
133 prev_bond_order
134 } else {
135 order
136 };
137 final_order = get_bond_order(self.mol, prev, target, final_order);
138 let final_stereo = if prev_bond_stereo != BondStereo::None {
139 prev_bond_stereo
140 } else {
141 stereo
142 };
143 self.mol.add_bond(
144 prev,
145 target,
146 Bond {
147 order: final_order,
148 stereo: final_stereo,
149 },
150 );
151 } else {
152 self.rings
153 .insert(ring_num, (prev, prev_bond_order, prev_bond_stereo));
154 }
155 prev_bond_order = BondOrder::Single;
157 prev_bond_stereo = BondStereo::None;
158 }
159 }
160 }
161 b'[' => {
162 self.advance();
163 let node = self.parse_bracket_atom()?;
164 if let Some(prev) = prev_node {
165 let order = get_bond_order(self.mol, prev, node, prev_bond_order);
166 self.mol.add_bond(
167 prev,
168 node,
169 Bond {
170 order,
171 stereo: prev_bond_stereo,
172 },
173 );
174 }
175 prev_node = Some(node);
176 prev_bond_order = BondOrder::Single;
177 prev_bond_stereo = BondStereo::None;
178 }
179 _ if c.is_ascii_alphabetic() || c == b'*' => {
180 let node = self.parse_organic_subset()?;
181 if let Some(prev) = prev_node {
182 let order = get_bond_order(self.mol, prev, node, prev_bond_order);
183 self.mol.add_bond(
184 prev,
185 node,
186 Bond {
187 order,
188 stereo: prev_bond_stereo,
189 },
190 );
191 }
192 prev_node = Some(node);
193 prev_bond_order = BondOrder::Single;
194 prev_bond_stereo = BondStereo::None;
195 }
196 _ => return Err(format!("Unexpected character: {}", c as char)),
197 }
198 }
199 Ok(())
200 }
201
202 fn parse_organic_subset(&mut self) -> Result<NodeIndex, String> {
203 let mut elem_str = String::new();
204 let c = self.peek().unwrap();
205 self.advance();
206 elem_str.push(c as char);
207 if let Some(nc) = self.peek() {
208 if nc.is_ascii_lowercase()
209 && (c == b'C' && (nc == b'l' || nc == b'u' || nc == b'o' || nc == b'r')
210 || c == b'B' && nc == b'r')
211 {
212 elem_str.push(nc as char);
213 self.advance();
214 }
215 }
216
217 let mut aromatic = false;
219 if elem_str.chars().next().unwrap().is_lowercase() {
220 aromatic = true;
221 elem_str = elem_str.to_uppercase();
222 }
223
224 let element = match elem_str.as_str() {
225 "C" => 6,
226 "N" => 7,
227 "O" => 8,
228 "P" => 15,
229 "S" => 16,
230 "F" => 9,
231 "CL" | "Cl" => 17,
232 "BR" | "Br" => 35,
233 "I" => 53,
234 "B" => 5,
235 "*" => 0,
236 &_ => return Err(format!("Unknown element: {}", elem_str)),
237 };
238
239 let atom = Atom {
240 element,
241 position: Vector3::zeros(),
242 charge: 0.0,
243 formal_charge: 0,
244 hybridization: if aromatic {
245 Hybridization::SP2
246 } else {
247 Hybridization::SP3
248 },
249 chiral_tag: ChiralType::Unspecified,
250 explicit_h: 0,
251 };
252
253 Ok(self.mol.add_atom(atom))
254 }
255
256 fn parse_bracket_atom(&mut self) -> Result<NodeIndex, String> {
257 let mut isotope = 0u32;
258 let mut aromatic = false;
259 let mut chiral = ChiralType::Unspecified;
260 let mut h_count = 0;
261 let mut charge = 0;
262
263 while let Some(c) = self.peek() {
265 if c.is_ascii_digit() {
266 isotope = isotope * 10 + (c - b'0') as u32;
267 self.advance();
268 } else {
269 break;
270 }
271 }
272
273 let mut elem_str = String::new();
275 if let Some(c) = self.peek() {
276 if c.is_ascii_alphabetic() {
277 elem_str.push(c as char);
278 self.advance();
279 if let Some(nc) = self.peek() {
280 if nc.is_ascii_lowercase() && c.is_ascii_uppercase() {
281 elem_str.push(nc as char);
282 self.advance();
283 }
284 }
285 } else if c == b'*' {
286 elem_str.push('*');
287 self.advance();
288 }
289 }
290
291 if elem_str.chars().next().unwrap().is_lowercase() {
292 aromatic = true;
293 elem_str = elem_str.to_uppercase();
294 }
295
296 let element: u8 = match elem_str.as_str() {
297 "H" => 1,
298 "HE" | "He" => 2,
299 "LI" | "Li" => 3,
300 "BE" | "Be" => 4,
301 "B" => 5,
302 "C" => 6,
303 "N" => 7,
304 "O" => 8,
305 "F" => 9,
306 "NE" | "Ne" => 10,
307 "NA" | "Na" => 11,
308 "MG" | "Mg" => 12,
309 "AL" | "Al" => 13,
310 "SI" | "Si" => 14,
311 "P" => 15,
312 "S" => 16,
313 "CL" | "Cl" => 17,
314 "AR" | "Ar" => 18,
315 "K" => 19,
316 "CA" | "Ca" => 20,
317 "SC" | "Sc" => 21,
318 "TI" | "Ti" => 22,
319 "V" => 23,
320 "CR" | "Cr" => 24,
321 "MN" | "Mn" => 25,
322 "FE" | "Fe" => 26,
323 "CO" | "Co" => 27,
324 "NI" | "Ni" => 28,
325 "CU" | "Cu" => 29,
326 "ZN" | "Zn" => 30,
327 "GA" | "Ga" => 31,
328 "GE" | "Ge" => 32,
329 "AS" | "As" => 33,
330 "SE" | "Se" => 34,
331 "BR" | "Br" => 35,
332 "KR" | "Kr" => 36,
333 "RB" | "Rb" => 37,
334 "SR" | "Sr" => 38,
335 "Y" => 39,
336 "ZR" | "Zr" => 40,
337 "MO" | "Mo" => 42,
338 "RU" | "Ru" => 44,
339 "RH" | "Rh" => 45,
340 "PD" | "Pd" => 46,
341 "AG" | "Ag" => 47,
342 "CD" | "Cd" => 48,
343 "IN" | "In" => 49,
344 "SN" | "Sn" => 50,
345 "SB" | "Sb" => 51,
346 "TE" | "Te" => 52,
347 "I" => 53,
348 "XE" | "Xe" => 54,
349 "CS" | "Cs" => 55,
350 "BA" | "Ba" => 56,
351 "LA" | "La" => 57,
352 "W" => 74,
353 "RE" | "Re" => 75,
354 "OS" | "Os" => 76,
355 "IR" | "Ir" => 77,
356 "PT" | "Pt" => 78,
357 "AU" | "Au" => 79,
358 "HG" | "Hg" => 80,
359 "TL" | "Tl" => 81,
360 "PB" | "Pb" => 82,
361 "BI" | "Bi" => 83,
362 "*" => 0,
363 _ => return Err(format!("Unknown bracket element: {}", elem_str)),
364 };
365
366 if let Some(c) = self.peek() {
368 if c == b'@' {
369 self.advance();
370 chiral = ChiralType::TetrahedralCCW;
371 if self.peek() == Some(b'@') {
372 self.advance();
373 chiral = ChiralType::TetrahedralCW;
374 }
375 }
376 }
377
378 if let Some(c) = self.peek() {
380 if c == b'H' {
381 self.advance();
382 h_count = 1;
383 if let Some(hc) = self.peek() {
384 if hc.is_ascii_digit() {
385 h_count = hc - b'0';
386 self.advance();
387 }
388 }
389 }
390 }
391
392 if let Some(c) = self.peek() {
394 if c == b'+' {
395 self.advance();
396 charge = 1;
397 if let Some(cc) = self.peek() {
398 if cc.is_ascii_digit() {
399 charge = (cc - b'0') as i8;
400 self.advance();
401 } else if cc == b'+' {
402 charge = 2;
403 self.advance();
404 if self.peek() == Some(b'+') {
405 charge = 3;
406 self.advance();
407 }
408 }
409 }
410 } else if c == b'-' {
411 self.advance();
412 charge = -1;
413 if let Some(cc) = self.peek() {
414 if cc.is_ascii_digit() {
415 charge = -((cc - b'0') as i8);
416 self.advance();
417 } else if cc == b'-' {
418 charge = -2;
419 self.advance();
420 if self.peek() == Some(b'-') {
421 charge = -3;
422 self.advance();
423 }
424 }
425 }
426 }
427 }
428
429 if self.peek() == Some(b']') {
430 self.advance();
431 } else {
432 return Err("Expected ']'".to_string());
433 }
434
435 let atom = Atom {
436 element,
437 position: Vector3::zeros(),
438 charge: charge as f32,
439 formal_charge: charge,
440 hybridization: if aromatic {
441 Hybridization::SP2
442 } else {
443 Hybridization::SP3
444 }, chiral_tag: chiral,
446 explicit_h: h_count,
447 };
448
449 let node = self.mol.add_atom(atom);
450
451 Ok(node)
452 }
453
454 fn add_implicit_hydrogens(&mut self) {
455 let n = self.mol.graph.node_count();
456 let mut to_add = Vec::new();
457
458 for i in 0..n {
459 let ni = NodeIndex::new(i);
460 let atom = &self.mol.graph[ni];
461 if atom.element == 1 || atom.element == 0 {
462 continue;
463 } let mut current_valents = 0;
466 let mut double_bonds = 0;
467 let mut triple_bonds = 0;
468 let mut aromatic_bonds = 0;
469
470 for edge in self.mol.graph.edges(ni) {
471 match edge.weight().order {
472 BondOrder::Single => {
473 current_valents += 1;
474 }
475 BondOrder::Double => {
476 current_valents += 2;
477 double_bonds += 1;
478 }
479 BondOrder::Triple => {
480 current_valents += 3;
481 triple_bonds += 1;
482 }
483 BondOrder::Aromatic => {
484 current_valents += 1;
485 aromatic_bonds += 1;
486 }
487 _ => {
488 current_valents += 1;
489 }
490 };
491 }
492
493 if aromatic_bonds == 0 {
495 let actual_hybridization = if triple_bonds > 0 || double_bonds > 1 {
496 Hybridization::SP
497 } else if double_bonds == 1 {
498 Hybridization::SP2
499 } else {
500 let elem = atom.element;
503 let is_conjugated = if (elem == 7 || elem == 8) && double_bonds == 0 {
504 self.mol.graph.neighbors(ni).any(|nb| {
506 self.mol.graph.edges(nb).any(|e| {
507 matches!(e.weight().order, BondOrder::Double | BondOrder::Aromatic)
508 })
509 })
510 } else {
511 false
512 };
513 if is_conjugated {
514 Hybridization::SP2
515 } else {
516 Hybridization::SP3
517 }
518 };
519
520 if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
521 atom_mut.hybridization = actual_hybridization;
522 }
523 } else if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
524 atom_mut.hybridization = Hybridization::SP2;
525 }
526
527 let atom = &self.mol.graph[ni];
528 let target_val = match atom.element {
530 5 => 3, 6 => 4, 7 => {
533 if atom.formal_charge == 1 {
534 4
535 } else {
536 3
537 }
538 } 8 => {
540 if atom.formal_charge == 1 {
541 3
542 } else {
543 2
544 }
545 } 9 | 17 | 35 | 53 => 1, 14 => 4, 15 => 3, 16 => 2, 32 => 4, 33 => 3, 34 => 2, 50 => 4, 51 => 3, 52 => 2, _ => 0,
557 };
558
559 let mut h_needed = target_val - current_valents - atom.formal_charge.abs() as i32;
560
561 if atom.hybridization == Hybridization::SP2 && atom.element == 6 {
562 let aromatic_bonds = self
564 .mol
565 .graph
566 .edges(ni)
567 .filter(|e| e.weight().order == BondOrder::Aromatic)
568 .count();
569 if aromatic_bonds == 2 {
570 h_needed = 1 - (current_valents - 2); } else if aromatic_bonds == 3 {
572 h_needed = 0; }
574 }
575
576 if aromatic_bonds > 0 && atom.element == 7 && atom.explicit_h == 0 {
580 h_needed = 0;
581 }
582
583 let total_h_for_this_atom = atom.explicit_h as i32
586 + if h_needed - (atom.explicit_h as i32) > 0 {
587 h_needed - (atom.explicit_h as i32)
588 } else {
589 0
590 };
591
592 if total_h_for_this_atom > 0 {
593 to_add.push((ni, total_h_for_this_atom as u8));
594 }
595 }
596
597 for (parent, count) in to_add {
599 for _ in 0..count {
600 let h_node = self.mol.add_atom(Atom {
601 element: 1,
602 position: Vector3::zeros(),
603 charge: 0.0,
604 formal_charge: 0,
605 hybridization: Hybridization::Unknown,
606 chiral_tag: ChiralType::Unspecified,
607 explicit_h: 0,
608 });
609 self.mol.add_bond(
610 parent,
611 h_node,
612 Bond {
613 order: BondOrder::Single,
614 stereo: BondStereo::None,
615 },
616 );
617 }
618 }
619 }
620}
621
622#[cfg(test)]
623mod tests {
624 use super::*;
625
626 #[test]
627 fn test_parse_benzene_aliphatic() {
628 let mol = Molecule::from_smiles("C1=CC=CC=C1").unwrap();
629 assert_eq!(mol.graph.node_count(), 12); assert_eq!(mol.graph.edge_count(), 12); }
632
633 #[test]
634 fn test_parse_benzene_aromatic() {
635 let mol = Molecule::from_smiles("c1ccccc1").unwrap();
636 assert_eq!(mol.graph.node_count(), 12); assert_eq!(mol.graph.edge_count(), 12); }
639
640 #[test]
641 fn test_parse_brackets_and_branches() {
642 let mol = Molecule::from_smiles("C(C)(C)C").unwrap(); assert_eq!(mol.graph.node_count(), 14); }
645}