1use crate::graph::{Atom, Bond, BondOrder, BondStereo, ChiralType, Hybridization, Molecule};
2use nalgebra::Vector3;
3use petgraph::graph::NodeIndex;
4use std::collections::HashMap;
5
6pub struct SmilesParser<'a> {
15 input: &'a [u8],
16 pos: usize,
17 rings: HashMap<u32, (NodeIndex, BondOrder, BondStereo)>,
18 mol: &'a mut Molecule,
19}
20
21impl<'a> SmilesParser<'a> {
22 pub fn new(smiles: &'a str, mol: &'a mut Molecule) -> Self {
24 SmilesParser {
25 input: smiles.as_bytes(),
26 pos: 0,
27 rings: HashMap::new(),
28 mol,
29 }
30 }
31
32 pub fn parse(&mut self) -> Result<(), String> {
36 self.parse_chain(None)?;
37 self.add_implicit_hydrogens();
39 Ok(())
40 }
41
42 fn peek(&self) -> Option<u8> {
43 self.input.get(self.pos).copied()
44 }
45
46 fn advance(&mut self) {
47 self.pos += 1;
48 }
49
50 fn parse_chain(&mut self, mut prev_node: Option<NodeIndex>) -> Result<(), String> {
51 let mut prev_bond_order = BondOrder::Single;
52 let mut prev_bond_stereo = BondStereo::None;
53
54 let get_bond_order =
55 |m: &Molecule, prev: NodeIndex, curr: NodeIndex, order: BondOrder| -> BondOrder {
56 if order == BondOrder::Single
57 && m.graph[prev].hybridization == Hybridization::SP2
58 && m.graph[curr].hybridization == Hybridization::SP2
59 {
60 BondOrder::Aromatic
61 } else {
62 order
63 }
64 };
65
66 while self.pos < self.input.len() {
67 let c = self.peek().unwrap();
68 match c {
69 b'(' => {
70 self.advance();
71 self.parse_chain(prev_node)?;
72 if self.peek() == Some(b')') {
73 self.advance();
74 } else {
75 return Err("Expected ')'".to_string());
76 }
77 }
78 b')' => {
79 break;
80 }
81 b'-' => {
82 self.advance();
83 prev_bond_order = BondOrder::Single;
84 }
85 b'=' => {
86 self.advance();
87 prev_bond_order = BondOrder::Double;
88 }
89 b'#' => {
90 self.advance();
91 prev_bond_order = BondOrder::Triple;
92 }
93 b':' => {
94 self.advance();
95 prev_bond_order = BondOrder::Aromatic;
96 }
97 b'.' => {
98 self.advance();
100 prev_node = None;
101 prev_bond_order = BondOrder::Single;
102 prev_bond_stereo = BondStereo::None;
103 }
104 b'/' => {
105 self.advance();
106 prev_bond_order = BondOrder::Single;
107 prev_bond_stereo = BondStereo::E;
108 } b'\\' => {
110 self.advance();
111 prev_bond_order = BondOrder::Single;
112 prev_bond_stereo = BondStereo::Z;
113 }
114 b'%' | b'0'..=b'9' => {
115 let mut ring_num = 0;
116 if c == b'%' {
117 self.advance();
118 if let (Some(d1), Some(d2)) = (self.peek(), self.input.get(self.pos + 1)) {
119 if d1.is_ascii_digit() && d2.is_ascii_digit() {
120 ring_num = ((d1 - b'0') * 10 + (d2 - b'0')) as u32;
121 self.advance();
122 self.advance();
123 }
124 }
125 } else {
126 ring_num = (c - b'0') as u32;
127 self.advance();
128 }
129 if ring_num > 0 {
130 if let Some(prev) = prev_node {
131 if let Some((target, order, stereo)) = self.rings.remove(&ring_num) {
132 let mut final_order = if prev_bond_order != BondOrder::Single {
133 prev_bond_order
134 } else {
135 order
136 };
137 final_order = get_bond_order(self.mol, prev, target, final_order);
138 let final_stereo = if prev_bond_stereo != BondStereo::None {
139 prev_bond_stereo
140 } else {
141 stereo
142 };
143 self.mol.add_bond(
144 prev,
145 target,
146 Bond {
147 order: final_order,
148 stereo: final_stereo,
149 },
150 );
151 } else {
152 self.rings
153 .insert(ring_num, (prev, prev_bond_order, prev_bond_stereo));
154 }
155 prev_bond_order = BondOrder::Single;
157 prev_bond_stereo = BondStereo::None;
158 }
159 }
160 }
161 b'[' => {
162 self.advance();
163 let node = self.parse_bracket_atom()?;
164 if let Some(prev) = prev_node {
165 let order = get_bond_order(self.mol, prev, node, prev_bond_order);
166 self.mol.add_bond(
167 prev,
168 node,
169 Bond {
170 order,
171 stereo: prev_bond_stereo,
172 },
173 );
174 }
175 prev_node = Some(node);
176 prev_bond_order = BondOrder::Single;
177 prev_bond_stereo = BondStereo::None;
178 }
179 _ if c.is_ascii_alphabetic() || c == b'*' => {
180 let node = self.parse_organic_subset()?;
181 if let Some(prev) = prev_node {
182 let order = get_bond_order(self.mol, prev, node, prev_bond_order);
183 self.mol.add_bond(
184 prev,
185 node,
186 Bond {
187 order,
188 stereo: prev_bond_stereo,
189 },
190 );
191 }
192 prev_node = Some(node);
193 prev_bond_order = BondOrder::Single;
194 prev_bond_stereo = BondStereo::None;
195 }
196 _ => return Err(format!("Unexpected character: {}", c as char)),
197 }
198 }
199 Ok(())
200 }
201
202 fn parse_organic_subset(&mut self) -> Result<NodeIndex, String> {
203 let mut elem_str = String::new();
204 let c = self.peek().unwrap();
205 self.advance();
206 elem_str.push(c as char);
207 if let Some(nc) = self.peek() {
208 if nc.is_ascii_lowercase()
209 && (c == b'C' && (nc == b'l' || nc == b'u' || nc == b'o' || nc == b'r')
210 || c == b'B' && nc == b'r')
211 {
212 elem_str.push(nc as char);
213 self.advance();
214 }
215 }
216
217 let mut aromatic = false;
219 if elem_str.chars().next().unwrap().is_lowercase() {
220 aromatic = true;
221 elem_str = elem_str.to_uppercase();
222 }
223
224 let element = match elem_str.as_str() {
225 "C" => 6,
226 "N" => 7,
227 "O" => 8,
228 "P" => 15,
229 "S" => 16,
230 "F" => 9,
231 "CL" | "Cl" => 17,
232 "BR" | "Br" => 35,
233 "I" => 53,
234 "B" => 5,
235 "*" => 0,
236 &_ => return Err(format!("Unknown element: {}", elem_str)),
237 };
238
239 let atom = Atom {
240 element,
241 position: Vector3::zeros(),
242 charge: 0.0,
243 formal_charge: 0,
244 hybridization: if aromatic {
245 Hybridization::SP2
246 } else {
247 Hybridization::SP3
248 },
249 chiral_tag: ChiralType::Unspecified,
250 explicit_h: 0,
251 };
252
253 Ok(self.mol.add_atom(atom))
254 }
255
256 fn parse_bracket_atom(&mut self) -> Result<NodeIndex, String> {
257 let mut isotope = 0u32;
258 let mut aromatic = false;
259 let mut chiral = ChiralType::Unspecified;
260 let mut h_count = 0;
261 let mut charge = 0;
262
263 while let Some(c) = self.peek() {
265 if c.is_ascii_digit() {
266 isotope = isotope * 10 + (c - b'0') as u32;
267 self.advance();
268 } else {
269 break;
270 }
271 }
272
273 let mut elem_str = String::new();
275 if let Some(c) = self.peek() {
276 if c.is_ascii_alphabetic() {
277 elem_str.push(c as char);
278 self.advance();
279 if let Some(nc) = self.peek() {
280 if nc.is_ascii_lowercase() && c.is_ascii_uppercase() {
281 elem_str.push(nc as char);
282 self.advance();
283 }
284 }
285 } else if c == b'*' {
286 elem_str.push('*');
287 self.advance();
288 }
289 }
290
291 if elem_str.chars().next().unwrap().is_lowercase() {
292 aromatic = true;
293 elem_str = elem_str.to_uppercase();
294 }
295
296 let element: u8 = match elem_str.as_str() {
297 "H" => 1,
298 "HE" | "He" => 2,
299 "LI" | "Li" => 3,
300 "BE" | "Be" => 4,
301 "B" => 5,
302 "C" => 6,
303 "N" => 7,
304 "O" => 8,
305 "F" => 9,
306 "NE" | "Ne" => 10,
307 "NA" | "Na" => 11,
308 "MG" | "Mg" => 12,
309 "AL" | "Al" => 13,
310 "SI" | "Si" => 14,
311 "P" => 15,
312 "S" => 16,
313 "CL" | "Cl" => 17,
314 "AR" | "Ar" => 18,
315 "K" => 19,
316 "CA" | "Ca" => 20,
317 "SC" | "Sc" => 21,
318 "TI" | "Ti" => 22,
319 "V" => 23,
320 "CR" | "Cr" => 24,
321 "MN" | "Mn" => 25,
322 "FE" | "Fe" => 26,
323 "CO" | "Co" => 27,
324 "NI" | "Ni" => 28,
325 "CU" | "Cu" => 29,
326 "ZN" | "Zn" => 30,
327 "GA" | "Ga" => 31,
328 "GE" | "Ge" => 32,
329 "AS" | "As" => 33,
330 "SE" | "Se" => 34,
331 "BR" | "Br" => 35,
332 "KR" | "Kr" => 36,
333 "RB" | "Rb" => 37,
334 "SR" | "Sr" => 38,
335 "Y" => 39,
336 "ZR" | "Zr" => 40,
337 "NB" | "Nb" => 41,
338 "MO" | "Mo" => 42,
339 "RU" | "Ru" => 44,
340 "RH" | "Rh" => 45,
341 "PD" | "Pd" => 46,
342 "AG" | "Ag" => 47,
343 "CD" | "Cd" => 48,
344 "IN" | "In" => 49,
345 "SN" | "Sn" => 50,
346 "SB" | "Sb" => 51,
347 "TE" | "Te" => 52,
348 "I" => 53,
349 "XE" | "Xe" => 54,
350 "CS" | "Cs" => 55,
351 "BA" | "Ba" => 56,
352 "LA" | "La" => 57,
353 "W" => 74,
354 "RE" | "Re" => 75,
355 "OS" | "Os" => 76,
356 "IR" | "Ir" => 77,
357 "PT" | "Pt" => 78,
358 "AU" | "Au" => 79,
359 "HG" | "Hg" => 80,
360 "TL" | "Tl" => 81,
361 "PB" | "Pb" => 82,
362 "BI" | "Bi" => 83,
363 "*" => 0,
364 _ => return Err(format!("Unknown bracket element: {}", elem_str)),
365 };
366
367 if let Some(c) = self.peek() {
369 if c == b'@' {
370 self.advance();
371 chiral = ChiralType::TetrahedralCCW;
372 if self.peek() == Some(b'@') {
373 self.advance();
374 chiral = ChiralType::TetrahedralCW;
375 }
376 }
377 }
378
379 if let Some(c) = self.peek() {
381 if c == b'H' {
382 self.advance();
383 h_count = 1;
384 if let Some(hc) = self.peek() {
385 if hc.is_ascii_digit() {
386 h_count = hc - b'0';
387 self.advance();
388 }
389 }
390 }
391 }
392
393 if let Some(c) = self.peek() {
395 if c == b'+' {
396 self.advance();
397 charge = 1;
398 if let Some(cc) = self.peek() {
399 if cc.is_ascii_digit() {
400 charge = (cc - b'0') as i8;
401 self.advance();
402 } else if cc == b'+' {
403 charge = 2;
404 self.advance();
405 if self.peek() == Some(b'+') {
406 charge = 3;
407 self.advance();
408 }
409 }
410 }
411 } else if c == b'-' {
412 self.advance();
413 charge = -1;
414 if let Some(cc) = self.peek() {
415 if cc.is_ascii_digit() {
416 charge = -((cc - b'0') as i8);
417 self.advance();
418 } else if cc == b'-' {
419 charge = -2;
420 self.advance();
421 if self.peek() == Some(b'-') {
422 charge = -3;
423 self.advance();
424 }
425 }
426 }
427 }
428 }
429
430 if self.peek() == Some(b']') {
431 self.advance();
432 } else {
433 return Err("Expected ']'".to_string());
434 }
435
436 let atom = Atom {
437 element,
438 position: Vector3::zeros(),
439 charge: charge as f32,
440 formal_charge: charge,
441 hybridization: if aromatic {
442 Hybridization::SP2
443 } else {
444 Hybridization::SP3
445 }, chiral_tag: chiral,
447 explicit_h: h_count,
448 };
449
450 let node = self.mol.add_atom(atom);
451
452 Ok(node)
453 }
454
455 fn add_implicit_hydrogens(&mut self) {
456 let n = self.mol.graph.node_count();
457 let mut to_add = Vec::new();
458
459 for i in 0..n {
460 let ni = NodeIndex::new(i);
461 let atom = &self.mol.graph[ni];
462 if atom.element == 1 || atom.element == 0 {
463 continue;
464 } let mut current_valents = 0;
467 let mut double_bonds = 0;
468 let mut triple_bonds = 0;
469 let mut aromatic_bonds = 0;
470
471 for edge in self.mol.graph.edges(ni) {
472 match edge.weight().order {
473 BondOrder::Single => {
474 current_valents += 1;
475 }
476 BondOrder::Double => {
477 current_valents += 2;
478 double_bonds += 1;
479 }
480 BondOrder::Triple => {
481 current_valents += 3;
482 triple_bonds += 1;
483 }
484 BondOrder::Aromatic => {
485 current_valents += 1;
486 aromatic_bonds += 1;
487 }
488 _ => {
489 current_valents += 1;
490 }
491 };
492 }
493
494 if aromatic_bonds == 0 {
496 let actual_hybridization = if triple_bonds > 0 || double_bonds > 1 {
497 Hybridization::SP
498 } else if double_bonds == 1 {
499 Hybridization::SP2
500 } else {
501 let elem = atom.element;
504 let is_conjugated = if (elem == 7 || elem == 8) && double_bonds == 0 {
505 self.mol.graph.neighbors(ni).any(|nb| {
507 self.mol.graph.edges(nb).any(|e| {
508 matches!(e.weight().order, BondOrder::Double | BondOrder::Aromatic)
509 })
510 })
511 } else {
512 false
513 };
514 if is_conjugated {
515 Hybridization::SP2
516 } else {
517 Hybridization::SP3
518 }
519 };
520
521 if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
522 atom_mut.hybridization = actual_hybridization;
523 }
524 } else if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
525 atom_mut.hybridization = Hybridization::SP2;
526 }
527
528 let atom = &self.mol.graph[ni];
529 let target_val = match atom.element {
531 5 => 3, 6 => 4, 7 => {
534 if atom.formal_charge == 1 {
535 4
536 } else {
537 3
538 }
539 } 8 => {
541 if atom.formal_charge == 1 {
542 3
543 } else {
544 2
545 }
546 } 9 | 17 | 35 | 53 => 1, 14 => 4, 15 => 3, 16 => 2, 32 => 4, 33 => 3, 34 => 2, 50 => 4, 51 => 3, 52 => 2, _ => 0,
558 };
559
560 let mut h_needed = target_val - current_valents - atom.formal_charge.abs() as i32;
561
562 if atom.hybridization == Hybridization::SP2 && atom.element == 6 {
563 let aromatic_bonds = self
565 .mol
566 .graph
567 .edges(ni)
568 .filter(|e| e.weight().order == BondOrder::Aromatic)
569 .count();
570 if aromatic_bonds == 2 {
571 h_needed = 1 - (current_valents - 2); } else if aromatic_bonds == 3 {
573 h_needed = 0; }
575 }
576
577 if aromatic_bonds > 0 && atom.element == 7 && atom.explicit_h == 0 {
581 h_needed = 0;
582 }
583
584 let total_h_for_this_atom = atom.explicit_h as i32
587 + if h_needed - (atom.explicit_h as i32) > 0 {
588 h_needed - (atom.explicit_h as i32)
589 } else {
590 0
591 };
592
593 if total_h_for_this_atom > 0 {
594 to_add.push((ni, total_h_for_this_atom as u8));
595 }
596 }
597
598 for (parent, count) in to_add {
600 for _ in 0..count {
601 let h_node = self.mol.add_atom(Atom {
602 element: 1,
603 position: Vector3::zeros(),
604 charge: 0.0,
605 formal_charge: 0,
606 hybridization: Hybridization::Unknown,
607 chiral_tag: ChiralType::Unspecified,
608 explicit_h: 0,
609 });
610 self.mol.add_bond(
611 parent,
612 h_node,
613 Bond {
614 order: BondOrder::Single,
615 stereo: BondStereo::None,
616 },
617 );
618 }
619 }
620 }
621}
622
623#[cfg(test)]
624mod tests {
625 use super::*;
626
627 #[test]
628 fn test_parse_benzene_aliphatic() {
629 let mol = Molecule::from_smiles("C1=CC=CC=C1").unwrap();
630 assert_eq!(mol.graph.node_count(), 12); assert_eq!(mol.graph.edge_count(), 12); }
633
634 #[test]
635 fn test_parse_benzene_aromatic() {
636 let mol = Molecule::from_smiles("c1ccccc1").unwrap();
637 assert_eq!(mol.graph.node_count(), 12); assert_eq!(mol.graph.edge_count(), 12); }
640
641 #[test]
642 fn test_parse_brackets_and_branches() {
643 let mol = Molecule::from_smiles("C(C)(C)C").unwrap(); assert_eq!(mol.graph.node_count(), 14); }
646}