1use crate::graph::{Atom, Bond, BondOrder, BondStereo, ChiralType, Hybridization, Molecule};
2use nalgebra::Vector3;
3use petgraph::graph::NodeIndex;
4use std::collections::HashMap;
5
6pub struct SmilesParser<'a> {
15 input: &'a [u8],
16 pos: usize,
17 rings: HashMap<u32, (NodeIndex, BondOrder, BondStereo)>,
18 mol: &'a mut Molecule,
19}
20
21impl<'a> SmilesParser<'a> {
22 pub fn new(smiles: &'a str, mol: &'a mut Molecule) -> Self {
24 SmilesParser {
25 input: smiles.as_bytes(),
26 pos: 0,
27 rings: HashMap::new(),
28 mol,
29 }
30 }
31
32 pub fn parse(&mut self) -> Result<(), String> {
36 self.parse_chain(None)?;
37 self.add_implicit_hydrogens();
39 Ok(())
40 }
41
42 fn peek(&self) -> Option<u8> {
43 self.input.get(self.pos).copied()
44 }
45
46 fn advance(&mut self) {
47 self.pos += 1;
48 }
49
50 fn parse_chain(&mut self, mut prev_node: Option<NodeIndex>) -> Result<(), String> {
51 let mut prev_bond_order = BondOrder::Single;
52 let mut prev_bond_stereo = BondStereo::None;
53
54 let get_bond_order =
55 |m: &Molecule, prev: NodeIndex, curr: NodeIndex, order: BondOrder| -> BondOrder {
56 if order == BondOrder::Single
57 && m.graph[prev].hybridization == Hybridization::SP2
58 && m.graph[curr].hybridization == Hybridization::SP2
59 {
60 BondOrder::Aromatic
61 } else {
62 order
63 }
64 };
65
66 while self.pos < self.input.len() {
67 let c = self.peek().unwrap();
68 match c {
69 b'(' => {
70 self.advance();
71 self.parse_chain(prev_node)?;
72 if self.peek() == Some(b')') {
73 self.advance();
74 } else {
75 return Err("Expected ')'".to_string());
76 }
77 }
78 b')' => {
79 break;
80 }
81 b'-' => {
82 self.advance();
83 prev_bond_order = BondOrder::Single;
84 }
85 b'=' => {
86 self.advance();
87 prev_bond_order = BondOrder::Double;
88 }
89 b'#' => {
90 self.advance();
91 prev_bond_order = BondOrder::Triple;
92 }
93 b':' => {
94 self.advance();
95 prev_bond_order = BondOrder::Aromatic;
96 }
97 b'.' => {
98 self.advance();
100 prev_node = None;
101 prev_bond_order = BondOrder::Single;
102 prev_bond_stereo = BondStereo::None;
103 }
104 b'/' => {
105 self.advance();
106 prev_bond_order = BondOrder::Single;
107 prev_bond_stereo = BondStereo::E;
108 } b'\\' => {
110 self.advance();
111 prev_bond_order = BondOrder::Single;
112 prev_bond_stereo = BondStereo::Z;
113 }
114 b'%' | b'0'..=b'9' => {
115 let mut ring_num = 0;
116 if c == b'%' {
117 self.advance();
118 if let (Some(d1), Some(d2)) = (self.peek(), self.input.get(self.pos + 1)) {
119 if d1.is_ascii_digit() && d2.is_ascii_digit() {
120 ring_num = ((d1 - b'0') * 10 + (d2 - b'0')) as u32;
121 self.advance();
122 self.advance();
123 }
124 }
125 } else {
126 ring_num = (c - b'0') as u32;
127 self.advance();
128 }
129 if ring_num > 0 {
130 if let Some(prev) = prev_node {
131 if let Some((target, order, stereo)) = self.rings.remove(&ring_num) {
132 let mut final_order = if prev_bond_order != BondOrder::Single {
133 prev_bond_order
134 } else {
135 order
136 };
137 final_order = get_bond_order(self.mol, prev, target, final_order);
138 let final_stereo = if prev_bond_stereo != BondStereo::None {
139 prev_bond_stereo
140 } else {
141 stereo
142 };
143 self.mol.add_bond(
144 prev,
145 target,
146 Bond {
147 order: final_order,
148 stereo: final_stereo,
149 },
150 );
151 } else {
152 self.rings
153 .insert(ring_num, (prev, prev_bond_order, prev_bond_stereo));
154 }
155 prev_bond_order = BondOrder::Single;
157 prev_bond_stereo = BondStereo::None;
158 }
159 }
160 }
161 b'[' => {
162 self.advance();
163 let node = self.parse_bracket_atom()?;
164 if let Some(prev) = prev_node {
165 let order = get_bond_order(self.mol, prev, node, prev_bond_order);
166 self.mol.add_bond(
167 prev,
168 node,
169 Bond {
170 order,
171 stereo: prev_bond_stereo,
172 },
173 );
174 }
175 prev_node = Some(node);
176 prev_bond_order = BondOrder::Single;
177 prev_bond_stereo = BondStereo::None;
178 }
179 _ if c.is_ascii_alphabetic() || c == b'*' => {
180 let node = self.parse_organic_subset()?;
181 if let Some(prev) = prev_node {
182 let order = get_bond_order(self.mol, prev, node, prev_bond_order);
183 self.mol.add_bond(
184 prev,
185 node,
186 Bond {
187 order,
188 stereo: prev_bond_stereo,
189 },
190 );
191 }
192 prev_node = Some(node);
193 prev_bond_order = BondOrder::Single;
194 prev_bond_stereo = BondStereo::None;
195 }
196 _ => return Err(format!("Unexpected character: {}", c as char)),
197 }
198 }
199 Ok(())
200 }
201
202 fn parse_organic_subset(&mut self) -> Result<NodeIndex, String> {
203 let mut elem_str = String::new();
204 let c = self.peek().unwrap();
205 self.advance();
206 elem_str.push(c as char);
207 if let Some(nc) = self.peek() {
208 if nc.is_ascii_lowercase()
209 && (c == b'C' && (nc == b'l' || nc == b'u' || nc == b'o' || nc == b'r')
210 || c == b'B' && nc == b'r')
211 {
212 elem_str.push(nc as char);
213 self.advance();
214 }
215 }
216
217 let mut aromatic = false;
219 if elem_str.chars().next().unwrap().is_lowercase() {
220 aromatic = true;
221 elem_str = elem_str.to_uppercase();
222 }
223
224 let element = match elem_str.as_str() {
225 "C" => 6,
226 "N" => 7,
227 "O" => 8,
228 "P" => 15,
229 "S" => 16,
230 "F" => 9,
231 "CL" | "Cl" => 17,
232 "BR" | "Br" => 35,
233 "I" => 53,
234 "B" => 5,
235 "*" => 0,
236 &_ => return Err(format!("Unknown element: {}", elem_str)),
237 };
238
239 let atom = Atom {
240 element,
241 position: Vector3::zeros(),
242 charge: 0.0,
243 formal_charge: 0,
244 hybridization: if aromatic {
245 Hybridization::SP2
246 } else {
247 Hybridization::SP3
248 },
249 chiral_tag: ChiralType::Unspecified,
250 explicit_h: 0,
251 };
252
253 Ok(self.mol.add_atom(atom))
254 }
255
256 fn parse_bracket_atom(&mut self) -> Result<NodeIndex, String> {
257 let mut isotope = 0u32;
258 let mut aromatic = false;
259 let mut chiral = ChiralType::Unspecified;
260 let mut h_count = 0;
261 let mut charge = 0;
262
263 while let Some(c) = self.peek() {
265 if c.is_ascii_digit() {
266 isotope = isotope * 10 + (c - b'0') as u32;
267 self.advance();
268 } else {
269 break;
270 }
271 }
272
273 let mut elem_str = String::new();
275 if let Some(c) = self.peek() {
276 if c.is_ascii_alphabetic() {
277 elem_str.push(c as char);
278 self.advance();
279 if let Some(nc) = self.peek() {
280 if nc.is_ascii_lowercase() && c.is_ascii_uppercase() {
281 elem_str.push(nc as char);
282 self.advance();
283 }
284 }
285 } else if c == b'*' {
286 elem_str.push('*');
287 self.advance();
288 }
289 }
290
291 if elem_str.chars().next().unwrap().is_lowercase() {
292 aromatic = true;
293 elem_str = elem_str.to_uppercase();
294 }
295
296 let element: u8 = match elem_str.as_str() {
297 "H" => 1,
298 "C" => 6,
299 "N" => 7,
300 "O" => 8,
301 "P" => 15,
302 "S" => 16,
303 "F" => 9,
304 "CL" | "Cl" => 17,
305 "BR" | "Br" => 35,
306 "I" => 53,
307 "FE" | "Fe" => 26,
308 "ZN" | "Zn" => 30,
309 "CO" | "Co" => 27,
310 "CU" | "Cu" => 29,
311 "B" => 5,
312 "AG" | "Ag" => 47,
313 "AU" | "Au" => 79,
314 "PT" | "Pt" => 78,
315 "PD" | "Pd" => 46,
316 "NA" | "Na" => 11,
317 "K" => 19,
318 "LI" | "Li" => 3,
319 "CA" | "Ca" => 20,
320 "MG" | "Mg" => 12,
321 "*" => 0,
322 _ => 6, };
324
325 if let Some(c) = self.peek() {
327 if c == b'@' {
328 self.advance();
329 chiral = ChiralType::TetrahedralCCW;
330 if self.peek() == Some(b'@') {
331 self.advance();
332 chiral = ChiralType::TetrahedralCW;
333 }
334 }
335 }
336
337 if let Some(c) = self.peek() {
339 if c == b'H' {
340 self.advance();
341 h_count = 1;
342 if let Some(hc) = self.peek() {
343 if hc.is_ascii_digit() {
344 h_count = hc - b'0';
345 self.advance();
346 }
347 }
348 }
349 }
350
351 if let Some(c) = self.peek() {
353 if c == b'+' {
354 self.advance();
355 charge = 1;
356 if let Some(cc) = self.peek() {
357 if cc.is_ascii_digit() {
358 charge = (cc - b'0') as i8;
359 self.advance();
360 } else if cc == b'+' {
361 charge = 2;
362 self.advance();
363 if self.peek() == Some(b'+') {
364 charge = 3;
365 self.advance();
366 }
367 }
368 }
369 } else if c == b'-' {
370 self.advance();
371 charge = -1;
372 if let Some(cc) = self.peek() {
373 if cc.is_ascii_digit() {
374 charge = -((cc - b'0') as i8);
375 self.advance();
376 } else if cc == b'-' {
377 charge = -2;
378 self.advance();
379 if self.peek() == Some(b'-') {
380 charge = -3;
381 self.advance();
382 }
383 }
384 }
385 }
386 }
387
388 if self.peek() == Some(b']') {
389 self.advance();
390 } else {
391 return Err("Expected ']'".to_string());
392 }
393
394 let atom = Atom {
395 element,
396 position: Vector3::zeros(),
397 charge: charge as f32,
398 formal_charge: charge,
399 hybridization: if aromatic {
400 Hybridization::SP2
401 } else {
402 Hybridization::SP3
403 }, chiral_tag: chiral,
405 explicit_h: h_count,
406 };
407
408 let node = self.mol.add_atom(atom);
409
410 Ok(node)
411 }
412
413 fn add_implicit_hydrogens(&mut self) {
414 let n = self.mol.graph.node_count();
415 let mut to_add = Vec::new();
416
417 for i in 0..n {
418 let ni = NodeIndex::new(i);
419 let atom = &self.mol.graph[ni];
420 if atom.element == 1 || atom.element == 0 {
421 continue;
422 } let mut current_valents = 0;
425 let mut double_bonds = 0;
426 let mut triple_bonds = 0;
427 let mut aromatic_bonds = 0;
428
429 for edge in self.mol.graph.edges(ni) {
430 match edge.weight().order {
431 BondOrder::Single => {
432 current_valents += 1;
433 }
434 BondOrder::Double => {
435 current_valents += 2;
436 double_bonds += 1;
437 }
438 BondOrder::Triple => {
439 current_valents += 3;
440 triple_bonds += 1;
441 }
442 BondOrder::Aromatic => {
443 current_valents += 1;
444 aromatic_bonds += 1;
445 }
446 _ => {
447 current_valents += 1;
448 }
449 };
450 }
451
452 if aromatic_bonds == 0 {
454 let actual_hybridization = if triple_bonds > 0 || double_bonds > 1 {
455 Hybridization::SP
456 } else if double_bonds == 1 {
457 Hybridization::SP2
458 } else {
459 let elem = atom.element;
462 let is_conjugated = if (elem == 7 || elem == 8) && double_bonds == 0 {
463 self.mol.graph.neighbors(ni).any(|nb| {
465 self.mol.graph.edges(nb).any(|e| {
466 matches!(e.weight().order, BondOrder::Double | BondOrder::Aromatic)
467 })
468 })
469 } else {
470 false
471 };
472 if is_conjugated {
473 Hybridization::SP2
474 } else {
475 Hybridization::SP3
476 }
477 };
478
479 if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
480 atom_mut.hybridization = actual_hybridization;
481 }
482 } else {
483 if let Some(atom_mut) = self.mol.graph.node_weight_mut(ni) {
484 atom_mut.hybridization = Hybridization::SP2;
485 }
486 }
487
488 let atom = &self.mol.graph[ni];
489 let target_val = match atom.element {
491 6 => 4, 7 => {
493 if atom.formal_charge == 1 {
494 4
495 } else {
496 3
497 }
498 } 8 => {
500 if atom.formal_charge == 1 {
501 3
502 } else {
503 2
504 }
505 } 9 | 17 | 35 | 53 => 1, 16 => 2, 15 => 3, _ => 0,
510 };
511
512 let mut h_needed = target_val - current_valents - atom.formal_charge.abs() as i32;
513
514 if atom.hybridization == Hybridization::SP2 && atom.element == 6 {
515 let aromatic_bonds = self
517 .mol
518 .graph
519 .edges(ni)
520 .filter(|e| e.weight().order == BondOrder::Aromatic)
521 .count();
522 if aromatic_bonds == 2 {
523 h_needed = 1 - (current_valents - 2); } else if aromatic_bonds == 3 {
525 h_needed = 0; }
527 }
528
529 if aromatic_bonds > 0 && atom.element == 7 && atom.explicit_h == 0 {
533 h_needed = 0;
534 }
535
536 let total_h_for_this_atom = atom.explicit_h as i32
539 + if h_needed - (atom.explicit_h as i32) > 0 {
540 h_needed - (atom.explicit_h as i32)
541 } else {
542 0
543 };
544
545 if total_h_for_this_atom > 0 {
546 to_add.push((ni, total_h_for_this_atom as u8));
547 }
548 }
549
550 for (parent, count) in to_add {
552 for _ in 0..count {
553 let h_node = self.mol.add_atom(Atom {
554 element: 1,
555 position: Vector3::zeros(),
556 charge: 0.0,
557 formal_charge: 0,
558 hybridization: Hybridization::Unknown,
559 chiral_tag: ChiralType::Unspecified,
560 explicit_h: 0,
561 });
562 self.mol.add_bond(
563 parent,
564 h_node,
565 Bond {
566 order: BondOrder::Single,
567 stereo: BondStereo::None,
568 },
569 );
570 }
571 }
572 }
573}
574
575#[cfg(test)]
576mod tests {
577 use super::*;
578
579 #[test]
580 fn test_parse_benzene_aliphatic() {
581 let mol = Molecule::from_smiles("C1=CC=CC=C1").unwrap();
582 assert_eq!(mol.graph.node_count(), 12); assert_eq!(mol.graph.edge_count(), 12); }
585
586 #[test]
587 fn test_parse_benzene_aromatic() {
588 let mol = Molecule::from_smiles("c1ccccc1").unwrap();
589 assert_eq!(mol.graph.node_count(), 12); assert_eq!(mol.graph.edge_count(), 12); }
592
593 #[test]
594 fn test_parse_brackets_and_branches() {
595 let mol = Molecule::from_smiles("C(C)(C)C").unwrap(); assert_eq!(mol.graph.node_count(), 14); }
598}