// chem_name_resolver/parser/mod.rs

1pub mod formula;
2pub mod scanner;
3pub mod smiles;
4
5mod alkane;
6mod locant;
7mod substituent;
8mod suffix;
9
10use crate::error::ResolveError;
11use suffix::{Suffix, SuffixGroup};
12use substituent::parse_substituents;
13
14// ── Molecular graph types ─────────────────────────────────────────────────────
15
/// Chemical elements that can appear as atoms in a `MolGraph`.
#[derive(Debug, Clone, PartialEq)]
pub enum Element {
    C,
    H,
    O,
    N,
    S,
    P,
    F,
    Cl,
    Br,
    I,
}

impl Element {
    /// Returns the element's atomic symbol as written in a formula or SMILES string.
    pub fn symbol(&self) -> &'static str {
        match self {
            Element::C => "C",
            Element::H => "H",
            Element::O => "O",
            Element::N => "N",
            Element::S => "S",
            Element::P => "P",
            Element::F => "F",
            Element::Cl => "Cl",
            Element::Br => "Br",
            Element::I => "I",
        }
    }

    /// Returns the default valence (number of single-bond equivalents) of a
    /// neutral atom of this element.
    ///
    /// `MolGraph::fill_implicit_h` subtracts the bond orders already used from
    /// this value to obtain the implicit hydrogen count.
    pub fn valence(&self) -> u8 {
        match self {
            Element::C => 4,
            // Phosphorus is trivalent like nitrogen (PH3); treating it as
            // monovalent would drop two hydrogens from every P atom.
            Element::N | Element::P => 3,
            Element::O | Element::S => 2,
            Element::F | Element::Cl | Element::Br | Element::I | Element::H => 1,
        }
    }
}
55
/// Multiplicity of a covalent bond between two atoms.
#[derive(Debug, Clone, PartialEq)]
pub enum BondOrder {
    Single,
    Double,
    Triple,
}

impl BondOrder {
    /// Number of valence slots this bond occupies on each of its two atoms.
    pub fn degree(&self) -> u8 {
        match *self {
            Self::Single => 1,
            Self::Double => 2,
            Self::Triple => 3,
        }
    }
}
72
/// One atom (node) of a `MolGraph`.
#[derive(Debug, Clone)]
pub struct Atom {
    /// Chemical element of this atom.
    pub element: Element,
    /// Formal charge; `MolGraph::add_atom` initialises it to 0.
    pub charge: i8,
    /// Hydrogens not stored as explicit atoms; `MolGraph::add_atom` starts it at 0
    /// and `MolGraph::fill_implicit_h` overwrites it.
    pub implicit_h: u8,
}
79
/// One entry of an atom's adjacency list in a `MolGraph`.
#[derive(Debug, Clone)]
pub struct Bond {
    /// Index (into `MolGraph::atoms`) of the atom at the other end of the bond.
    pub to: usize,
    /// Multiplicity of the bond.
    pub order: BondOrder,
}
85
86/// Molecular graph: atoms with adjacency lists.
87#[derive(Debug, Clone, Default)]
88pub struct MolGraph {
89    pub atoms: Vec<Atom>,
90    pub bonds: Vec<Vec<Bond>>,
91}
92
93impl MolGraph {
94    fn add_atom(&mut self, element: Element) -> usize {
95        let idx = self.atoms.len();
96        self.atoms.push(Atom { element, charge: 0, implicit_h: 0 });
97        self.bonds.push(Vec::new());
98        idx
99    }
100
101    fn add_bond(&mut self, a: usize, b: usize, order: BondOrder) {
102        self.bonds[a].push(Bond { to: b, order: order.clone() });
103        self.bonds[b].push(Bond { to: a, order });
104    }
105
106    fn fill_implicit_h(&mut self) {
107        for i in 0..self.atoms.len() {
108            let used: u8 = self.bonds[i].iter().map(|b| b.order.degree()).sum();
109            let valence = self.atoms[i].element.valence();
110            self.atoms[i].implicit_h = valence.saturating_sub(used);
111        }
112    }
113}
114
115// ── Entry point ───────────────────────────────────────────────────────────────
116
117/// Parse an IUPAC systematic name (already normalized, lowercase) into a MolGraph.
118/// MVP scope: straight-chain and monocyclic C1-C20, basic suffixes, simple substituents.
119pub fn parse_iupac(name: &str) -> Result<MolGraph, ResolveError> {
120    // Strip the "n-" prefix (e.g. "n-butane" = "butane").
121    let name = name.trim().strip_prefix("n-").unwrap_or(name.trim());
122
123    // Detect cyclo- prefix (e.g. "cyclohexane" → ring of 6 carbons).
124    let (is_cyclic, name) = if let Some(rest) = name.strip_prefix("cyclo") {
125        (true, rest)
126    } else {
127        (false, name)
128    };
129
130    // 1. Parse substituent prefix list ("2-chloro-", "3-methyl-", …)
131    let (substituents, rest) = parse_substituents(name);
132
133    // 2. Extract prefix locants that belong to the suffix (e.g. "2,4-" in "2,4-pentanedione").
134    //    Distinguished from substituent locants because no substituent name follows them.
135    let (prefix_suffix_locants, rest) = extract_prefix_suffix_locants(rest);
136
137    // 3. Parse chain stem
138    let (chain_len, rest) = alkane::parse_stem(rest).ok_or_else(|| ResolveError::ParseError {
139        pos: 0,
140        msg: format!("unrecognized chain stem in: {name:?}"),
141    })?;
142
143    // 4. Parse one or more suffix groups (handles "an" connector and compound suffixes)
144    let (suffix_groups, remaining) =
145        parse_suffix_groups(rest).map_err(|_| ResolveError::ParseError {
146            pos: name.len() - rest.len(),
147            msg: format!("unrecognized suffix in: {name:?}"),
148        })?;
149
150    if !remaining.is_empty() {
151        return Err(ResolveError::ParseError {
152            pos: name.len() - remaining.len(),
153            msg: format!("unexpected trailing text: {remaining:?}"),
154        });
155    }
156
157    build_graph(chain_len, &suffix_groups, &prefix_suffix_locants, &substituents, name, is_cyclic)
158}
159
160/// Extract prefix locants that precede the stem (e.g. "2,4-" in "2,4-pentanedione").
161fn extract_prefix_suffix_locants<'a>(input: &'a str) -> (Vec<u8>, &'a str) {
162    if let Some((locs, rest)) = locant::parse_locant_list(input) {
163        if alkane::parse_stem(rest).is_some() {
164            return (locs, rest);
165        }
166    }
167    (vec![], input)
168}
169
170/// Parse one or more suffix groups from `input`, handling the "an" connector.
171fn parse_suffix_groups(input: &str) -> Result<(Vec<SuffixGroup>, &str), ()> {
172    let mut groups = Vec::new();
173    let mut rest = input;
174
175    loop {
176        // Try "an" connector FIRST to avoid "anedione" being parsed as "ane" + "dione".
177        // "ane" as a suffix falls through to the direct parse below because stripping "an"
178        // from "ane" leaves "e" which strip_elision_e removes to "", and parse_suffix("")
179        // returns None, so we fall through.
180        if let Some(r) = rest.strip_prefix("an") {
181            let r = strip_elision_e(r);
182            if let Some((sg, r2)) = suffix::parse_suffix(r) {
183                groups.push(sg);
184                rest = r2;
185                if rest.is_empty() {
186                    break;
187                }
188                continue;
189            }
190        }
191
192        // Direct parse (for -ane, -ene, -yne without "an" connector, and infix locant forms)
193        if let Some((sg, r)) = suffix::parse_suffix(rest) {
194            groups.push(sg);
195            rest = r;
196            if rest.is_empty() {
197                break;
198            }
199            continue;
200        }
201
202        break;
203    }
204
205    if groups.is_empty() {
206        Err(())
207    } else {
208        Ok((groups, rest))
209    }
210}
211
/// Drop a leading elision 'e' (the one left over from "-ane") from `input`.
///
/// The 'e' is kept when the text starts with "ene" or "en-", because there it
/// belongs to the -ene suffix itself. E.g. "edione" → "dione", "ene" → "ene".
fn strip_elision_e(input: &str) -> &str {
    let belongs_to_ene = input.starts_with("ene") || input.starts_with("en-");
    match input.strip_prefix('e') {
        Some(tail) if !belongs_to_ene => tail,
        _ => input,
    }
}
224
225// ── Graph construction ────────────────────────────────────────────────────────
226
227fn build_graph(
228    chain_len: u8,
229    suffix_groups: &[SuffixGroup],
230    prefix_suffix_locants: &[u8],
231    substituents: &[substituent::Substituent],
232    name: &str,
233    is_cyclic: bool,
234) -> Result<MolGraph, ResolveError> {
235    let mut g = MolGraph::default();
236
237    // Build carbon chain (C-1 = index 0)
238    let carbon_indices: Vec<usize> =
239        (0..chain_len as usize).map(|_| g.add_atom(Element::C)).collect();
240
241    for i in 0..carbon_indices.len().saturating_sub(1) {
242        g.add_bond(carbon_indices[i], carbon_indices[i + 1], BondOrder::Single);
243    }
244
245    // For cyclo- compounds, close the ring by bonding C-1 to C-n
246    if is_cyclic {
247        let n = carbon_indices.len();
248        if n >= 3 {
249            g.add_bond(carbon_indices[0], carbon_indices[n - 1], BondOrder::Single);
250        }
251    }
252
253    // Apply suffix groups
254    for (sg_idx, sg) in suffix_groups.iter().enumerate() {
255        // Use prefix locants only for the first suffix group when it has none of its own
256        let effective_locants: Vec<u8> = if sg.locants.is_empty() && sg_idx == 0 && !prefix_suffix_locants.is_empty() {
257            prefix_suffix_locants.to_vec()
258        } else {
259            sg.locants.clone()
260        };
261        apply_suffix(&mut g, &carbon_indices, sg, &effective_locants, name)?;
262    }
263
264    // Apply substituents
265    for sub in substituents {
266        apply_substituent(&mut g, &carbon_indices, sub, name)?;
267    }
268
269    g.fill_implicit_h();
270    Ok(g)
271}
272
273fn apply_suffix(
274    g: &mut MolGraph,
275    carbons: &[usize],
276    sg: &SuffixGroup,
277    effective_locants: &[u8],
278    name: &str,
279) -> Result<(), ResolveError> {
280    let count = sg.multiplier.as_ref().map(|m| m.count()).unwrap_or(1) as usize;
281
282    // Resolve locants to 0-based indices. Default rules when unspecified:
283    let locants_0: Vec<usize> = if effective_locants.is_empty() {
284        match sg.suffix {
285            Suffix::Ane => vec![],
286            Suffix::Ene | Suffix::Yne => (0..count).map(|i| i * 2).collect(),
287            Suffix::Ol => (0..count).map(|i| i).collect(),
288            Suffix::One => {
289                if carbons.len() >= 3 {
290                    (0..count).map(|i| i + 1).collect()
291                } else {
292                    vec![0]
293                }
294            }
295            Suffix::Al | Suffix::OicAcid => {
296                if count == 1 {
297                    vec![0]
298                } else {
299                    // dioic acid: both terminal carbons
300                    vec![0, carbons.len() - 1]
301                }
302            }
303            // These suffixes handle locants internally; default is unused.
304            Suffix::Amine | Suffix::Thiol | Suffix::Nitrile | Suffix::Amide => vec![],
305        }
306    } else {
307        effective_locants.iter().map(|&l| l as usize - 1).collect()
308    };
309
310    match sg.suffix {
311        Suffix::Ane => {}
312        Suffix::Ene => {
313            for &ci in &locants_0 {
314                validate_bond_locant(ci, carbons.len(), "ene", name)?;
315                upgrade_bond(g, carbons[ci], carbons[ci + 1], BondOrder::Double);
316            }
317        }
318        Suffix::Yne => {
319            for &ci in &locants_0 {
320                validate_bond_locant(ci, carbons.len(), "yne", name)?;
321                upgrade_bond(g, carbons[ci], carbons[ci + 1], BondOrder::Triple);
322            }
323        }
324        Suffix::Ol => {
325            let indices: Vec<usize> = if locants_0.is_empty() {
326                vec![0]
327            } else {
328                locants_0
329            };
330            for ci in indices {
331                validate_atom_locant(ci, carbons.len(), "ol", name)?;
332                let oidx = g.add_atom(Element::O);
333                g.add_bond(carbons[ci], oidx, BondOrder::Single);
334            }
335        }
336        Suffix::One => {
337            for &ci in &locants_0 {
338                validate_atom_locant(ci, carbons.len(), "one", name)?;
339                let oidx = g.add_atom(Element::O);
340                g.add_bond(carbons[ci], oidx, BondOrder::Double);
341            }
342        }
343        Suffix::Al => {
344            let oidx = g.add_atom(Element::O);
345            g.add_bond(carbons[0], oidx, BondOrder::Double);
346        }
347        Suffix::OicAcid => {
348            let positions: Vec<usize> = if locants_0.is_empty() {
349                if count == 1 {
350                    vec![0]
351                } else {
352                    vec![0, carbons.len() - 1]
353                }
354            } else {
355                locants_0
356            };
357            for ci in positions {
358                validate_atom_locant(ci, carbons.len(), "oic acid", name)?;
359                let oidx = g.add_atom(Element::O);
360                let ohidx = g.add_atom(Element::O);
361                g.add_bond(carbons[ci], oidx, BondOrder::Double);
362                g.add_bond(carbons[ci], ohidx, BondOrder::Single);
363            }
364        }
365        Suffix::Amine => {
366            let indices: Vec<usize> = if locants_0.is_empty() {
367                vec![carbons.len() - 1]
368            } else {
369                locants_0
370            };
371            for ci in indices {
372                validate_atom_locant(ci, carbons.len(), "amine", name)?;
373                let nidx = g.add_atom(Element::N);
374                g.add_bond(carbons[ci], nidx, BondOrder::Single);
375            }
376        }
377        Suffix::Thiol => {
378            let indices: Vec<usize> = if locants_0.is_empty() {
379                vec![carbons.len() - 1]
380            } else {
381                locants_0
382            };
383            for ci in indices {
384                validate_atom_locant(ci, carbons.len(), "thiol", name)?;
385                let sidx = g.add_atom(Element::S);
386                g.add_bond(carbons[ci], sidx, BondOrder::Single);
387            }
388        }
389        Suffix::Nitrile => {
390            // -nitrile: C-1 gains a triple bond to N (the carbon IS part of the chain).
391            let cidx = carbons[0];
392            let nidx = g.add_atom(Element::N);
393            g.add_bond(cidx, nidx, BondOrder::Triple);
394        }
395        Suffix::Amide => {
396            // -amide: C-1 gets -NH2 (single) and =O (double).
397            // N added before O so DFS places =O as branch and N as continuation → "C(=O)N".
398            let ci = if locants_0.is_empty() { 0 } else { locants_0[0] };
399            validate_atom_locant(ci, carbons.len(), "amide", name)?;
400            let nidx = g.add_atom(Element::N);
401            let oidx = g.add_atom(Element::O);
402            g.add_bond(carbons[ci], nidx, BondOrder::Single);
403            g.add_bond(carbons[ci], oidx, BondOrder::Double);
404        }
405    }
406    Ok(())
407}
408
409fn apply_substituent(
410    g: &mut MolGraph,
411    carbons: &[usize],
412    sub: &substituent::Substituent,
413    name: &str,
414) -> Result<(), ResolveError> {
415    use substituent::SubstituentKind;
416    for &loc in &sub.locants {
417        let ci = loc as usize - 1;
418        validate_atom_locant(ci, carbons.len(), "substituent", name)?;
419        let cidx = carbons[ci];
420        match &sub.kind {
421            SubstituentKind::Oxo => {
422                let oidx = g.add_atom(Element::O);
423                g.add_bond(cidx, oidx, BondOrder::Double);
424            }
425            SubstituentKind::Hydroxy => {
426                let oidx = g.add_atom(Element::O);
427                g.add_bond(cidx, oidx, BondOrder::Single);
428            }
429            SubstituentKind::Chloro => {
430                let x = g.add_atom(Element::Cl);
431                g.add_bond(cidx, x, BondOrder::Single);
432            }
433            SubstituentKind::Bromo => {
434                let x = g.add_atom(Element::Br);
435                g.add_bond(cidx, x, BondOrder::Single);
436            }
437            SubstituentKind::Fluoro => {
438                let x = g.add_atom(Element::F);
439                g.add_bond(cidx, x, BondOrder::Single);
440            }
441            SubstituentKind::Iodo => {
442                let x = g.add_atom(Element::I);
443                g.add_bond(cidx, x, BondOrder::Single);
444            }
445            SubstituentKind::Methyl => {
446                let m = g.add_atom(Element::C);
447                g.add_bond(cidx, m, BondOrder::Single);
448            }
449            SubstituentKind::Ethyl => {
450                let m1 = g.add_atom(Element::C);
451                let m2 = g.add_atom(Element::C);
452                g.add_bond(cidx, m1, BondOrder::Single);
453                g.add_bond(m1, m2, BondOrder::Single);
454            }
455            SubstituentKind::Propyl
456            | SubstituentKind::Butyl
457            | SubstituentKind::Pentyl
458            | SubstituentKind::Hexyl => {
459                let chain_len = match &sub.kind {
460                    SubstituentKind::Propyl => 3,
461                    SubstituentKind::Butyl => 4,
462                    SubstituentKind::Pentyl => 5,
463                    SubstituentKind::Hexyl => 6,
464                    _ => unreachable!(),
465                };
466                let mut prev = cidx;
467                for _ in 0..chain_len {
468                    let m = g.add_atom(Element::C);
469                    g.add_bond(prev, m, BondOrder::Single);
470                    prev = m;
471                }
472            }
473            // -CH(CH3)2 : branch carbon + two methyls
474            SubstituentKind::Isopropyl => {
475                let branch = g.add_atom(Element::C);
476                let me1 = g.add_atom(Element::C);
477                let me2 = g.add_atom(Element::C);
478                g.add_bond(cidx, branch, BondOrder::Single);
479                g.add_bond(branch, me1, BondOrder::Single);
480                g.add_bond(branch, me2, BondOrder::Single);
481            }
482            // -C(CH3)3 : quaternary carbon + three methyls
483            SubstituentKind::TertButyl => {
484                let branch = g.add_atom(Element::C);
485                let me1 = g.add_atom(Element::C);
486                let me2 = g.add_atom(Element::C);
487                let me3 = g.add_atom(Element::C);
488                g.add_bond(cidx, branch, BondOrder::Single);
489                g.add_bond(branch, me1, BondOrder::Single);
490                g.add_bond(branch, me2, BondOrder::Single);
491                g.add_bond(branch, me3, BondOrder::Single);
492            }
493            // -CH(CH3)CH2CH3 : branch carbon + methyl + ethyl
494            SubstituentKind::SecButyl => {
495                let branch = g.add_atom(Element::C);
496                let me = g.add_atom(Element::C);
497                let et1 = g.add_atom(Element::C);
498                let et2 = g.add_atom(Element::C);
499                g.add_bond(cidx, branch, BondOrder::Single);
500                g.add_bond(branch, me, BondOrder::Single);
501                g.add_bond(branch, et1, BondOrder::Single);
502                g.add_bond(et1, et2, BondOrder::Single);
503            }
504            // -CH2CH(CH3)2 : methylene + isopropyl
505            SubstituentKind::IsoButyl => {
506                let ch2 = g.add_atom(Element::C);
507                let branch = g.add_atom(Element::C);
508                let me1 = g.add_atom(Element::C);
509                let me2 = g.add_atom(Element::C);
510                g.add_bond(cidx, ch2, BondOrder::Single);
511                g.add_bond(ch2, branch, BondOrder::Single);
512                g.add_bond(branch, me1, BondOrder::Single);
513                g.add_bond(branch, me2, BondOrder::Single);
514            }
515            SubstituentKind::Amino => {
516                let nidx = g.add_atom(Element::N);
517                g.add_bond(cidx, nidx, BondOrder::Single);
518            }
519            SubstituentKind::Mercapto => {
520                let sidx = g.add_atom(Element::S);
521                g.add_bond(cidx, sidx, BondOrder::Single);
522            }
523            SubstituentKind::Cyano => {
524                // cyano- = -C≡N branch attached to chain carbon
525                let cbranch = g.add_atom(Element::C);
526                let nidx = g.add_atom(Element::N);
527                g.add_bond(cidx, cbranch, BondOrder::Single);
528                g.add_bond(cbranch, nidx, BondOrder::Triple);
529            }
530            SubstituentKind::Acetyl => {
531                // acetyl- = -C(=O)CH3: add methyl C first so DFS writes "C(=O)C"
532                let carbonyl = g.add_atom(Element::C);
533                let methyl = g.add_atom(Element::C);
534                let o = g.add_atom(Element::O);
535                g.add_bond(cidx, carbonyl, BondOrder::Single);
536                g.add_bond(carbonyl, methyl, BondOrder::Single);
537                g.add_bond(carbonyl, o, BondOrder::Double);
538            }
539            SubstituentKind::Formyl => {
540                // formyl- = -CHO: one carbon double-bonded to O
541                let carbonyl = g.add_atom(Element::C);
542                let o = g.add_atom(Element::O);
543                g.add_bond(cidx, carbonyl, BondOrder::Single);
544                g.add_bond(carbonyl, o, BondOrder::Double);
545            }
546        }
547    }
548    Ok(())
549}
550
551fn validate_bond_locant(ci: usize, len: usize, tag: &str, name: &str) -> Result<(), ResolveError> {
552    if ci + 1 >= len {
553        Err(ResolveError::ParseError {
554            pos: 0,
555            msg: format!("{tag} locant {ci} out of range for {len}-carbon chain in {name:?}"),
556        })
557    } else {
558        Ok(())
559    }
560}
561
562fn validate_atom_locant(ci: usize, len: usize, tag: &str, name: &str) -> Result<(), ResolveError> {
563    if ci >= len {
564        Err(ResolveError::ParseError {
565            pos: 0,
566            msg: format!("{tag} locant {ci} out of range for {len}-carbon chain in {name:?}"),
567        })
568    } else {
569        Ok(())
570    }
571}
572
573fn upgrade_bond(g: &mut MolGraph, a: usize, b: usize, new_order: BondOrder) {
574    for bond in &mut g.bonds[a] {
575        if bond.to == b {
576            bond.order = new_order.clone();
577        }
578    }
579    for bond in &mut g.bonds[b] {
580        if bond.to == a {
581            bond.order = new_order.clone();
582        }
583    }
584}
585
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::smiles::to_smiles;

    /// Resolve `name` and render the graph as SMILES; a parse failure panics
    /// with the name and the error.
    fn smiles(name: &str) -> String {
        match parse_iupac(name) {
            Ok(graph) => to_smiles(&graph),
            Err(e) => panic!("{name}: {e}"),
        }
    }

    #[test]
    fn methane() {
        let rendered = smiles("methane");
        assert_eq!(rendered, "C");
    }

    #[test]
    fn ethane() {
        let rendered = smiles("ethane");
        assert_eq!(rendered, "CC");
    }

    #[test]
    fn propane() {
        let rendered = smiles("propane");
        assert_eq!(rendered, "CCC");
    }

    #[test]
    fn ethanol() {
        // The hydroxyl oxygen is expected at the end of the string.
        let rendered = smiles("ethanol");
        assert_eq!(rendered, "CCO");
    }

    #[test]
    fn propan_2_one() {
        // The carbonyl oxygen is expected as a branch on the middle carbon.
        let rendered = smiles("propan-2-one");
        assert_eq!(rendered, "CC(=O)C");
    }

    #[test]
    fn but_2_yne() {
        let rendered = smiles("but-2-yne");
        assert_eq!(rendered, "CC#CC");
    }

    #[test]
    fn two_four_pentanedione() {
        // Locants written in front of the stem apply to the -dione suffix.
        let rendered = smiles("2,4-pentanedione");
        assert_eq!(rendered, "CC(=O)CC(=O)C");
    }
}