Skip to main content

hs_predict/smiles/
detector.rs

1//! Organic / inorganic detection and functional group detection from SMILES.
2//!
3//! Detection is based on substring pattern matching against canonical SMILES
4//! (as returned by PubChem). It is intentionally approximate — results carry
5//! a confidence of ≤ 0.70 and are used only as heading-level hints.
6//!
7//! # Priority order
8//! Groups are checked in decreasing specificity so that more specific patterns
9//! take precedence (e.g. anhydride before ester before carboxylic acid).
10
11use crate::types::OrganicInorganic;
12use serde::{Deserialize, Serialize};
13
14// ─────────────────────────────────────────────────────────────────────────────
15// FunctionalGroup enum
16// ─────────────────────────────────────────────────────────────────────────────
17
/// Functional group category detectable from a SMILES string.
///
/// The 20 groups cover the main HS Chapter 29 classification criteria
/// for organic chemicals plus the organic/inorganic distinction used
/// for Chapter 28.
///
/// Variants are declared in the same order in which
/// `detect_functional_groups` checks them (most specific first).
/// With serde the variants are (de)serialised as `snake_case` strings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FunctionalGroup {
    /// –C(=O)–O–C(=O)– (acid anhydride).
    Anhydride,
    /// –N=C=O (isocyanate).
    ///
    /// NOTE(review): this variant was also documented as covering
    /// isothiocyanate (N=C=S), but the detector in this file only matches
    /// the `N=C=O` / `O=C=N` spellings — confirm which is intended.
    Isocyanate,
    /// –C≡N (nitrile / cyanide).
    Nitrile,
    /// –[N+](=O)[O–] nitro group.
    Nitro,
    /// Three-membered ring containing O (epoxide).
    Epoxide,
    /// –S(=O)(=O)–OH sulphonic acid.
    SulphonicAcid,
    /// P=O or P–O (phosphate / phosphonate ester).
    Phosphate,
    /// –C(=O)–NH₂ / –NHC(=O)– amide.
    Amide,
    /// –C(=O)–O–C ester (not anhydride).
    Ester,
    /// –C(=O)–OH carboxylic acid.
    CarboxylicAcid,
    /// –CHO terminal aldehyde.
    Aldehyde,
    /// –C(=O)– flanked by two C atoms (ketone).
    Ketone,
    /// Phenolic –OH on aromatic ring.
    Phenol,
    /// –SH thiol (mercaptan).
    Thiol,
    /// C–S–C thioether / sulphide.
    Sulphide,
    /// Aliphatic –C–OH alcohol.
    Alcohol,
    /// C–O–C ether (not ester, not epoxide).
    Ether,
    /// Primary, secondary, or tertiary amine –NHₓ (not amide).
    Amine,
    /// C–F / C–Cl / C–Br / C–I organic halide.
    Halide,
    /// Aromatic ring (any aromatic atom present).
    AromaticRing,
}
67
68impl FunctionalGroup {
69    /// Short display label for notes and logging.
70    pub fn label(self) -> &'static str {
71        match self {
72            Self::Anhydride => "Anhydride",
73            Self::Isocyanate => "Isocyanate",
74            Self::Nitrile => "Nitrile",
75            Self::Nitro => "Nitro",
76            Self::Epoxide => "Epoxide",
77            Self::SulphonicAcid => "SulphonicAcid",
78            Self::Phosphate => "Phosphate",
79            Self::Amide => "Amide",
80            Self::Ester => "Ester",
81            Self::CarboxylicAcid => "CarboxylicAcid",
82            Self::Aldehyde => "Aldehyde",
83            Self::Ketone => "Ketone",
84            Self::Phenol => "Phenol",
85            Self::Thiol => "Thiol",
86            Self::Sulphide => "Sulphide",
87            Self::Alcohol => "Alcohol",
88            Self::Ether => "Ether",
89            Self::Amine => "Amine",
90            Self::Halide => "Halide",
91            Self::AromaticRing => "AromaticRing",
92        }
93    }
94}
95
96// ─────────────────────────────────────────────────────────────────────────────
97// Organic / inorganic classification
98// ─────────────────────────────────────────────────────────────────────────────
99
100/// Determine whether a SMILES string represents an organic, inorganic,
101/// or organometallic compound.
102///
103/// Uses the chemical definition: *organic* = contains at least one carbon atom
104/// that is not in a purely inorganic context (CO₂, CO, CS₂, carbonate, cyanide
105/// as free ion).
106pub fn classify_organic(smiles: &str) -> OrganicInorganic {
107    // No carbon → definitely inorganic
108    if !smiles.chars().any(|c| c == 'C' || c == 'c') {
109        return OrganicInorganic::Inorganic;
110    }
111
112    // Exact-match known simple inorganic carbon compounds
113    let normalised = smiles.replace(' ', "");
114    let inorganic_exact: &[&str] = &[
115        "O=C=O",       // CO₂
116        "[O-]C(=O)[O-]", // carbonate ion
117        "[O-]C([O-])=O",
118        "[C-]#[O+]",   // CO
119        "[C+]#[O-]",
120        "S=C=S",       // CS₂
121        "[C-]#N",      // cyanide ion
122        "[N+]#[C-]",
123        "C(=O)([O-])[O-]", // carbonate
124    ];
125    if inorganic_exact.iter().any(|p| normalised == *p) {
126        return OrganicInorganic::Inorganic;
127    }
128
129    // Check multi-component SMILES (dot-separated): each fragment independently
130    // A compound is organometallic if any fragment has a direct metal–C bond.
131    let metal_symbols: &[&str] = &[
132        "[Fe]", "[Co]", "[Ni]", "[Cr]", "[Mn]", "[Mo]", "[W]",
133        "[Ti]", "[V]",  "[Ru]", "[Rh]", "[Pd]", "[Os]", "[Ir]",
134        "[Pt]", "[Zn]", "[Al]", "[Pb]", "[Sn]", "[Hg]", "[Tl]",
135    ];
136    // Organometallic: metal atom directly bonded to carbon in SMILES notation
137    // i.e. the metal symbol is followed or preceded by C/c (with no space or [)
138    for metal in metal_symbols {
139        if smiles.contains(metal) {
140            // Check if this metal is bonded to C in the SMILES graph.
141            // Heuristic: metal symbol immediately adjacent to C or c in the string.
142            let idx = smiles.find(metal).unwrap_or(usize::MAX);
143            let after = smiles.get(idx + metal.len()..).unwrap_or("");
144            let before = smiles.get(..idx).unwrap_or("");
145            let bonded = after.starts_with('C')
146                || after.starts_with('c')
147                || before.ends_with('C')
148                || before.ends_with('c');
149            if bonded {
150                return OrganicInorganic::Organometallic;
151            }
152        }
153    }
154
155    OrganicInorganic::Organic
156}
157
158// ─────────────────────────────────────────────────────────────────────────────
159// Functional group detection
160// ─────────────────────────────────────────────────────────────────────────────
161
162/// Detect functional groups present in a SMILES string.
163///
164/// The detection uses substring pattern matching against both the
165/// canonical and common alternative SMILES representations.
166/// Groups are returned in detection priority order (most specific first).
167///
168/// # Limitations
169/// - Does not perform full SMILES parsing; edge cases may be missed.
170/// - Designed primarily for PubChem canonical SMILES.
171/// - Confidences are capped at ≤ 0.70 due to these limitations.
172pub fn detect_functional_groups(smiles: &str) -> Vec<FunctionalGroup> {
173    let mut groups: Vec<FunctionalGroup> = Vec::new();
174
175    // Helper: returns true if any of `patterns` is a substring of `smiles`.
176    let any = |patterns: &[&str]| -> bool { patterns.iter().any(|p| smiles.contains(p)) };
177
178    // ── 1. Anhydride (check before ester and acid) ────────────────────────
179    // Linear anhydride: C(=O)OC(=O) (e.g. acetic anhydride: CC(=O)OC(=O)C)
180    // Cyclic anhydride: O=C[digit]OC(=O) (e.g. phthalic: O=C1OC(=O)c2ccccc21)
181    let cyclic_anhydride = (1u8..=9).any(|n| {
182        smiles.contains(&format!("O=C{}OC(=O)", n))
183    });
184    if smiles.contains("C(=O)OC(=O)") || cyclic_anhydride {
185        groups.push(FunctionalGroup::Anhydride);
186    }
187
188    // ── 2. Isocyanate ─────────────────────────────────────────────────────
189    if any(&["N=C=O", "O=C=N"]) {
190        groups.push(FunctionalGroup::Isocyanate);
191    }
192
193    // ── 3. Nitrile ────────────────────────────────────────────────────────
194    if any(&["C#N", "N#C"]) {
195        groups.push(FunctionalGroup::Nitrile);
196    }
197
198    // ── 4. Nitro ──────────────────────────────────────────────────────────
199    // PubChem canonical writes the double-bond O before N: O=[N+]([O-])
200    if any(&[
201        "O=[N+]([O-])", // PubChem canonical (nitrobenzene, TNT, etc.)
202        "[N+](=O)[O-]", // alternative bracket form
203        "N(=O)=O",
204        "[N+]([O-])=O",
205        "[N+](=O)([O-])",
206    ]) {
207        groups.push(FunctionalGroup::Nitro);
208    }
209
210    // ── 5. Epoxide (3-membered ring with O) ───────────────────────────────
211    // PubChem canonical for ethylene oxide: C1CO1 (C-C-O ring).
212    // Also handle C1OC1 (alternative) and stereocentres.
213    if any(&[
214        "C1CO1",           // ethylene oxide / PubChem canonical
215        "C1OC1",           // alternative ring ordering
216        "[C@@H]1O[C@H]1",  // stereo epoxide
217        "[C@H]1O[C@@H]1",
218    ]) {
219        groups.push(FunctionalGroup::Epoxide);
220    }
221
222    // ── 6. Sulphonic acid ─────────────────────────────────────────────────
223    if any(&["S(=O)(=O)O", "S(=O)(=O)[OH]", "S(O)(=O)=O", "[S](=O)(=O)O"]) {
224        groups.push(FunctionalGroup::SulphonicAcid);
225    }
226
227    // ── 7. Phosphate / phosphonate ────────────────────────────────────────
228    if smiles.contains('P')
229        && any(&["P(=O)(O)", "P(=O)([O", "P(O)(O)", "P([OH])", "OP(=O)", "P(=O)O"])
230    {
231        groups.push(FunctionalGroup::Phosphate);
232    }
233
234    // ── 8. Amide (before amine) ───────────────────────────────────────────
235    // Canonical: NC(=O), NC(C...)=O, C(N)=O, C(=O)N, C(=O)[NH
236    if any(&[
237        "NC(=O)", "NC(C", // NC(C...)=O  — amide N before carbonyl-C
238        "C(N)=O", "C(=O)N", "C(=O)[NH", "[NH]C(=O)", "[NH2]C(=O)",
239        "N)=O",   // -N)=O terminal amide
240    ]) {
241        // Exclude isocyanate and nitrile (already tagged)
242        let has_iso = groups.contains(&FunctionalGroup::Isocyanate);
243        let has_nitrile = groups.contains(&FunctionalGroup::Nitrile);
244        if !has_iso && !has_nitrile {
245            groups.push(FunctionalGroup::Amide);
246        }
247    }
248
249    // ── 9. Ester (before carboxylic acid) ─────────────────────────────────
250    // Canonical: OC(C...)=O (ester O before carbonyl-C), C(=O)OC
251    let has_anhydride = groups.contains(&FunctionalGroup::Anhydride);
252    if !has_anhydride
253        && any(&[
254            "OC(C)=O", "OC(=O)C", "C(=O)OC", "C(=O)Oc",  // common ester patterns
255            "OC(CC", "OC(c",  // aromatic/branched esters
256        ])
257    {
258        groups.push(FunctionalGroup::Ester);
259    }
260
261    // ── 10. Carboxylic acid ────────────────────────────────────────────────
262    // After ester to avoid false positives
263    let has_ester = groups.contains(&FunctionalGroup::Ester);
264    if !has_ester && !has_anhydride {
265        // Acid patterns: C(=O)O terminal, C(O)=O, OC(=O) at boundaries
266        // In canonical SMILES: acetic acid = CC(=O)O (O is terminal)
267        let has_acid_pattern = any(&[
268            "C(=O)O",    // acetic acid: CC(=O)O — O terminal
269            "C(O)=O",    // alternative writing
270            "C(=O)[OH]", // explicit H on O
271        ]);
272        // Exclude if the pattern belongs to carbonate or similar
273        if has_acid_pattern {
274            groups.push(FunctionalGroup::CarboxylicAcid);
275        }
276    }
277
278    // ── 11. Aldehyde ──────────────────────────────────────────────────────
279    // Terminal C=O with no second C on the carbonyl C
280    // Canonical: CC=O, O=Cc..., [CH]=O
281    let has_higher_carbonyl = groups.iter().any(|g| {
282        matches!(
283            g,
284            FunctionalGroup::Amide
285                | FunctionalGroup::Ester
286                | FunctionalGroup::CarboxylicAcid
287                | FunctionalGroup::Anhydride
288        )
289    });
290    if !has_higher_carbonyl {
291        let aldehyde = smiles.ends_with("C=O")
292            || smiles.ends_with("[CH]=O")
293            || smiles.starts_with("O=C")  // e.g. O=Cc1ccccc1 (benzaldehyde)
294            || any(&["[CH]=O", "[CHO]"]);
295        if aldehyde {
296            groups.push(FunctionalGroup::Aldehyde);
297        }
298    }
299
300    // ── 12. Ketone ────────────────────────────────────────────────────────
301    // Carbonyl C with C on both sides; canonical: CC(C)=O, CC(CC)=O
302    if !has_higher_carbonyl {
303        let has_aldehyde = groups.contains(&FunctionalGroup::Aldehyde);
304        if !has_aldehyde
305            && any(&[
306                "C(C)=O",  // CC(C)=O acetone, CC(CC)=O 2-butanone
307                "C(CC)=O", "C(CCC)=O",
308                "C(c)=O",  // aryl ketone: C(c1...)=O
309                "c(=O)C",  // aromatic ketone
310                "C(=O)C",  // alternative form: CC(=O)CC
311            ])
312        {
313            groups.push(FunctionalGroup::Ketone);
314        }
315    }
316
317    // ── 13. Phenol ────────────────────────────────────────────────────────
318    if any(&[
319        "c1ccccc1O", "Oc1ccccc1",
320        "c(O)",      // aromatic C-OH inline
321        "c([OH])",   // explicit
322        "Oc1cc", "Oc1ccc", "c1cc(O)", "c1ccc(O)",
323    ]) {
324        groups.push(FunctionalGroup::Phenol);
325    }
326
327    // ── 14. Thiol ─────────────────────────────────────────────────────────
328    // Canonical: [SH] explicit, or CS at end of string
329    if any(&["[SH]", "C[SH]", "c[SH]"])
330        || smiles.ends_with("CS")
331        || smiles.ends_with("cS")
332    {
333        groups.push(FunctionalGroup::Thiol);
334    }
335
336    // ── 15. Sulphide (after thiol and sulphonic acid) ──────────────────────
337    let has_sulphonic = groups.contains(&FunctionalGroup::SulphonicAcid);
338    let has_thiol = groups.contains(&FunctionalGroup::Thiol);
339    if !has_sulphonic
340        && !has_thiol
341        && smiles.contains('S')
342        && any(&["CSC", "cSC", "CSc", "cSc", "C(S)C"])
343    {
344        groups.push(FunctionalGroup::Sulphide);
345    }
346
347    // ── 16. Alcohol ───────────────────────────────────────────────────────
348    // Aliphatic C-OH: [OH] explicit, terminal O in chain, or (O) pendant
349    let has_phenol = groups.contains(&FunctionalGroup::Phenol);
350    let has_acid = groups.contains(&FunctionalGroup::CarboxylicAcid);
351    let has_ester2 = groups.contains(&FunctionalGroup::Ester);
352    let has_anhydride2 = groups.contains(&FunctionalGroup::Anhydride);
353    // Also guard against aldehyde: "CC=O" ends with "O" but is not an alcohol.
354    let has_aldehyde_grp = groups.contains(&FunctionalGroup::Aldehyde);
355    if !has_phenol && !has_acid && !has_ester2 && !has_anhydride2 && !has_aldehyde_grp {
356        let alcohol = any(&["[OH]", "C[OH]"])
357            || smiles.ends_with("CO")
358            || smiles.ends_with("CCO")
359            || smiles.ends_with("O")  // generic terminal O (e.g. CCO = ethanol)
360            || any(&["C(O)", "C([OH])"]);
361        if alcohol {
362            groups.push(FunctionalGroup::Alcohol);
363        }
364    }
365
366    // ── 17. Ether ─────────────────────────────────────────────────────────
367    // C-O-C not ester, not epoxide, not acid anhydride
368    let has_epoxide = groups.contains(&FunctionalGroup::Epoxide);
369    let has_ester3 = groups.contains(&FunctionalGroup::Ester);
370    let has_acid2 = groups.contains(&FunctionalGroup::CarboxylicAcid);
371    if !has_epoxide && !has_ester3 && !has_acid2 && !has_anhydride
372        && any(&["COC", "cOC", "COc", "cOc"]) {
373        groups.push(FunctionalGroup::Ether);
374    }
375
376    // ── 18. Amine ─────────────────────────────────────────────────────────
377    // N not in amide, nitrile, nitro
378    let has_amide = groups.contains(&FunctionalGroup::Amide);
379    let has_nitrile = groups.contains(&FunctionalGroup::Nitrile);
380    let has_nitro = groups.contains(&FunctionalGroup::Nitro);
381    if smiles.contains('N')
382        && !has_nitrile
383        && !has_nitro
384    {
385        // Look for amine patterns not adjacent to a carbonyl
386        let amine = any(&[
387            "CN", "NC", "[NH2]", "[NH3+]", "[NH]", "cN", "Nc",
388        ]);
389        // If amide already detected, only add amine if there's a free amine too
390        if amine && (!has_amide || any(&["[NH2]", "[NH3+]", "CN(", "N(C)C"])) {
391            groups.push(FunctionalGroup::Amine);
392        }
393    }
394
395    // ── 19. Halide ────────────────────────────────────────────────────────
396    if any(&[
397        "CF", "CCl", "CBr", "CI",
398        "Fc", "Clc", "Brc", "Ic",
399        "[F]", "[Cl]", "[Br]", "[I]",
400        "c[F]", "c[Cl]", "c[Br]", "c[I]",
401        "CF3", "CCl3", "CHF", "CHCl", "CHBr",
402    ]) {
403        groups.push(FunctionalGroup::Halide);
404    }
405
406    // ── 20. Aromatic ring (last — lowest priority) ────────────────────────
407    if smiles.chars().any(|c| matches!(c, 'c' | 'n' | 'o' | 's' | 'p')) {
408        groups.push(FunctionalGroup::AromaticRing);
409    }
410
411    groups
412}
413
414// ─────────────────────────────────────────────────────────────────────────────
415// Structural feature extraction
416// ─────────────────────────────────────────────────────────────────────────────
417
/// Atom-count and connectivity properties extracted from a SMILES string.
///
/// These supplement functional-group detection and are used by
/// [`crate::smiles::chapter_map::map_to_subheading`] to resolve
/// 4-digit HS headings to 6-digit subheadings.
///
/// Analysis is heuristic and designed for PubChem canonical SMILES.
/// Values are filled in by `detect_structural_features`; the `Default`
/// value has every count at zero and every flag `false`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct StructuralFeatures {
    /// Total carbon atom count (uppercase C + aromatic c, excluding Cl).
    pub carbon_count: u32,
    /// Estimated hydroxyl (–OH) group count.
    ///
    /// For carboxylic acids this includes the acid –OH (one per –COOH).
    /// Use `hydroxyl_count.saturating_sub(1)` when `CarboxylicAcid` is
    /// in the detected functional groups to get the extra alcohol –OH count.
    pub hydroxyl_count: u32,
    /// Number of C=O (carbonyl) groups (ketone, aldehyde, ester, acid, etc.).
    ///
    /// Computed as the number of `=O` substrings in the SMILES, so
    /// double-bonded oxygens on other atoms are included in the figure.
    pub carbonyl_count: u32,
    /// `true` when the SMILES contains a ring-closure digit outside brackets.
    pub has_ring: bool,
    /// `true` when lowercase aromatic-carbon atoms (`c`) are present.
    pub has_aromatic_ring: bool,
    /// `true` when a C=C aliphatic double bond is present.
    pub has_cc_double_bond: bool,
    /// `true` when a halogen substituent (F, Cl, Br, I) is present.
    pub has_halogen: bool,
    // ── v0.5.2 additions ─────────────────────────────────────────────────
    /// Number of C=C double bonds (counts "=C"/"=c" occurrences).
    /// Used to distinguish monoenes from dienes (e.g. isoprene has 2).
    pub cc_double_bond_count: u32,
    /// `true` when a C≡C triple bond is present (alkynes → 2901.29).
    pub has_triple_bond: bool,
    /// Number of Cl (chlorine) atoms.  Used to resolve 2903.11–2903.15.
    pub chlorine_count: u32,
    /// `true` when the SMILES contains only carbon and hydrogen atoms —
    /// no heteroatoms (O, N, S, P) and no halogens.
    /// Routes acyclic/cyclic hydrocarbons to HS 2901/2902.
    pub is_pure_hydrocarbon: bool,
    /// `true` when the SMILES contains only C, H, and Cl —
    /// no other heteroatoms and no F/Br/I.
    /// Routes simple chlorinated hydrocarbons to HS 2903.
    pub is_chloro_hydrocarbon: bool,
}
462
463/// Extract structural features from a canonical SMILES string.
464///
465/// The analysis is approximate.  Use together with [`detect_functional_groups`]
466/// to narrow 4-digit HS headings down to 6-digit subheadings.
467pub fn detect_structural_features(smiles: &str) -> StructuralFeatures {
468    StructuralFeatures {
469        carbon_count:      count_carbons(smiles),
470        hydroxyl_count:    count_hydroxyls(smiles),
471        carbonyl_count:    smiles.matches("=O").count() as u32,
472        has_ring:          ring_present(smiles),
473        has_aromatic_ring: smiles.contains('c'),
474        has_cc_double_bond: cc_double_bond_present(smiles),
475        has_halogen: smiles.contains('F')
476            || smiles.contains("Cl")
477            || smiles.contains("Br")
478            || (smiles.contains('I') && !smiles.contains("In")),
479        cc_double_bond_count: count_cc_double_bonds(smiles),
480        has_triple_bond:      detect_triple_bond(smiles),
481        chlorine_count:       count_chlorines(smiles),
482        is_pure_hydrocarbon:  is_pure_hydrocarbon_smiles(smiles),
483        is_chloro_hydrocarbon: is_chloro_only_smiles(smiles),
484    }
485}
486
/// Count carbon atoms in a SMILES string.
///
/// Handles bracket atoms (`[13C]`, `[CH2]`, `[C@@H]`, aromatic `[cH-]`) and
/// skips every element whose symbol merely starts with `C`: unbracketed `Cl`
/// as well as bracketed `[Cl-]`, `[Ca+2]`, `[Cu]`, `[Co]`, …
///
/// A bracket that is never closed contributes nothing.
fn count_carbons(smiles: &str) -> u32 {
    let mut total = 0u32;
    let mut chars = smiles.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            '[' => {
                // Collect the bracket contents and remember whether `]` was seen.
                let mut closed = false;
                let atom: String = chars
                    .by_ref()
                    .take_while(|&c| {
                        closed = c == ']';
                        !closed
                    })
                    .collect();

                // Strip leading isotope digits, then inspect the element symbol:
                // `C` followed by a lowercase letter is a two-letter element.
                let mut symbol = atom
                    .trim_start_matches(|c: char| c.is_ascii_digit())
                    .chars();
                let is_carbon = match symbol.next() {
                    Some('c') => true,
                    Some('C') => !matches!(symbol.next(), Some(c) if c.is_ascii_lowercase()),
                    _ => false,
                };
                if closed && is_carbon {
                    total += 1;
                }
            }
            'C' => {
                // Cl = chlorine, not carbon; consume the `l` as part of the token.
                if chars.next_if_eq(&'l').is_none() {
                    total += 1;
                }
            }
            'c' => total += 1,
            _ => {}
        }
    }
    total
}
523
/// Estimate the number of hydroxyl (–OH) groups in a SMILES string.
///
/// An unbracketed `O` is counted when it has at most one bond to another
/// heavy atom and that bond is not a double bond.  Bonds are inferred from
/// the text around the atom:
/// - anything written before it inside the same fragment is one bond
///   (a preceding `=` marks a carbonyl-type oxygen, which is never counted);
/// - a ring-closure digit directly after it is one bond;
/// - an atom or bond symbol after it is one bond, an opening `(` two;
///   a following `=` (oxygen-first carbonyl such as `O=C…`) is never counted.
///
/// Bracket atoms whose symbol starts with `OH` (e.g. `[OH]`) are counted too.
/// The acid –OH of a carboxylic acid is included in the result.
fn count_hydroxyls(smiles: &str) -> u32 {
    // Decide whether the unbracketed oxygen at `at` looks like an –OH oxygen.
    fn is_hydroxyl_oxygen(atoms: &[char], at: usize) -> bool {
        let is_ring_mark = |c: char| c.is_ascii_digit() || c == '%';

        let before = at.checked_sub(1).map(|p| atoms[p]);
        let in_ring = matches!(atoms.get(at + 1), Some(&c) if is_ring_mark(c));
        // First character after the atom that is not a ring-closure mark.
        let after = atoms[at + 1..].iter().copied().find(|&c| !is_ring_mark(c));

        // Double-bonded oxygen on either side: carbonyl, nitro, sulphonyl, …
        if before == Some('=') || after == Some('=') {
            return false;
        }

        let bonds_before = u32::from(!matches!(before, None | Some('.')));
        let bonds_after = match after {
            None | Some(')') | Some('.') => 0,
            // A branch directly on the oxygen implies branch + continuation.
            Some('(') => 2,
            Some(_) => 1,
        };
        bonds_before + u32::from(in_ring) + bonds_after <= 1
    }

    let chars: Vec<char> = smiles.chars().collect();
    let n = chars.len();
    let mut count = 0u32;
    let mut i = 0;

    while i < n {
        match chars[i] {
            '[' => {
                // Bracket atom: inspect everything up to `]` (or end of input).
                let close = chars[i + 1..]
                    .iter()
                    .position(|&c| c == ']')
                    .map_or(n, |offset| i + 1 + offset);
                let atom = &chars[i + 1..close];
                let symbol_start = atom
                    .iter()
                    .position(|c| !c.is_ascii_digit())
                    .unwrap_or(atom.len());
                if atom[symbol_start..].starts_with(&['O', 'H']) {
                    count += 1;
                }
                i = close + 1; // continue after `]`
            }
            'O' => {
                if is_hydroxyl_oxygen(&chars, i) {
                    count += 1;
                }
                i += 1;
            }
            _ => i += 1,
        }
    }
    count
}
576
/// Report whether the SMILES has a ring closure, i.e. any ASCII digit that
/// appears outside a `[...]` bracket atom.
fn ring_present(smiles: &str) -> bool {
    let mut inside_bracket = false;
    smiles.chars().any(|symbol| {
        match symbol {
            '[' => inside_bracket = true,
            ']' => inside_bracket = false,
            // Digits inside brackets are isotopes / H-counts / charges.
            other => return !inside_bracket && other.is_ascii_digit(),
        }
        false
    })
}
590
/// Report whether the SMILES shows a carbon–carbon double bond.
///
/// Recognised spellings are the direct forms (`C=C` in any upper/lowercase
/// combination) and the single-atom branch form `(=C)` / `(=c)`, as in
/// methacrylic acid `CC(=C)C(=O)O`.
fn cc_double_bond_present(smiles: &str) -> bool {
    const SPELLINGS: [&str; 6] = ["C=C", "c=c", "C=c", "c=C", "(=C)", "(=c)"];
    SPELLINGS.iter().any(|spelling| smiles.contains(spelling))
}
602
/// Count double bonds that lead into a carbon atom, i.e. the number of
/// `=C` / `=c` byte pairs in the SMILES.
fn count_cc_double_bonds(smiles: &str) -> u32 {
    smiles
        .as_bytes()
        .windows(2)
        .filter(|pair| pair[0] == b'=' && matches!(pair[1], b'C' | b'c'))
        .count() as u32
}
614
/// Report whether the SMILES contains a carbon–carbon triple bond (alkyne):
/// a `#` with a `C` or `c` directly on both sides.
fn detect_triple_bond(smiles: &str) -> bool {
    let is_carbon = |byte: u8| matches!(byte, b'C' | b'c');
    smiles
        .as_bytes()
        .windows(3)
        .any(|w| w[1] == b'#' && is_carbon(w[0]) && is_carbon(w[2]))
}
619
/// Count chlorine atoms, i.e. occurrences of the two-character token `Cl`
/// anywhere in the SMILES (bracketed or not).
fn count_chlorines(smiles: &str) -> u32 {
    // "Cl" cannot overlap with itself, so non-overlapping matching is exact.
    smiles.matches("Cl").count() as u32
}
631
/// Return `true` when the SMILES contains only C and H atoms — no
/// heteroatoms (O, N, S, P, B, Si, …), no halogens and no metals.
///
/// Implemented as a whitelist: every letter in the string must be `C`, `c`
/// or `H`.  Any other letter belongs to a different element (including the
/// `l` of `Cl` and the second letter of symbols such as `[Ca]`), so boranes
/// like `CB(C)C` and organometallics like `[Li]CCCC` are rejected.
/// Non-letter characters (bonds, digits, brackets, charges, `@`) are ignored.
///
/// A string without any letters (including the empty string) yields `true`.
fn is_pure_hydrocarbon_smiles(smiles: &str) -> bool {
    smiles
        .chars()
        .filter(|c| c.is_alphabetic())
        .all(|c| matches!(c, 'C' | 'c' | 'H'))
}
648
/// Return `true` when the SMILES contains only C, H, and Cl —
/// no other heteroatoms, no F/Br/I halogens and no metals — and at least
/// one chlorine atom is present.
///
/// Implemented as a whitelist: the only letters allowed are `C`, `c`, `H`
/// and an `l` that directly follows `C` (the `Cl` token).  Any other letter
/// belongs to a different element, so Grignard reagents such as `CC[Mg]Cl`
/// or `[Al](Cl)(Cl)Cl` are rejected.  Non-letter characters are ignored.
fn is_chloro_only_smiles(smiles: &str) -> bool {
    let mut saw_chlorine = false;
    let mut chars = smiles.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            'C' => {
                // Consume the `l` so it is not judged as a stand-alone letter.
                if chars.next_if_eq(&'l').is_some() {
                    saw_chlorine = true;
                }
            }
            'c' | 'H' => {}
            other if other.is_alphabetic() => return false,
            _ => {}
        }
    }
    saw_chlorine
}
665
666// ─────────────────────────────────────────────────────────────────────────────
667// Tests
668// ─────────────────────────────────────────────────────────────────────────────
669
670#[cfg(test)]
671mod tests {
672    use super::*;
673
674    fn fg(smiles: &str) -> Vec<FunctionalGroup> {
675        detect_functional_groups(smiles)
676    }
677
678    fn has(smiles: &str, g: FunctionalGroup) -> bool {
679        fg(smiles).contains(&g)
680    }
681
682    // ── Organic / inorganic ───────────────────────────────────────────────
683
684    #[test]
685    fn co2_is_inorganic() {
686        assert_eq!(classify_organic("O=C=O"), OrganicInorganic::Inorganic);
687    }
688
689    #[test]
690    fn water_is_inorganic() {
691        assert_eq!(classify_organic("O"), OrganicInorganic::Inorganic);
692    }
693
694    #[test]
695    fn ethanol_is_organic() {
696        assert_eq!(classify_organic("CCO"), OrganicInorganic::Organic);
697    }
698
699    #[test]
700    fn benzene_is_organic() {
701        assert_eq!(classify_organic("c1ccccc1"), OrganicInorganic::Organic);
702    }
703
704    // ── Functional group detection ────────────────────────────────────────
705
706    #[test]
707    fn acetic_acid_detected() {
708        // CC(=O)O — acetic acid (PubChem canonical)
709        assert!(has("CC(=O)O", FunctionalGroup::CarboxylicAcid));
710        assert!(!has("CC(=O)O", FunctionalGroup::Ester));
711    }
712
713    #[test]
714    fn ethyl_acetate_detected_as_ester() {
715        // CCOC(C)=O — ethyl acetate (PubChem canonical)
716        assert!(has("CCOC(C)=O", FunctionalGroup::Ester));
717        assert!(!has("CCOC(C)=O", FunctionalGroup::CarboxylicAcid));
718    }
719
720    #[test]
721    fn phthalic_anhydride_detected() {
722        // O=C1OC(=O)c2ccccc21
723        let groups = fg("O=C1OC(=O)c2ccccc21");
724        assert!(groups.contains(&FunctionalGroup::Anhydride));
725        assert!(!groups.contains(&FunctionalGroup::Ester));
726    }
727
728    #[test]
729    fn acetaldehyde_detected() {
730        // CC=O
731        assert!(has("CC=O", FunctionalGroup::Aldehyde));
732        assert!(!has("CC=O", FunctionalGroup::Ketone));
733    }
734
735    /// Regression test: "CC=O" (acetaldehyde) must NOT be classified as Alcohol.
736    /// The terminal "O" in "CC=O" was previously caught by the generic
737    /// `smiles.ends_with("O")` check in the alcohol branch.
738    #[test]
739    fn acetaldehyde_not_classified_as_alcohol() {
740        assert!(!has("CC=O", FunctionalGroup::Alcohol),
741            "aldehyde SMILES 'CC=O' must not produce Alcohol group");
742    }
743
744    #[test]
745    fn acetone_detected_as_ketone() {
746        // CC(C)=O — PubChem canonical
747        assert!(has("CC(C)=O", FunctionalGroup::Ketone));
748        assert!(!has("CC(C)=O", FunctionalGroup::Aldehyde));
749    }
750
751    #[test]
752    fn ethanol_detected_as_alcohol() {
753        // CCO
754        assert!(has("CCO", FunctionalGroup::Alcohol));
755        assert!(!has("CCO", FunctionalGroup::Ether));
756    }
757
758    #[test]
759    fn dimethyl_ether_detected() {
760        // COC
761        assert!(has("COC", FunctionalGroup::Ether));
762        assert!(!has("COC", FunctionalGroup::Alcohol));
763    }
764
765    #[test]
766    fn methylamine_detected() {
767        // CN — methylamine
768        assert!(has("CN", FunctionalGroup::Amine));
769    }
770
771    #[test]
772    fn acetamide_detected() {
773        // CC(N)=O — acetamide (PubChem canonical)
774        assert!(has("CC(N)=O", FunctionalGroup::Amide));
775        assert!(!has("CC(N)=O", FunctionalGroup::Ketone));
776    }
777
778    #[test]
779    fn acetonitrile_detected() {
780        // CC#N
781        assert!(has("CC#N", FunctionalGroup::Nitrile));
782    }
783
784    #[test]
785    fn chloromethane_detected() {
786        // CCl
787        assert!(has("CCl", FunctionalGroup::Halide));
788    }
789
790    #[test]
791    fn ethylene_oxide_detected() {
792        // C1CO1 — ethylene oxide (PubChem canonical)
793        assert!(has("C1CO1", FunctionalGroup::Epoxide));
794    }
795
796    #[test]
797    fn benzene_detected_as_aromatic() {
798        assert!(has("c1ccccc1", FunctionalGroup::AromaticRing));
799    }
800
801    #[test]
802    fn phenol_detected() {
803        // Oc1ccccc1
804        assert!(has("Oc1ccccc1", FunctionalGroup::Phenol));
805    }
806
807    #[test]
808    fn nitrobenzene_detected() {
809        // O=[N+]([O-])c1ccccc1
810        assert!(has("O=[N+]([O-])c1ccccc1", FunctionalGroup::Nitro));
811    }
812
813    #[test]
814    fn ethanesulfonic_acid_detected() {
815        // CCS(=O)(=O)O
816        assert!(has("CCS(=O)(=O)O", FunctionalGroup::SulphonicAcid));
817    }
818
819    #[test]
820    fn dimethyl_sulfide_detected() {
821        // CSC
822        assert!(has("CSC", FunctionalGroup::Sulphide));
823    }
824
825    #[test]
826    fn methanethiol_detected() {
827        // C[SH]
828        assert!(has("C[SH]", FunctionalGroup::Thiol));
829    }
830
831    #[test]
832    fn isocyanate_detected() {
833        // CN=C=O — methyl isocyanate
834        assert!(has("CN=C=O", FunctionalGroup::Isocyanate));
835    }
836
837    #[test]
838    fn trimethyl_phosphate_detected() {
839        // COP(=O)(OC)OC
840        assert!(has("COP(=O)(OC)OC", FunctionalGroup::Phosphate));
841    }
842
843    // ── StructuralFeatures ────────────────────────────────────────────────
844
845    fn sf(smiles: &str) -> StructuralFeatures {
846        detect_structural_features(smiles)
847    }
848
849    #[test]
850    fn acetone_carbon_count_3() {
851        // CC(C)=O — 3 carbons, no ring, no aromatic, no C=C
852        let f = sf("CC(C)=O");
853        assert_eq!(f.carbon_count, 3);
854        assert!(!f.has_ring);
855        assert!(!f.has_aromatic_ring);
856        assert!(!f.has_cc_double_bond);
857        assert_eq!(f.carbonyl_count, 1);
858    }
859
860    #[test]
861    fn ethanol_hydroxyl_count_1() {
862        // CCO — 2 carbons, 1 OH
863        let f = sf("CCO");
864        assert_eq!(f.carbon_count, 2);
865        assert_eq!(f.hydroxyl_count, 1);
866    }
867
868    #[test]
869    fn ethylene_glycol_hydroxyl_count_2() {
870        // OCCO — 2 carbons, 2 OH
871        let f = sf("OCCO");
872        assert_eq!(f.carbon_count, 2);
873        assert_eq!(f.hydroxyl_count, 2);
874    }
875
876    #[test]
877    fn glycerol_hydroxyl_count_3() {
878        // OCC(O)CO — 3 carbons, 3 OH
879        let f = sf("OCC(O)CO");
880        assert_eq!(f.carbon_count, 3);
881        assert_eq!(f.hydroxyl_count, 3);
882    }
883
884    #[test]
885    fn ether_oxygen_not_counted_as_oh() {
886        // COC — dimethyl ether, 0 OH
887        let f = sf("COC");
888        assert_eq!(f.hydroxyl_count, 0);
889    }
890
891    #[test]
892    fn acetic_acid_one_oh() {
893        // CC(=O)O — acetic acid: 1 carbonyl + 1 acid OH
894        let f = sf("CC(=O)O");
895        assert_eq!(f.carbon_count, 2);
896        assert_eq!(f.hydroxyl_count, 1);
897        assert_eq!(f.carbonyl_count, 1);
898    }
899
900    #[test]
901    fn acrylic_acid_has_cc_double_bond() {
902        // C=CC(=O)O — acrylic acid: C=C present
903        let f = sf("C=CC(=O)O");
904        assert!(f.has_cc_double_bond);
905        assert_eq!(f.carbon_count, 3);
906    }
907
908    #[test]
909    fn methacrylic_acid_has_cc_double_bond() {
910        // CC(=C)C(=O)O — methacrylic acid: branch C=C
911        let f = sf("CC(=C)C(=O)O");
912        assert!(f.has_cc_double_bond);
913        assert_eq!(f.carbon_count, 4);
914    }
915
916    #[test]
917    fn benzene_has_aromatic_ring() {
918        let f = sf("c1ccccc1");
919        assert!(f.has_ring);
920        assert!(f.has_aromatic_ring);
921        assert_eq!(f.carbon_count, 6);
922    }
923
924    #[test]
925    fn cyclohexanone_is_ring_no_aromatic() {
926        // O=C1CCCCC1 — cyclohexanone: 6C, ring, no aromatic
927        let f = sf("O=C1CCCCC1");
928        assert!(f.has_ring);
929        assert!(!f.has_aromatic_ring);
930        assert_eq!(f.carbon_count, 6);
931    }
932
933    #[test]
934    fn chlorobenzene_has_halogen() {
935        let f = sf("Clc1ccccc1");
936        assert!(f.has_halogen);
937        assert_eq!(f.carbon_count, 6);
938    }
939
940    #[test]
941    fn methanol_carbon_count_1() {
942        let f = sf("CO");
943        assert_eq!(f.carbon_count, 1);
944        assert_eq!(f.hydroxyl_count, 1);
945    }
946
947    #[test]
948    fn isoprene_structural_features() {
949        let f = detect_structural_features("C=CC(C)=C");
950        assert_eq!(f.cc_double_bond_count, 2);
951        assert_eq!(f.carbon_count, 5);
952        assert!(f.is_pure_hydrocarbon);
953        assert!(!f.has_ring);
954        assert!(!f.has_triple_bond);
955    }
956
957    #[test]
958    fn dcm_structural_features() {
959        let f = detect_structural_features("ClCCl");
960        assert_eq!(f.chlorine_count, 2);
961        assert_eq!(f.carbon_count, 1);
962        assert!(f.is_chloro_hydrocarbon);
963        assert!(!f.is_pure_hydrocarbon);
964    }
965
966    #[test]
967    fn cyclohexane_structural_features() {
968        let f = detect_structural_features("C1CCCCC1");
969        assert!(f.has_ring);
970        assert!(!f.has_aromatic_ring);
971        assert!(f.is_pure_hydrocarbon);
972        assert_eq!(f.carbon_count, 6);
973        assert_eq!(f.cc_double_bond_count, 0);
974    }
975
976    #[test]
977    fn ethylene_cc_count_1() {
978        let f = detect_structural_features("C=C");
979        assert_eq!(f.cc_double_bond_count, 1);
980        assert!(f.is_pure_hydrocarbon);
981    }
982
983    #[test]
984    fn chloroform_chlorine_count_3() {
985        let f = detect_structural_features("ClC(Cl)Cl");
986        assert_eq!(f.chlorine_count, 3);
987        assert_eq!(f.carbon_count, 1);
988        assert!(f.is_chloro_hydrocarbon);
989    }
990
991    #[test]
992    fn acetone_not_pure_hydrocarbon() {
993        let f = detect_structural_features("CC(C)=O");
994        assert!(!f.is_pure_hydrocarbon);
995        assert!(!f.is_chloro_hydrocarbon);
996        // cc_double_bond_count: "=O" does not have 'C' after '=', but check "=C" is 0
997        // acetone has C=O not C=C, count should be 0
998        assert_eq!(f.cc_double_bond_count, 0);
999    }
1000}