Skip to main content

hs_predict/smiles/
mod.rs

1//! SMILES-based functional group detection and chapter-level HS classification.
2//!
3//! This module provides a pattern-matching engine that inspects a canonical
4//! SMILES string and infers:
5//!
6//! 1. **Organic vs. inorganic** classification
7//! 2. **Functional groups** present (up to 20 categories)
8//! 3. **HS chapter / heading hint** (approximate, confidence ≤ 0.70)
9//!
10//! The engine is used as Priority 3 in [`HsPipeline::classify`] when the CAS
11//! rule table (Priority 2) finds no match but a SMILES string is available.
12//!
13//! [`HsPipeline::classify`]: crate::pipeline::HsPipeline::classify
14//!
15//! # Example
//! ```rust
//! use hs_predict::smiles::classify_smiles;
//! use hs_predict::smiles::detector::FunctionalGroup;
//!
//! let result = classify_smiles("CC(C)=O").unwrap(); // acetone
//! assert!(result.functional_groups.contains(&FunctionalGroup::Ketone));
//! assert_eq!(result.heading_hint.heading, Some(2914)); // ketone → 29.14
//! ```
23
24pub mod chapter_map;
25pub mod detector;
26
27pub use chapter_map::HeadingHint;
28pub use detector::FunctionalGroup;
29
30use crate::types::OrganicInorganic;
31
32// ─────────────────────────────────────────────────────────────────────────────
33// SmilesClassification
34// ─────────────────────────────────────────────────────────────────────────────
35
36/// Result of SMILES-based functional group analysis and HS heading estimation.
37#[derive(Debug, Clone, serde::Serialize)]
38pub struct SmilesClassification {
39    /// Whether the compound is organic, inorganic, or organometallic.
40    pub organic_class: OrganicInorganic,
41
42    /// Functional groups detected in the SMILES string.
43    /// May be empty for simple hydrocarbons (alkanes, alkenes, etc.).
44    pub functional_groups: Vec<FunctionalGroup>,
45
46    /// Best-guess HS chapter / heading based on detected groups.
47    pub heading_hint: HeadingHint,
48}
49
50// ─────────────────────────────────────────────────────────────────────────────
51// Public entry point
52// ─────────────────────────────────────────────────────────────────────────────
53
54/// Analyse a SMILES string and return a chapter-level HS classification hint.
55///
56/// # Returns
57/// - `Some(SmilesClassification)` — analysis result; use
58///   [`SmilesClassification::heading_hint`] for the HS heading.
59/// - `None` — the SMILES string is empty or whitespace-only.
60///
61/// # Notes
62/// - Detection is based on substring matching against canonical SMILES
63///   (as produced by PubChem). Non-canonical or hand-written SMILES may
64///   yield reduced accuracy.
65/// - Results carry confidence ≤ 0.70; always verify with a trade-compliance
66///   expert before using in a customs declaration.
67///
68/// # Example
69/// ```rust
70/// use hs_predict::smiles::classify_smiles;
71///
72/// // Benzaldehyde → aldehyde → 29.12
73/// let r = classify_smiles("O=Cc1ccccc1").unwrap();
74/// assert_eq!(r.heading_hint.heading, Some(2912));
75///
76/// // Acetic acid → carboxylic acid → 29.15
77/// let r = classify_smiles("CC(=O)O").unwrap();
78/// assert_eq!(r.heading_hint.heading, Some(2915));
79/// ```
80/// Maximum accepted SMILES string length (bytes).
81///
82/// SMILES strings for real-world compounds are at most a few thousand
83/// characters.  This limit prevents algorithmic-complexity denial of service
84/// from excessively long inputs.
85pub const MAX_SMILES_LEN: usize = 4096;
86
87pub fn classify_smiles(smiles: &str) -> Option<SmilesClassification> {
88    let smiles = smiles.trim();
89    if smiles.is_empty() || smiles.len() > MAX_SMILES_LEN {
90        return None;
91    }
92
93    let organic_class = detector::classify_organic(smiles);
94    let functional_groups = detector::detect_functional_groups(smiles);
95    let heading_hint = chapter_map::map_to_heading(&functional_groups, &organic_class);
96
97    Some(SmilesClassification {
98        organic_class,
99        functional_groups,
100        heading_hint,
101    })
102}
103
104// ─────────────────────────────────────────────────────────────────────────────
105// Tests
106// ─────────────────────────────────────────────────────────────────────────────
107
#[cfg(test)]
mod tests {
    use super::*;

    // ── Input validation ────────────────────────────────────────────────────

    #[test]
    fn empty_smiles_returns_none() {
        assert!(classify_smiles("").is_none());
        assert!(classify_smiles("   ").is_none());
    }

    #[test]
    fn overlong_smiles_returns_none() {
        // One byte past the limit must be rejected by the length guard.
        let too_long = "C".repeat(MAX_SMILES_LEN + 1);
        assert!(classify_smiles(&too_long).is_none());
    }

    #[test]
    fn smiles_at_length_limit_is_accepted() {
        // The guard uses `>`, so exactly MAX_SMILES_LEN bytes is still valid.
        let at_limit = "C".repeat(MAX_SMILES_LEN);
        assert!(classify_smiles(&at_limit).is_some());
    }

    #[test]
    fn surrounding_whitespace_is_trimmed() {
        // Padding must not change the outcome: the input is trimmed first.
        let plain = classify_smiles("CCO").unwrap();
        let padded = classify_smiles("  CCO\n").unwrap();
        assert_eq!(padded.heading_hint.heading, plain.heading_hint.heading);
        assert_eq!(padded.functional_groups, plain.functional_groups);
    }

    // ── Heading hints for representative compounds ──────────────────────────

    #[test]
    fn acetone_ketone_heading() {
        // CC(C)=O — acetone (PubChem canonical)
        let r = classify_smiles("CC(C)=O").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2914));
        assert!(r.functional_groups.contains(&FunctionalGroup::Ketone));
        assert!(matches!(r.organic_class, OrganicInorganic::Organic));
    }

    #[test]
    fn acetic_acid_heading() {
        // CC(=O)O — acetic acid
        let r = classify_smiles("CC(=O)O").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2915));
        assert!(r.functional_groups.contains(&FunctionalGroup::CarboxylicAcid));
    }

    #[test]
    fn ethyl_acetate_heading() {
        // CCOC(C)=O — ethyl acetate
        let r = classify_smiles("CCOC(C)=O").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2915));
        assert!(r.functional_groups.contains(&FunctionalGroup::Ester));
    }

    #[test]
    fn benzaldehyde_heading() {
        // O=Cc1ccccc1 — benzaldehyde
        let r = classify_smiles("O=Cc1ccccc1").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2912));
        assert!(r.functional_groups.contains(&FunctionalGroup::Aldehyde));
    }

    #[test]
    fn ethanol_heading() {
        // CCO — ethanol
        let r = classify_smiles("CCO").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2905));
        assert!(r.functional_groups.contains(&FunctionalGroup::Alcohol));
    }

    #[test]
    fn methylamine_heading() {
        // CN — methylamine
        let r = classify_smiles("CN").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2921));
    }

    #[test]
    fn chlorobenzene_heading() {
        // Clc1ccccc1 — chlorobenzene
        let r = classify_smiles("Clc1ccccc1").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2903));
        assert!(r.functional_groups.contains(&FunctionalGroup::Halide));
    }

    #[test]
    fn co2_is_inorganic_ch28() {
        // O=C=O — carbon dioxide
        let r = classify_smiles("O=C=O").unwrap();
        assert_eq!(r.heading_hint.chapter, 28);
        assert!(matches!(r.organic_class, OrganicInorganic::Inorganic));
    }

    #[test]
    fn epoxide_heading() {
        // C1CO1 — ethylene oxide
        let r = classify_smiles("C1CO1").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2910));
    }

    #[test]
    fn phthalic_anhydride_heading() {
        // O=C1OC(=O)c2ccccc21 — phthalic anhydride
        let r = classify_smiles("O=C1OC(=O)c2ccccc21").unwrap();
        assert!(r.functional_groups.contains(&FunctionalGroup::Anhydride));
        assert_eq!(r.heading_hint.heading, Some(2915));
    }
}