Skip to main content

hs_predict/smiles/
mod.rs

1//! SMILES-based functional group detection and chapter-level HS classification.
2//!
3//! This module provides a pattern-matching engine that inspects a canonical
4//! SMILES string and infers:
5//!
6//! 1. **Organic vs. inorganic** classification
7//! 2. **Functional groups** present (up to 20 categories)
8//! 3. **HS chapter / heading hint** (approximate, confidence ≤ 0.70)
9//!
10//! The engine is used as Priority 3 in [`HsPipeline::classify`] when the CAS
11//! rule table (Priority 2) finds no match but a SMILES string is available.
12//!
13//! [`HsPipeline::classify`]: crate::pipeline::HsPipeline::classify
14//!
15//! # Example
16//! ```rust
17//! use hs_predict::smiles::classify_smiles;
18//! use hs_predict::smiles::detector::FunctionalGroup;
19//!
20//! let result = classify_smiles("CC(C)=O").unwrap(); // acetone
21//! assert_eq!(result.heading_hint.heading, Some(2914)); // ketone → 29.14
22//! ```
23
24pub mod chapter_map;
25pub mod detector;
26
27pub use chapter_map::HeadingHint;
28pub use detector::{FunctionalGroup, StructuralFeatures};
29
30use crate::types::OrganicInorganic;
31
32// ─────────────────────────────────────────────────────────────────────────────
33// SmilesClassification
34// ─────────────────────────────────────────────────────────────────────────────
35
36/// Result of SMILES-based functional group analysis and HS heading estimation.
37#[derive(Debug, Clone, serde::Serialize)]
38pub struct SmilesClassification {
39    /// Whether the compound is organic, inorganic, or organometallic.
40    pub organic_class: OrganicInorganic,
41
42    /// Functional groups detected in the SMILES string.
43    /// May be empty for simple hydrocarbons (alkanes, alkenes, etc.).
44    pub functional_groups: Vec<FunctionalGroup>,
45
46    /// Structural atom-count and connectivity properties.
47    pub structural_features: StructuralFeatures,
48
49    /// Best-guess HS chapter / heading (and 6-digit subheading when
50    /// determinable) based on detected groups and structural features.
51    pub heading_hint: HeadingHint,
52}
53
54// ─────────────────────────────────────────────────────────────────────────────
55// Public entry point
56// ─────────────────────────────────────────────────────────────────────────────
57
58/// Analyse a SMILES string and return a chapter-level HS classification hint.
59///
60/// # Returns
61/// - `Some(SmilesClassification)` — analysis result; use
62///   [`SmilesClassification::heading_hint`] for the HS heading.
63/// - `None` — the SMILES string is empty or whitespace-only.
64///
65/// # Notes
66/// - Detection is based on substring matching against canonical SMILES
67///   (as produced by PubChem). Non-canonical or hand-written SMILES may
68///   yield reduced accuracy.
69/// - Results carry confidence ≤ 0.70; always verify with a trade-compliance
70///   expert before using in a customs declaration.
71///
72/// # Example
73/// ```rust
74/// use hs_predict::smiles::classify_smiles;
75///
76/// // Benzaldehyde → aldehyde → 29.12
77/// let r = classify_smiles("O=Cc1ccccc1").unwrap();
78/// assert_eq!(r.heading_hint.heading, Some(2912));
79///
80/// // Acetic acid → carboxylic acid → 29.15
81/// let r = classify_smiles("CC(=O)O").unwrap();
82/// assert_eq!(r.heading_hint.heading, Some(2915));
83/// ```
84/// Maximum accepted SMILES string length (bytes).
85///
86/// SMILES strings for real-world compounds are at most a few thousand
87/// characters.  This limit prevents algorithmic-complexity denial of service
88/// from excessively long inputs.
89pub const MAX_SMILES_LEN: usize = 4096;
90
91pub fn classify_smiles(smiles: &str) -> Option<SmilesClassification> {
92    let smiles = smiles.trim();
93    if smiles.is_empty() || smiles.len() > MAX_SMILES_LEN {
94        return None;
95    }
96
97    let organic_class = detector::classify_organic(smiles);
98    let functional_groups = detector::detect_functional_groups(smiles);
99    let structural_features = detector::detect_structural_features(smiles);
100    let heading_hint = chapter_map::map_to_subheading(
101        &functional_groups,
102        &organic_class,
103        &structural_features,
104    );
105
106    Some(SmilesClassification {
107        organic_class,
108        functional_groups,
109        structural_features,
110        heading_hint,
111    })
112}
113
114// ─────────────────────────────────────────────────────────────────────────────
115// Tests
116// ─────────────────────────────────────────────────────────────────────────────
117
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_smiles_returns_none() {
        assert!(classify_smiles("").is_none());
        assert!(classify_smiles("   ").is_none());
    }

    #[test]
    fn oversized_smiles_returns_none() {
        // One byte over the limit must be rejected before any analysis runs.
        let too_long = "C".repeat(MAX_SMILES_LEN + 1);
        assert!(classify_smiles(&too_long).is_none());
    }

    #[test]
    fn length_limit_applies_after_trimming() {
        // The raw input exceeds MAX_SMILES_LEN only because of surrounding
        // whitespace; the trimmed string ("CCO") is well within the limit.
        let padded = format!("{}CCO\n", " ".repeat(MAX_SMILES_LEN));
        assert!(padded.len() > MAX_SMILES_LEN);
        assert!(classify_smiles(&padded).is_some());
    }

    #[test]
    fn acetone_ketone_heading() {
        // CC(C)=O — acetone (PubChem canonical)
        let r = classify_smiles("CC(C)=O").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2914));
        assert!(r.functional_groups.contains(&FunctionalGroup::Ketone));
        assert!(matches!(r.organic_class, OrganicInorganic::Organic));
    }

    #[test]
    fn acetic_acid_heading() {
        // CC(=O)O — acetic acid
        let r = classify_smiles("CC(=O)O").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2915));
        assert!(r.functional_groups.contains(&FunctionalGroup::CarboxylicAcid));
    }

    #[test]
    fn ethyl_acetate_heading() {
        // CCOC(C)=O — ethyl acetate
        let r = classify_smiles("CCOC(C)=O").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2915));
        assert!(r.functional_groups.contains(&FunctionalGroup::Ester));
    }

    #[test]
    fn benzaldehyde_heading() {
        // O=Cc1ccccc1 — benzaldehyde
        let r = classify_smiles("O=Cc1ccccc1").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2912));
        assert!(r.functional_groups.contains(&FunctionalGroup::Aldehyde));
    }

    #[test]
    fn ethanol_heading() {
        // CCO — ethanol: structural engine routes to HS 22.07 (ethyl alcohol),
        // not 29.05.  This is the correct WCO classification.
        let r = classify_smiles("CCO").unwrap();
        assert_eq!(r.heading_hint.chapter, 22);
        assert_eq!(r.heading_hint.heading, Some(2207));
        assert_eq!(r.heading_hint.subheading.as_deref(), Some("220710"));
        assert!(r.functional_groups.contains(&FunctionalGroup::Alcohol));
    }

    #[test]
    fn methylamine_heading() {
        // CN — methylamine
        let r = classify_smiles("CN").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2921));
    }

    #[test]
    fn chlorobenzene_heading() {
        // Clc1ccccc1 — chlorobenzene
        let r = classify_smiles("Clc1ccccc1").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2903));
        assert!(r.functional_groups.contains(&FunctionalGroup::Halide));
    }

    #[test]
    fn co2_is_inorganic_ch28() {
        // O=C=O — carbon dioxide
        let r = classify_smiles("O=C=O").unwrap();
        assert_eq!(r.heading_hint.chapter, 28);
        assert!(matches!(r.organic_class, OrganicInorganic::Inorganic));
    }

    #[test]
    fn epoxide_heading() {
        // C1CO1 — ethylene oxide
        let r = classify_smiles("C1CO1").unwrap();
        assert_eq!(r.heading_hint.heading, Some(2910));
    }

    #[test]
    fn phthalic_anhydride_heading() {
        // O=C1OC(=O)c2ccccc21
        let r = classify_smiles("O=C1OC(=O)c2ccccc21").unwrap();
        assert!(r.functional_groups.contains(&FunctionalGroup::Anhydride));
        assert_eq!(r.heading_hint.heading, Some(2915));
    }
}