Skip to main content

hs_predict/pubchem/
mod.rs

1//! PubChem REST API client.
2//!
3//! Requires the **`pubchem`** Cargo feature.
4//!
5//! # Purpose
6//!
7//! Enriches a [`SubstanceIdentifier`] with structural data fetched from the
8//! [PubChem PUG REST API](https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest):
9//! - CAS number → SMILES, InChI, InChIKey, IUPAC name, CID
10//! - IUPAC name → SMILES, CID, …
11//! - SMILES / InChIKey / InChI → CID + remaining fields
12//!
13//! # Usage
14//!
15//! ```rust,no_run
16//! # #[cfg(feature = "pubchem")]
17//! # async fn example() -> hs_predict::Result<()> {
18//! use hs_predict::pipeline::HsPipeline;
19//! use hs_predict::pubchem::PubChemClient;
20//! use hs_predict::types::{ProductDescription, SubstanceIdentifier};
21//!
22//! let pipeline = HsPipeline::new()
23//!     .with_pubchem(PubChemClient::new());
24//!
25//! let mut product = ProductDescription {
26//!     identifier: SubstanceIdentifier::from_cas("1310-73-2"),
27//!     physical_form: None,
28//!     purity_pct: None,
29//!     purity_type: None,
30//!     mixture_components: None,
31//!     intended_use: None,
32//!     additional_context: None,
33//! };
34//!
35//! // Enrich: CAS 1310-73-2 → SMILES "[Na+].[OH-]", IUPAC "sodium hydroxide", …
36//! pipeline.enrich(&mut product).await?;
37//!
38//! // Classify as normal (SMILES now available → better matching)
39//! let prediction = pipeline.classify(&product)?;
40//! println!("{}", prediction.display());
41//! # Ok(())
42//! # }
43//! ```
44//!
45//! # Rate limiting
46//!
47//! PubChem allows up to **5 requests / second** without an API key.
48//! [`PubChemClient`] enforces this automatically via an internal token-bucket
49//! rate limiter ([`governor`]).
50//!
51//! # Caching
52//!
53//! Responses are cached by PubChem CID using [`moka`] with a 24-hour TTL and
54//! a 1 000-entry capacity. The cache is only consulted when the identifier already
55//! carries a CID (e.g. after a previous enrichment); other lookups hit the network.
56
57mod error;
58
59pub use error::PubChemError;
60
61use std::num::NonZeroU32;
62use std::sync::Arc;
63use std::time::Duration;
64
65use governor::{DefaultDirectRateLimiter, Quota, RateLimiter};
66use moka::future::Cache;
67use serde::Deserialize;
68use urlencoding::encode;
69
70use crate::error::{HsPredictError, Result};
71use crate::types::SubstanceIdentifier;
72
73// ─── PubChem API constants ────────────────────────────────────────────────────
74
/// Root of the PubChem PUG REST service; every request URL is built on top of it.
const BASE_URL: &str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug";

/// Comma-separated PubChem property names requested on every lookup.
const PROPERTIES: &str =
    "IUPACName,CanonicalSMILES,InChIKey,InChI,MolecularFormula,MolecularWeight";
80
81// ─── Result type ─────────────────────────────────────────────────────────────
82
/// Structural data for a single compound, as produced by a successful
/// PubChem lookup.
///
/// Only the CID is guaranteed; every other property is optional.
#[derive(Debug, Clone)]
pub struct PubChemCompound {
    /// PubChem Compound ID (CID).
    pub cid: u64,
    /// IUPAC name preferred by PubChem, when available.
    pub iupac_name: Option<String>,
    /// Canonical SMILES string, when available.
    pub canonical_smiles: Option<String>,
    /// Standard InChI string, when available.
    pub inchi: Option<String>,
    /// InChIKey (27 characters), when available.
    pub inchi_key: Option<String>,
    /// Molecular formula in Hill notation, when available.
    pub molecular_formula: Option<String>,
    /// Molecular weight in g/mol, when available.
    pub molecular_weight: Option<f64>,
}
101
102impl PubChemCompound {
103    /// Copy fields from this compound into `id`, filling only the **missing** fields.
104    ///
105    /// The CID is always set. Other fields are only written if the identifier
106    /// field is currently `None`.
107    pub fn apply_to(&self, id: &mut SubstanceIdentifier) {
108        id.cid = Some(self.cid);
109        if id.smiles.is_none() {
110            id.smiles = self.canonical_smiles.clone();
111        }
112        if id.iupac_name.is_none() {
113            id.iupac_name = self.iupac_name.clone();
114        }
115        if id.inchi.is_none() {
116            id.inchi = self.inchi.clone();
117        }
118        if id.inchi_key.is_none() {
119            id.inchi_key = self.inchi_key.clone();
120        }
121    }
122}
123
124// ─── Client ──────────────────────────────────────────────────────────────────
125
126/// PubChem REST API client with built-in rate limiting and in-memory caching.
127///
128/// Cheap to clone — all internal state is reference-counted.
129#[derive(Clone)]
130pub struct PubChemClient {
131    http: reqwest::Client,
132    /// CID → compound (24 h TTL, capacity 1 000).
133    cache: Cache<u64, Arc<PubChemCompound>>,
134    limiter: Arc<DefaultDirectRateLimiter>,
135    /// Configurable base URL (override for testing).
136    base_url: String,
137}
138
139impl std::fmt::Debug for PubChemClient {
140    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
141        f.debug_struct("PubChemClient")
142            .field("base_url", &self.base_url)
143            .finish_non_exhaustive()
144    }
145}
146
147impl Default for PubChemClient {
148    fn default() -> Self {
149        Self::new()
150    }
151}
152
153impl PubChemClient {
154    /// Create a client with PubChem's default rate limit (5 req/s).
155    pub fn new() -> Self {
156        Self::builder().build()
157    }
158
159    /// Start building a customised client.
160    pub fn builder() -> PubChemClientBuilder {
161        PubChemClientBuilder::default()
162    }
163
164    // ── Core lookup ──────────────────────────────────────────────────
165
166    /// Look up a compound by the best available identifier.
167    ///
168    /// Priority: CAS number → InChIKey → InChI → SMILES → IUPAC name.
169    ///
170    /// Results are cached by CID, so repeated calls for the same compound
171    /// are free after the first network request.
172    ///
173    /// # Errors
174    /// - [`PubChemError::NotFound`] — no compound matched.
175    /// - [`PubChemError::NoUsableIdentifier`] — the identifier has no usable field.
176    /// - [`PubChemError::Http`] — network or server error.
177    /// - [`PubChemError::RateLimitExceeded`] — PubChem returned HTTP 429.
178    pub async fn lookup(&self, id: &SubstanceIdentifier) -> Result<PubChemCompound> {
179        // Fast-path: if CID is already known, check cache first
180        if let Some(cid) = id.cid {
181            if let Some(cached) = self.cache.get(&cid).await {
182                return Ok((*cached).clone());
183            }
184        }
185
186        let (namespace, input) = Self::pick_namespace(id)
187            .ok_or(PubChemError::NoUsableIdentifier)?;
188
189        self.fetch(namespace, &input).await
190    }
191
192    /// Enrich `id` in place with PubChem data, filling any missing fields.
193    ///
194    /// On [`PubChemError::NotFound`] or [`PubChemError::NoUsableIdentifier`]
195    /// this is a silent no-op (enrichment is best-effort).
196    /// Other errors (network, parse) are propagated.
197    pub async fn enrich(&self, id: &mut SubstanceIdentifier) -> Result<()> {
198        match self.lookup(id).await {
199            Ok(compound) => {
200                compound.apply_to(id);
201                Ok(())
202            }
203            Err(HsPredictError::PubChem(PubChemError::NotFound { .. }))
204            | Err(HsPredictError::PubChem(PubChemError::NoUsableIdentifier)) => Ok(()),
205            Err(e) => Err(e),
206        }
207    }
208
209    // ── Private helpers ───────────────────────────────────────────────
210
211    /// Pick the best (namespace, input) pair for a PubChem URL.
212    fn pick_namespace(id: &SubstanceIdentifier) -> Option<(&'static str, String)> {
213        if let Some(ref cas) = id.cas {
214            return Some(("name", cas.clone()));
215        }
216        if let Some(ref key) = id.inchi_key {
217            return Some(("inchikey", key.clone()));
218        }
219        if let Some(ref inchi) = id.inchi {
220            return Some(("inchi", inchi.clone()));
221        }
222        if let Some(ref smiles) = id.smiles {
223            return Some(("smiles", smiles.clone()));
224        }
225        if let Some(ref name) = id.iupac_name {
226            return Some(("name", name.clone()));
227        }
228        None
229    }
230
231    /// Fetch compound properties from PubChem and cache the result.
232    async fn fetch(&self, namespace: &str, input: &str) -> Result<PubChemCompound> {
233        // Honour the rate limit
234        self.limiter.until_ready().await;
235
236        let url = format!(
237            "{base}/compound/{ns}/{enc}/property/{props}/JSON",
238            base = self.base_url,
239            ns   = namespace,
240            enc  = encode(input),
241            props = PROPERTIES,
242        );
243
244        let resp = self
245            .http
246            .get(&url)
247            .send()
248            .await
249            .map_err(|e| PubChemError::Http(e.to_string()))?;
250
251        match resp.status().as_u16() {
252            200 => {}
253            404 => return Err(PubChemError::NotFound { input: input.to_string() }.into()),
254            429 => return Err(PubChemError::RateLimitExceeded.into()),
255            code => {
256                return Err(PubChemError::Http(format!("HTTP {code}")).into());
257            }
258        }
259
260        let body: PugPropertyResponse = resp
261            .json()
262            .await
263            .map_err(|e| PubChemError::Parse(e.to_string()))?;
264
265        let props = body
266            .property_table
267            .properties
268            .into_iter()
269            .next()
270            .ok_or_else(|| PubChemError::NotFound { input: input.to_string() })?;
271
272        let compound = Arc::new(PubChemCompound {
273            cid: props.cid,
274            iupac_name: props.iupac_name,
275            canonical_smiles: props.canonical_smiles,
276            inchi: props.in_chi,
277            inchi_key: props.in_chi_key,
278            molecular_formula: props.molecular_formula,
279            molecular_weight: props.molecular_weight.as_deref().and_then(|s| s.parse().ok()),
280        });
281
282        // Cache by CID
283        self.cache.insert(compound.cid, Arc::clone(&compound)).await;
284
285        Ok((*compound).clone())
286    }
287}
288
289// ─── Builder ─────────────────────────────────────────────────────────────────
290
/// Builder for [`PubChemClient`].
///
/// Collects the tunable settings (rate limit, cache sizing, base URL and
/// user agent) that are consumed when the client is built.
#[derive(Debug, Clone)]
pub struct PubChemClientBuilder {
    /// Maximum number of HTTP requests per second.
    requests_per_second: u32,
    /// Maximum number of entries held by the compound cache.
    cache_capacity: u64,
    /// Time-to-live of a cached compound.
    cache_ttl: Duration,
    /// Base URL of the PubChem service.
    base_url: String,
    /// `User-Agent` header value sent with every request.
    user_agent: String,
}
299
300impl Default for PubChemClientBuilder {
301    fn default() -> Self {
302        Self {
303            requests_per_second: 5,
304            cache_capacity: 1_000,
305            cache_ttl: Duration::from_secs(24 * 3600),
306            base_url: BASE_URL.to_string(),
307            user_agent: format!(
308                "hs-predict/{} ({})",
309                env!("CARGO_PKG_VERSION"),
310                env!("CARGO_PKG_REPOSITORY")
311            ),
312        }
313    }
314}
315
316impl PubChemClientBuilder {
317    /// Maximum HTTP requests per second (default: 5 — PubChem's published limit).
318    pub fn requests_per_second(mut self, n: u32) -> Self {
319        self.requests_per_second = n.max(1);
320        self
321    }
322
323    /// In-memory cache capacity in number of entries (default: 1 000).
324    pub fn cache_capacity(mut self, n: u64) -> Self {
325        self.cache_capacity = n;
326        self
327    }
328
329    /// Cache TTL (default: 24 hours).
330    pub fn cache_ttl(mut self, ttl: Duration) -> Self {
331        self.cache_ttl = ttl;
332        self
333    }
334
335    /// Override the PubChem base URL (useful for testing against a local mock server).
336    pub fn base_url(mut self, url: impl Into<String>) -> Self {
337        self.base_url = url.into();
338        self
339    }
340
341    /// Build the [`PubChemClient`].
342    pub fn build(self) -> PubChemClient {
343        let quota = Quota::per_second(
344            NonZeroU32::new(self.requests_per_second)
345                .expect("requests_per_second must be ≥ 1"),
346        );
347
348        PubChemClient {
349            http: reqwest::Client::builder()
350                .user_agent(self.user_agent)
351                .build()
352                .expect("failed to build reqwest::Client"),
353            cache: Cache::builder()
354                .max_capacity(self.cache_capacity)
355                .time_to_live(self.cache_ttl)
356                .build(),
357            limiter: Arc::new(RateLimiter::direct(quota)),
358            base_url: self.base_url,
359        }
360    }
361}
362
363// ─── PubChem JSON response types (private) ───────────────────────────────────
364
365#[derive(Deserialize)]
366struct PugPropertyResponse {
367    #[serde(rename = "PropertyTable")]
368    property_table: PropertyTable,
369}
370
371#[derive(Deserialize)]
372struct PropertyTable {
373    #[serde(rename = "Properties")]
374    properties: Vec<CompoundProperty>,
375}
376
377#[derive(Deserialize)]
378struct CompoundProperty {
379    #[serde(rename = "CID")]
380    cid: u64,
381    #[serde(rename = "IUPACName")]
382    iupac_name: Option<String>,
383    #[serde(rename = "CanonicalSMILES")]
384    canonical_smiles: Option<String>,
385    #[serde(rename = "InChI")]
386    in_chi: Option<String>,
387    #[serde(rename = "InChIKey")]
388    in_chi_key: Option<String>,
389    #[serde(rename = "MolecularFormula")]
390    molecular_formula: Option<String>,
391    /// PubChem returns molecular weight as a string (e.g. `"39.997"`).
392    #[serde(rename = "MolecularWeight")]
393    molecular_weight: Option<String>,
394}
395
396// ─── Tests ───────────────────────────────────────────────────────────────────
397
#[cfg(test)]
mod tests {
    use super::*;

    // Sodium hydroxide reference data shared by several tests.
    const NAOH_CAS: &str = "1310-73-2";
    const NAOH_CID: u64 = 14798;
    const NAOH_SMILES: &str = "[Na+].[OH-]";
    const NAOH_NAME: &str = "sodium hydroxide";
    const NAOH_INCHI_KEY: &str = "HEMHJVSKTPXQMS-UHFFFAOYSA-M";

    /// A default-built client targets the public PubChem endpoint.
    #[test]
    fn client_builds_with_defaults() {
        assert_eq!(PubChemClient::new().base_url, BASE_URL);
    }

    /// The builder's base URL override reaches the built client.
    #[test]
    fn builder_overrides_base_url() {
        let mock_url = "http://localhost:8080";
        let client = PubChemClient::builder().base_url(mock_url).build();
        assert_eq!(client.base_url, "http://localhost:8080");
    }

    /// A CAS number wins over SMILES and is sent through the `name` namespace.
    #[test]
    fn pick_namespace_cas_first() {
        let id = SubstanceIdentifier {
            cas: Some(NAOH_CAS.to_string()),
            smiles: Some(NAOH_SMILES.to_string()),
            ..Default::default()
        };
        assert_eq!(
            PubChemClient::pick_namespace(&id),
            Some(("name", "1310-73-2".to_string()))
        );
    }

    /// Without a CAS number the InChIKey is selected.
    #[test]
    fn pick_namespace_inchikey_when_no_cas() {
        let id = SubstanceIdentifier {
            inchi_key: Some(NAOH_INCHI_KEY.to_string()),
            ..Default::default()
        };
        assert_eq!(
            PubChemClient::pick_namespace(&id),
            Some(("inchikey", "HEMHJVSKTPXQMS-UHFFFAOYSA-M".to_string()))
        );
    }

    /// An identifier with no fields set yields no namespace.
    #[test]
    fn pick_namespace_returns_none_for_empty_id() {
        assert!(PubChemClient::pick_namespace(&SubstanceIdentifier::default()).is_none());
    }

    /// `apply_to` sets the CID and fills gaps, but keeps values already present.
    #[test]
    fn apply_to_fills_missing_fields_only() {
        let naoh = PubChemCompound {
            cid: NAOH_CID,
            iupac_name: Some(NAOH_NAME.to_string()),
            canonical_smiles: Some(NAOH_SMILES.to_string()),
            inchi: Some("InChI=1S/Na.H2O/h;1H/q+1;/p-1".to_string()),
            inchi_key: Some(NAOH_INCHI_KEY.to_string()),
            molecular_formula: Some("HNaO".to_string()),
            molecular_weight: Some(39.997),
        };

        let mut target = SubstanceIdentifier {
            cas: Some(NAOH_CAS.to_string()),
            // Pre-existing value that must survive the merge.
            smiles: Some("existing".to_string()),
            ..Default::default()
        };

        naoh.apply_to(&mut target);

        assert_eq!(target.cid, Some(14798));
        assert_eq!(target.smiles.as_deref(), Some("existing"));
        assert_eq!(target.iupac_name.as_deref(), Some("sodium hydroxide"));
        assert_eq!(
            target.inchi_key.as_deref(),
            Some("HEMHJVSKTPXQMS-UHFFFAOYSA-M")
        );
    }

    /// Integration test against the live PubChem service.
    /// Run with `cargo test -- --ignored` (requires internet access).
    #[tokio::test]
    #[ignore = "requires internet access"]
    async fn integration_lookup_naoh_by_cas() {
        let id = SubstanceIdentifier::from_cas(NAOH_CAS);
        let found = PubChemClient::new().lookup(&id).await.unwrap();

        assert_eq!(found.cid, 14798);
        assert_eq!(found.canonical_smiles.as_deref(), Some("[Na+].[OH-]"));
        assert_eq!(found.iupac_name.as_deref(), Some("sodium hydroxide"));
    }

    /// Integration test: enriching acetone by CAS fills SMILES, CID and IUPAC name.
    #[tokio::test]
    #[ignore = "requires internet access"]
    async fn integration_enrich_fills_smiles() {
        let mut acetone = SubstanceIdentifier::from_cas("67-64-1");
        PubChemClient::new().enrich(&mut acetone).await.unwrap();

        assert!(acetone.smiles.is_some());
        assert!(acetone.cid.is_some());
        assert!(acetone.iupac_name.is_some());
    }
}