Skip to main content

hs_predict/pubchem/
mod.rs

1//! PubChem REST API client.
2//!
3//! Requires the **`pubchem`** Cargo feature.
4//!
5//! # Purpose
6//!
7//! Enriches a [`SubstanceIdentifier`] with structural data fetched from the
8//! [PubChem PUG REST API](https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest):
9//! - CAS number → SMILES, InChI, InChIKey, IUPAC name, CID
10//! - IUPAC name → SMILES, CID, …
11//! - SMILES / InChIKey / InChI → CID + remaining fields
12//!
13//! # Usage
14//!
15//! ```rust,no_run
16//! # #[cfg(feature = "pubchem")]
17//! # async fn example() -> hs_predict::Result<()> {
18//! use hs_predict::pipeline::HsPipeline;
19//! use hs_predict::pubchem::PubChemClient;
20//! use hs_predict::types::{ProductDescription, SubstanceIdentifier};
21//!
22//! let pipeline = HsPipeline::new()
23//!     .with_pubchem(PubChemClient::new());
24//!
25//! let mut product = ProductDescription {
26//!     identifier: SubstanceIdentifier::from_cas("1310-73-2"),
27//!     physical_form: None,
28//!     purity_pct: None,
29//!     purity_type: None,
30//!     mixture_components: None,
31//!     intended_use: None,
32//!     additional_context: None,
33//! };
34//!
35//! // Enrich: CAS 1310-73-2 → SMILES "[Na+].[OH-]", IUPAC "sodium hydroxide", …
36//! pipeline.enrich(&mut product).await?;
37//!
38//! // Classify as normal (SMILES now available → better matching)
39//! let prediction = pipeline.classify(&product)?;
40//! println!("{}", prediction.display());
41//! # Ok(())
42//! # }
43//! ```
44//!
45//! # Rate limiting
46//!
47//! PubChem allows up to **5 requests / second** without an API key.
48//! [`PubChemClient`] enforces this automatically via an internal token-bucket
49//! rate limiter ([`governor`]).
50//!
51//! # Caching
52//!
53//! Responses are cached by PubChem CID using [`moka`] with a 24-hour TTL and
54//! a 1 000-entry capacity, so a compound fetched via different identifiers
55//! occupies one entry; the cache is only read when the CID is already known.
56
57mod error;
58
59pub use error::PubChemError;
60
61use std::num::NonZeroU32;
62use std::sync::Arc;
63use std::time::Duration;
64
65use governor::{DefaultDirectRateLimiter, Quota, RateLimiter};
66use moka::future::Cache;
67use serde::Deserialize;
68use urlencoding::encode;
69
70use crate::error::{HsPredictError, Result};
71use crate::types::SubstanceIdentifier;
72
73// ─── PubChem API constants ────────────────────────────────────────────────────
74
/// Default base URL of the PubChem PUG REST API (used unless the builder overrides it).
const BASE_URL: &str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug";

/// PubChem property names requested on every lookup, joined with commas
/// exactly as they are placed into the request URL.
const PROPERTIES: &str = concat!(
    "IUPACName,",
    "CanonicalSMILES,",
    "InChIKey,",
    "InChI,",
    "MolecularFormula,",
    "MolecularWeight",
);
80
81// ─── Result type ─────────────────────────────────────────────────────────────
82
/// Compound data returned from a successful PubChem lookup.
///
/// Only [`cid`](Self::cid) is mandatory; every other property is an `Option`
/// and is `None` when the response did not carry it.
///
/// Implements `PartialEq` so two lookups can be compared directly (for
/// example in tests). `Eq` is not derived because the molecular weight is an
/// `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct PubChemCompound {
    /// PubChem Compound ID.
    pub cid: u64,
    /// Preferred IUPAC name as assigned by PubChem.
    pub iupac_name: Option<String>,
    /// Canonical SMILES string.
    pub canonical_smiles: Option<String>,
    /// Standard InChI string.
    pub inchi: Option<String>,
    /// 27-character InChIKey.
    pub inchi_key: Option<String>,
    /// Hill-notation molecular formula.
    pub molecular_formula: Option<String>,
    /// Molecular weight in g/mol.
    pub molecular_weight: Option<f64>,
}
101
102impl PubChemCompound {
103    /// Copy fields from this compound into `id`, filling only the **missing** fields.
104    ///
105    /// The CID is always set. Other fields are only written if the identifier
106    /// field is currently `None`.
107    pub fn apply_to(&self, id: &mut SubstanceIdentifier) {
108        id.cid = Some(self.cid);
109        if id.smiles.is_none() {
110            id.smiles = self.canonical_smiles.clone();
111        }
112        if id.iupac_name.is_none() {
113            id.iupac_name = self.iupac_name.clone();
114        }
115        if id.inchi.is_none() {
116            id.inchi = self.inchi.clone();
117        }
118        if id.inchi_key.is_none() {
119            id.inchi_key = self.inchi_key.clone();
120        }
121    }
122}
123
124// ─── Client ──────────────────────────────────────────────────────────────────
125
126/// PubChem REST API client with built-in rate limiting and in-memory caching.
127///
128/// Cheap to clone — all internal state is reference-counted.
129#[derive(Clone)]
130pub struct PubChemClient {
131    http: reqwest::Client,
132    /// CID → compound (24 h TTL, capacity 1 000).
133    cache: Cache<u64, Arc<PubChemCompound>>,
134    limiter: Arc<DefaultDirectRateLimiter>,
135    /// Configurable base URL (override for testing).
136    base_url: String,
137}
138
139impl std::fmt::Debug for PubChemClient {
140    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
141        f.debug_struct("PubChemClient")
142            .field("base_url", &self.base_url)
143            .finish_non_exhaustive()
144    }
145}
146
147impl Default for PubChemClient {
148    fn default() -> Self {
149        Self::new()
150    }
151}
152
153impl PubChemClient {
154    /// Create a client with PubChem's default rate limit (5 req/s).
155    ///
156    /// # Panics
157    /// Panics if the TLS backend cannot be initialised (extremely rare;
158    /// indicates a broken system environment).
159    pub fn new() -> Self {
160        Self::builder().build()
161    }
162
163    /// Start building a customised client.
164    pub fn builder() -> PubChemClientBuilder {
165        PubChemClientBuilder::default()
166    }
167
168    // ── Core lookup ──────────────────────────────────────────────────
169
170    /// Look up a compound by the best available identifier.
171    ///
172    /// Priority: CAS number → InChIKey → InChI → SMILES → IUPAC name.
173    ///
174    /// Results are cached by CID, so repeated calls for the same compound
175    /// are free after the first network request.
176    ///
177    /// # Errors
178    /// - [`PubChemError::NotFound`] — no compound matched.
179    /// - [`PubChemError::NoUsableIdentifier`] — the identifier has no usable field.
180    /// - [`PubChemError::Http`] — network or server error.
181    /// - [`PubChemError::RateLimitExceeded`] — PubChem returned HTTP 429.
182    pub async fn lookup(&self, id: &SubstanceIdentifier) -> Result<PubChemCompound> {
183        // Fast-path: if CID is already known, check cache first
184        if let Some(cid) = id.cid {
185            if let Some(cached) = self.cache.get(&cid).await {
186                return Ok((*cached).clone());
187            }
188        }
189
190        let (namespace, input) = Self::pick_namespace(id)
191            .ok_or(PubChemError::NoUsableIdentifier)?;
192
193        self.fetch(namespace, &input).await
194    }
195
196    /// Enrich `id` in place with PubChem data, filling any missing fields.
197    ///
198    /// On [`PubChemError::NotFound`] or [`PubChemError::NoUsableIdentifier`]
199    /// this is a silent no-op (enrichment is best-effort).
200    /// Other errors (network, parse) are propagated.
201    pub async fn enrich(&self, id: &mut SubstanceIdentifier) -> Result<()> {
202        match self.lookup(id).await {
203            Ok(compound) => {
204                compound.apply_to(id);
205                Ok(())
206            }
207            Err(HsPredictError::PubChem(PubChemError::NotFound { .. }))
208            | Err(HsPredictError::PubChem(PubChemError::NoUsableIdentifier)) => Ok(()),
209            Err(e) => Err(e),
210        }
211    }
212
213    // ── Private helpers ───────────────────────────────────────────────
214
215    /// Pick the best (namespace, input) pair for a PubChem URL.
216    fn pick_namespace(id: &SubstanceIdentifier) -> Option<(&'static str, String)> {
217        if let Some(ref cas) = id.cas {
218            return Some(("name", cas.clone()));
219        }
220        if let Some(ref key) = id.inchi_key {
221            return Some(("inchikey", key.clone()));
222        }
223        if let Some(ref inchi) = id.inchi {
224            return Some(("inchi", inchi.clone()));
225        }
226        if let Some(ref smiles) = id.smiles {
227            return Some(("smiles", smiles.clone()));
228        }
229        if let Some(ref name) = id.iupac_name {
230            return Some(("name", name.clone()));
231        }
232        None
233    }
234
235    /// Fetch compound properties from PubChem and cache the result.
236    async fn fetch(&self, namespace: &str, input: &str) -> Result<PubChemCompound> {
237        // Honour the rate limit
238        self.limiter.until_ready().await;
239
240        let url = format!(
241            "{base}/compound/{ns}/{enc}/property/{props}/JSON",
242            base = self.base_url,
243            ns   = namespace,
244            enc  = encode(input),
245            props = PROPERTIES,
246        );
247
248        let resp = self
249            .http
250            .get(&url)
251            .send()
252            .await
253            .map_err(|e| PubChemError::Http(e.to_string()))?;
254
255        match resp.status().as_u16() {
256            200 => {}
257            404 => return Err(PubChemError::NotFound { input: input.to_string() }.into()),
258            429 => return Err(PubChemError::RateLimitExceeded.into()),
259            code => {
260                return Err(PubChemError::Http(format!("HTTP {code}")).into());
261            }
262        }
263
264        let body: PugPropertyResponse = resp
265            .json()
266            .await
267            .map_err(|e| PubChemError::Parse(e.to_string()))?;
268
269        let props = body
270            .property_table
271            .properties
272            .into_iter()
273            .next()
274            .ok_or_else(|| PubChemError::NotFound { input: input.to_string() })?;
275
276        let compound = Arc::new(PubChemCompound {
277            cid: props.cid,
278            iupac_name: props.iupac_name,
279            canonical_smiles: props.canonical_smiles,
280            inchi: props.in_chi,
281            inchi_key: props.in_chi_key,
282            molecular_formula: props.molecular_formula,
283            molecular_weight: props.molecular_weight.as_deref().and_then(|s| s.parse().ok()),
284        });
285
286        // Cache by CID
287        self.cache.insert(compound.cid, Arc::clone(&compound)).await;
288
289        Ok((*compound).clone())
290    }
291}
292
293// ─── Builder ─────────────────────────────────────────────────────────────────
294
/// Builder for [`PubChemClient`].
///
/// Implements `Debug` and `Clone` so a configuration can be logged or reused
/// to build several clients.
#[derive(Debug, Clone)]
pub struct PubChemClientBuilder {
    /// Maximum number of requests per second handed to the rate limiter.
    requests_per_second: u32,
    /// Maximum number of entries kept in the compound cache.
    cache_capacity: u64,
    /// Time-to-live of each cache entry.
    cache_ttl: Duration,
    /// Base URL that request paths are appended to.
    base_url: String,
    /// Value passed to the HTTP client as its user agent.
    user_agent: String,
}
303
304impl Default for PubChemClientBuilder {
305    fn default() -> Self {
306        Self {
307            requests_per_second: 5,
308            cache_capacity: 1_000,
309            cache_ttl: Duration::from_secs(24 * 3600),
310            base_url: BASE_URL.to_string(),
311            user_agent: format!(
312                "hs-predict/{} ({})",
313                env!("CARGO_PKG_VERSION"),
314                env!("CARGO_PKG_REPOSITORY")
315            ),
316        }
317    }
318}
319
320impl PubChemClientBuilder {
321    /// Maximum HTTP requests per second (default: 5 — PubChem's published limit).
322    pub fn requests_per_second(mut self, n: u32) -> Self {
323        self.requests_per_second = n.max(1);
324        self
325    }
326
327    /// In-memory cache capacity in number of entries (default: 1 000).
328    pub fn cache_capacity(mut self, n: u64) -> Self {
329        self.cache_capacity = n;
330        self
331    }
332
333    /// Cache TTL (default: 24 hours).
334    pub fn cache_ttl(mut self, ttl: Duration) -> Self {
335        self.cache_ttl = ttl;
336        self
337    }
338
339    /// Override the PubChem base URL (useful for testing against a local mock server).
340    pub fn base_url(mut self, url: impl Into<String>) -> Self {
341        self.base_url = url.into();
342        self
343    }
344
345    /// Build the [`PubChemClient`].
346    ///
347    /// # Panics
348    /// Panics if the TLS backend cannot be initialised (extremely rare;
349    /// indicates a broken system environment).  For an infallible path in
350    /// constrained environments use [`try_build`](Self::try_build).
351    pub fn build(self) -> PubChemClient {
352        self.try_build()
353            .expect("failed to build PubChemClient — TLS backend unavailable")
354    }
355
356    /// Build the [`PubChemClient`], returning an error instead of panicking if
357    /// the underlying HTTP client cannot be initialised (e.g. TLS failure).
358    ///
359    /// Prefer this over [`build`](Self::build) in long-running servers and WASM
360    /// environments where a panic is unacceptable.
361    pub fn try_build(self) -> Result<PubChemClient> {
362        // `requests_per_second` is always ≥ 1 because the setter clamps with
363        // `.max(1)` and the default is 5, so `NonZeroU32::new` never returns
364        // `None` here.
365        let rps = NonZeroU32::new(self.requests_per_second.max(1))
366            .expect("max(1) guarantees non-zero");
367        let quota = Quota::per_second(rps);
368
369        let http = reqwest::Client::builder()
370            .user_agent(self.user_agent)
371            .build()
372            .map_err(|e| HsPredictError::Http(format!("failed to build HTTP client: {e}")))?;
373
374        Ok(PubChemClient {
375            http,
376            cache: Cache::builder()
377                .max_capacity(self.cache_capacity)
378                .time_to_live(self.cache_ttl)
379                .build(),
380            limiter: Arc::new(RateLimiter::direct(quota)),
381            base_url: self.base_url,
382        })
383    }
384}
385
386// ─── PubChem JSON response types (private) ───────────────────────────────────
387
388#[derive(Deserialize)]
389struct PugPropertyResponse {
390    #[serde(rename = "PropertyTable")]
391    property_table: PropertyTable,
392}
393
394#[derive(Deserialize)]
395struct PropertyTable {
396    #[serde(rename = "Properties")]
397    properties: Vec<CompoundProperty>,
398}
399
400#[derive(Deserialize)]
401struct CompoundProperty {
402    #[serde(rename = "CID")]
403    cid: u64,
404    #[serde(rename = "IUPACName")]
405    iupac_name: Option<String>,
406    #[serde(rename = "CanonicalSMILES")]
407    canonical_smiles: Option<String>,
408    #[serde(rename = "InChI")]
409    in_chi: Option<String>,
410    #[serde(rename = "InChIKey")]
411    in_chi_key: Option<String>,
412    #[serde(rename = "MolecularFormula")]
413    molecular_formula: Option<String>,
414    /// PubChem returns molecular weight as a string (e.g. `"39.997"`).
415    #[serde(rename = "MolecularWeight")]
416    molecular_weight: Option<String>,
417}
418
419// ─── Tests ───────────────────────────────────────────────────────────────────
420
#[cfg(test)]
mod tests {
    use super::*;

    /// CAS number and InChIKey of sodium hydroxide, shared by several tests.
    const NAOH_CAS: &str = "1310-73-2";
    const NAOH_INCHI_KEY: &str = "HEMHJVSKTPXQMS-UHFFFAOYSA-M";

    /// A default client targets the public PubChem endpoint.
    #[test]
    fn client_builds_with_defaults() {
        assert_eq!(PubChemClient::new().base_url, BASE_URL);
    }

    /// The builder's `base_url` setter is reflected in the built client.
    #[test]
    fn builder_overrides_base_url() {
        let mock_url = "http://localhost:8080";
        let client = PubChemClient::builder().base_url(mock_url).build();
        assert_eq!(client.base_url, "http://localhost:8080");
    }

    /// A CAS number wins over SMILES and is sent through the `name` namespace.
    #[test]
    fn pick_namespace_cas_first() {
        let mut id = SubstanceIdentifier::default();
        id.cas = Some(NAOH_CAS.to_string());
        id.smiles = Some("[Na+].[OH-]".to_string());

        let (ns, inp) = PubChemClient::pick_namespace(&id).unwrap();
        assert_eq!(ns, "name");
        assert_eq!(inp, "1310-73-2");
    }

    /// Without a CAS number the InChIKey is the preferred identifier.
    #[test]
    fn pick_namespace_inchikey_when_no_cas() {
        let mut id = SubstanceIdentifier::default();
        id.inchi_key = Some(NAOH_INCHI_KEY.to_string());

        let (ns, inp) = PubChemClient::pick_namespace(&id).unwrap();
        assert_eq!(ns, "inchikey");
        assert_eq!(inp, "HEMHJVSKTPXQMS-UHFFFAOYSA-M");
    }

    /// An identifier with no fields set yields no namespace at all.
    #[test]
    fn pick_namespace_returns_none_for_empty_id() {
        let empty = SubstanceIdentifier::default();
        assert!(PubChemClient::pick_namespace(&empty).is_none());
    }

    /// `apply_to` always sets the CID but never overwrites existing fields.
    #[test]
    fn apply_to_fills_missing_fields_only() {
        let naoh = PubChemCompound {
            cid: 14798,
            iupac_name: Some("sodium hydroxide".to_string()),
            canonical_smiles: Some("[Na+].[OH-]".to_string()),
            inchi: Some("InChI=1S/Na.H2O/h;1H/q+1;/p-1".to_string()),
            inchi_key: Some(NAOH_INCHI_KEY.to_string()),
            molecular_formula: Some("HNaO".to_string()),
            molecular_weight: Some(39.997),
        };

        let mut target = SubstanceIdentifier::default();
        target.cas = Some(NAOH_CAS.to_string());
        target.smiles = Some("existing".to_string()); // must survive the merge

        naoh.apply_to(&mut target);

        assert_eq!(target.cid, Some(14798));
        assert_eq!(target.smiles.as_deref(), Some("existing")); // preserved
        assert_eq!(target.iupac_name.as_deref(), Some("sodium hydroxide")); // filled
        assert_eq!(target.inchi_key.as_deref(), Some("HEMHJVSKTPXQMS-UHFFFAOYSA-M")); // filled
    }

    /// Integration test: real PubChem network call.
    /// Run with `cargo test -- --ignored` (requires internet access).
    #[tokio::test]
    #[ignore = "requires internet access"]
    async fn integration_lookup_naoh_by_cas() {
        let id = SubstanceIdentifier::from_cas("1310-73-2");
        let found = PubChemClient::new().lookup(&id).await.unwrap();

        assert_eq!(found.cid, 14798);
        assert_eq!(found.canonical_smiles.as_deref(), Some("[Na+].[OH-]"));
        assert_eq!(found.iupac_name.as_deref(), Some("sodium hydroxide"));
    }

    /// Integration test: enriching acetone by CAS fills SMILES, CID and name.
    /// Run with `cargo test -- --ignored` (requires internet access).
    #[tokio::test]
    #[ignore = "requires internet access"]
    async fn integration_enrich_fills_smiles() {
        let mut acetone = SubstanceIdentifier::from_cas("67-64-1");
        PubChemClient::new().enrich(&mut acetone).await.unwrap();

        assert!(acetone.smiles.is_some());
        assert!(acetone.cid.is_some());
        assert!(acetone.iupac_name.is_some());
    }
}
525}