1mod error;
58
59pub use error::PubChemError;
60
61use std::num::NonZeroU32;
62use std::sync::Arc;
63use std::time::Duration;
64
65use governor::{DefaultDirectRateLimiter, Quota, RateLimiter};
66use moka::future::Cache;
67use serde::Deserialize;
68use urlencoding::encode;
69
70use crate::error::{HsPredictError, Result};
71use crate::types::SubstanceIdentifier;
72
/// Root URL of the PubChem PUG REST service; used as the default `base_url`
/// of a client unless the builder overrides it.
const BASE_URL: &str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug";

/// Comma-separated list of compound properties requested in every lookup.
/// It is spliced verbatim into the `/property/{props}/JSON` URL segment.
const PROPERTIES: &str =
    "IUPACName,CanonicalSMILES,InChIKey,InChI,MolecularFormula,MolecularWeight";
80
/// A single compound record as returned by a PubChem property lookup.
///
/// Every property except the CID is optional because the service response
/// may omit it. `PartialEq` is derived so records can be compared directly
/// (for example in tests or when de-duplicating results); `Eq` is not derived
/// because `molecular_weight` is a floating-point value.
#[derive(Debug, Clone, PartialEq)]
pub struct PubChemCompound {
    /// PubChem compound identifier (CID).
    pub cid: u64,
    /// IUPAC name, when reported.
    pub iupac_name: Option<String>,
    /// Canonical SMILES string, when reported.
    pub canonical_smiles: Option<String>,
    /// InChI string, when reported.
    pub inchi: Option<String>,
    /// InChIKey, when reported.
    pub inchi_key: Option<String>,
    /// Molecular formula, when reported.
    pub molecular_formula: Option<String>,
    /// Molecular weight, when reported and parseable as a number.
    pub molecular_weight: Option<f64>,
}
101
102impl PubChemCompound {
103 pub fn apply_to(&self, id: &mut SubstanceIdentifier) {
108 id.cid = Some(self.cid);
109 if id.smiles.is_none() {
110 id.smiles = self.canonical_smiles.clone();
111 }
112 if id.iupac_name.is_none() {
113 id.iupac_name = self.iupac_name.clone();
114 }
115 if id.inchi.is_none() {
116 id.inchi = self.inchi.clone();
117 }
118 if id.inchi_key.is_none() {
119 id.inchi_key = self.inchi_key.clone();
120 }
121 }
122}
123
124#[derive(Clone)]
130pub struct PubChemClient {
131 http: reqwest::Client,
132 cache: Cache<u64, Arc<PubChemCompound>>,
134 limiter: Arc<DefaultDirectRateLimiter>,
135 base_url: String,
137}
138
139impl std::fmt::Debug for PubChemClient {
140 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
141 f.debug_struct("PubChemClient")
142 .field("base_url", &self.base_url)
143 .finish_non_exhaustive()
144 }
145}
146
147impl Default for PubChemClient {
148 fn default() -> Self {
149 Self::new()
150 }
151}
152
153impl PubChemClient {
154 pub fn new() -> Self {
160 Self::builder().build()
161 }
162
163 pub fn builder() -> PubChemClientBuilder {
165 PubChemClientBuilder::default()
166 }
167
168 pub async fn lookup(&self, id: &SubstanceIdentifier) -> Result<PubChemCompound> {
183 if let Some(cid) = id.cid {
185 if let Some(cached) = self.cache.get(&cid).await {
186 return Ok((*cached).clone());
187 }
188 }
189
190 let (namespace, input) = Self::pick_namespace(id)
191 .ok_or(PubChemError::NoUsableIdentifier)?;
192
193 self.fetch(namespace, &input).await
194 }
195
196 pub async fn enrich(&self, id: &mut SubstanceIdentifier) -> Result<()> {
202 match self.lookup(id).await {
203 Ok(compound) => {
204 compound.apply_to(id);
205 Ok(())
206 }
207 Err(HsPredictError::PubChem(PubChemError::NotFound { .. }))
208 | Err(HsPredictError::PubChem(PubChemError::NoUsableIdentifier)) => Ok(()),
209 Err(e) => Err(e),
210 }
211 }
212
213 fn pick_namespace(id: &SubstanceIdentifier) -> Option<(&'static str, String)> {
217 if let Some(ref cas) = id.cas {
218 return Some(("name", cas.clone()));
219 }
220 if let Some(ref key) = id.inchi_key {
221 return Some(("inchikey", key.clone()));
222 }
223 if let Some(ref inchi) = id.inchi {
224 return Some(("inchi", inchi.clone()));
225 }
226 if let Some(ref smiles) = id.smiles {
227 return Some(("smiles", smiles.clone()));
228 }
229 if let Some(ref name) = id.iupac_name {
230 return Some(("name", name.clone()));
231 }
232 None
233 }
234
235 async fn fetch(&self, namespace: &str, input: &str) -> Result<PubChemCompound> {
237 self.limiter.until_ready().await;
239
240 let url = format!(
241 "{base}/compound/{ns}/{enc}/property/{props}/JSON",
242 base = self.base_url,
243 ns = namespace,
244 enc = encode(input),
245 props = PROPERTIES,
246 );
247
248 let resp = self
249 .http
250 .get(&url)
251 .send()
252 .await
253 .map_err(|e| PubChemError::Http(e.to_string()))?;
254
255 match resp.status().as_u16() {
256 200 => {}
257 404 => return Err(PubChemError::NotFound { input: input.to_string() }.into()),
258 429 => return Err(PubChemError::RateLimitExceeded.into()),
259 code => {
260 return Err(PubChemError::Http(format!("HTTP {code}")).into());
261 }
262 }
263
264 let body: PugPropertyResponse = resp
265 .json()
266 .await
267 .map_err(|e| PubChemError::Parse(e.to_string()))?;
268
269 let props = body
270 .property_table
271 .properties
272 .into_iter()
273 .next()
274 .ok_or_else(|| PubChemError::NotFound { input: input.to_string() })?;
275
276 let compound = Arc::new(PubChemCompound {
277 cid: props.cid,
278 iupac_name: props.iupac_name,
279 canonical_smiles: props.canonical_smiles,
280 inchi: props.in_chi,
281 inchi_key: props.in_chi_key,
282 molecular_formula: props.molecular_formula,
283 molecular_weight: props.molecular_weight.as_deref().and_then(|s| s.parse().ok()),
284 });
285
286 self.cache.insert(compound.cid, Arc::clone(&compound)).await;
288
289 Ok((*compound).clone())
290 }
291}
292
/// Configuration builder for [`PubChemClient`].
///
/// Obtain one via [`PubChemClient::builder`] (or `Default`), adjust it with
/// the chained setters, and finish with `build` or `try_build`.
#[derive(Debug, Clone)]
pub struct PubChemClientBuilder {
    /// Maximum outgoing requests per second (default 5, never below 1).
    requests_per_second: u32,
    /// Maximum number of cached compounds (default 1000).
    cache_capacity: u64,
    /// How long a cached compound stays valid (default 24 hours).
    cache_ttl: Duration,
    /// Base URL that request paths are appended to.
    base_url: String,
    /// Value of the `User-Agent` header sent with every request.
    user_agent: String,
}
303
304impl Default for PubChemClientBuilder {
305 fn default() -> Self {
306 Self {
307 requests_per_second: 5,
308 cache_capacity: 1_000,
309 cache_ttl: Duration::from_secs(24 * 3600),
310 base_url: BASE_URL.to_string(),
311 user_agent: format!(
312 "hs-predict/{} ({})",
313 env!("CARGO_PKG_VERSION"),
314 env!("CARGO_PKG_REPOSITORY")
315 ),
316 }
317 }
318}
319
320impl PubChemClientBuilder {
321 pub fn requests_per_second(mut self, n: u32) -> Self {
323 self.requests_per_second = n.max(1);
324 self
325 }
326
327 pub fn cache_capacity(mut self, n: u64) -> Self {
329 self.cache_capacity = n;
330 self
331 }
332
333 pub fn cache_ttl(mut self, ttl: Duration) -> Self {
335 self.cache_ttl = ttl;
336 self
337 }
338
339 pub fn base_url(mut self, url: impl Into<String>) -> Self {
341 self.base_url = url.into();
342 self
343 }
344
345 pub fn build(self) -> PubChemClient {
352 self.try_build()
353 .expect("failed to build PubChemClient — TLS backend unavailable")
354 }
355
356 pub fn try_build(self) -> Result<PubChemClient> {
362 let rps = NonZeroU32::new(self.requests_per_second.max(1))
366 .expect("max(1) guarantees non-zero");
367 let quota = Quota::per_second(rps);
368
369 let http = reqwest::Client::builder()
370 .user_agent(self.user_agent)
371 .build()
372 .map_err(|e| HsPredictError::Http(format!("failed to build HTTP client: {e}")))?;
373
374 Ok(PubChemClient {
375 http,
376 cache: Cache::builder()
377 .max_capacity(self.cache_capacity)
378 .time_to_live(self.cache_ttl)
379 .build(),
380 limiter: Arc::new(RateLimiter::direct(quota)),
381 base_url: self.base_url,
382 })
383 }
384}
385
386#[derive(Deserialize)]
389struct PugPropertyResponse {
390 #[serde(rename = "PropertyTable")]
391 property_table: PropertyTable,
392}
393
394#[derive(Deserialize)]
395struct PropertyTable {
396 #[serde(rename = "Properties")]
397 properties: Vec<CompoundProperty>,
398}
399
400#[derive(Deserialize)]
401struct CompoundProperty {
402 #[serde(rename = "CID")]
403 cid: u64,
404 #[serde(rename = "IUPACName")]
405 iupac_name: Option<String>,
406 #[serde(rename = "CanonicalSMILES")]
407 canonical_smiles: Option<String>,
408 #[serde(rename = "InChI")]
409 in_chi: Option<String>,
410 #[serde(rename = "InChIKey")]
411 in_chi_key: Option<String>,
412 #[serde(rename = "MolecularFormula")]
413 molecular_formula: Option<String>,
414 #[serde(rename = "MolecularWeight")]
416 molecular_weight: Option<String>,
417}
418
#[cfg(test)]
mod tests {
    use super::*;

    // ── Construction ─────────────────────────────────────────────────────

    #[test]
    fn client_builds_with_defaults() {
        let default_client = PubChemClient::new();
        assert_eq!(default_client.base_url, BASE_URL);
    }

    #[test]
    fn builder_overrides_base_url() {
        let builder = PubChemClient::builder().base_url("http://localhost:8080");
        let local_client = builder.build();
        assert_eq!(local_client.base_url, "http://localhost:8080");
    }

    // ── Namespace selection ──────────────────────────────────────────────

    #[test]
    fn pick_namespace_cas_first() {
        // CAS must win even though a SMILES string is also available.
        let substance = SubstanceIdentifier {
            cas: Some("1310-73-2".to_string()),
            smiles: Some("[Na+].[OH-]".to_string()),
            ..Default::default()
        };
        let (namespace, value) = PubChemClient::pick_namespace(&substance).unwrap();
        assert_eq!(namespace, "name");
        assert_eq!(value, "1310-73-2");
    }

    #[test]
    fn pick_namespace_inchikey_when_no_cas() {
        let substance = SubstanceIdentifier {
            inchi_key: Some("HEMHJVSKTPXQMS-UHFFFAOYSA-M".to_string()),
            ..Default::default()
        };
        let (namespace, value) = PubChemClient::pick_namespace(&substance).unwrap();
        assert_eq!(namespace, "inchikey");
        assert_eq!(value, "HEMHJVSKTPXQMS-UHFFFAOYSA-M");
    }

    #[test]
    fn pick_namespace_returns_none_for_empty_id() {
        let empty = SubstanceIdentifier::default();
        assert!(PubChemClient::pick_namespace(&empty).is_none());
    }

    // ── Merging a compound into an identifier ────────────────────────────

    #[test]
    fn apply_to_fills_missing_fields_only() {
        let naoh = PubChemCompound {
            cid: 14798,
            iupac_name: Some("sodium hydroxide".to_string()),
            canonical_smiles: Some("[Na+].[OH-]".to_string()),
            inchi: Some("InChI=1S/Na.H2O/h;1H/q+1;/p-1".to_string()),
            inchi_key: Some("HEMHJVSKTPXQMS-UHFFFAOYSA-M".to_string()),
            molecular_formula: Some("HNaO".to_string()),
            molecular_weight: Some(39.997),
        };

        let mut target = SubstanceIdentifier {
            cas: Some("1310-73-2".to_string()),
            // Pre-existing value that must survive the merge.
            smiles: Some("existing".to_string()),
            ..Default::default()
        };

        naoh.apply_to(&mut target);

        assert_eq!(target.cid, Some(14798));
        // Already present: not overwritten.
        assert_eq!(target.smiles.as_deref(), Some("existing"));
        // Previously absent: filled from the compound.
        assert_eq!(target.iupac_name.as_deref(), Some("sodium hydroxide"));
        assert_eq!(
            target.inchi_key.as_deref(),
            Some("HEMHJVSKTPXQMS-UHFFFAOYSA-M")
        );
    }

    // ── Live-service tests (opt-in) ──────────────────────────────────────

    #[tokio::test]
    #[ignore = "requires internet access"]
    async fn integration_lookup_naoh_by_cas() {
        let live_client = PubChemClient::new();
        let naoh_id = SubstanceIdentifier::from_cas("1310-73-2");
        let found = live_client.lookup(&naoh_id).await.unwrap();

        assert_eq!(found.cid, 14798);
        assert_eq!(found.canonical_smiles.as_deref(), Some("[Na+].[OH-]"));
        assert_eq!(found.iupac_name.as_deref(), Some("sodium hydroxide"));
    }

    #[tokio::test]
    #[ignore = "requires internet access"]
    async fn integration_enrich_fills_smiles() {
        let live_client = PubChemClient::new();
        let mut acetone_id = SubstanceIdentifier::from_cas("67-64-1");
        live_client.enrich(&mut acetone_id).await.unwrap();

        assert!(acetone_id.smiles.is_some());
        assert!(acetone_id.cid.is_some());
        assert!(acetone_id.iupac_name.is_some());
    }
}