1mod error;
58
59pub use error::PubChemError;
60
61use std::num::NonZeroU32;
62use std::sync::Arc;
63use std::time::Duration;
64
65use governor::{DefaultDirectRateLimiter, Quota, RateLimiter};
66use moka::future::Cache;
67use serde::Deserialize;
68use urlencoding::encode;
69
70use crate::error::{HsPredictError, Result};
71use crate::types::SubstanceIdentifier;
72
/// Root URL of the PubChem PUG REST service.
///
/// Used as the builder's default `base_url`; request paths are appended to it.
const BASE_URL: &str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug";
76
/// Comma-separated list of compound property names requested on every lookup.
///
/// The string is interpolated verbatim into the `property/<list>/JSON` segment
/// of the request URL.
const PROPERTIES: &str =
    "IUPACName,CanonicalSMILES,InChIKey,InChI,MolecularFormula,MolecularWeight";
80
/// A compound record resolved from PubChem.
///
/// Only the CID is guaranteed to be present; every other property is optional
/// and is `None` when the response did not include it.
///
/// Implements `PartialEq` so records can be compared directly (for example in
/// tests or when de-duplicating results).
#[derive(Debug, Clone, PartialEq)]
pub struct PubChemCompound {
    /// PubChem compound identifier.
    pub cid: u64,
    /// IUPAC name, if reported.
    pub iupac_name: Option<String>,
    /// Canonical SMILES string, if reported.
    pub canonical_smiles: Option<String>,
    /// InChI string, if reported.
    pub inchi: Option<String>,
    /// InChIKey, if reported.
    pub inchi_key: Option<String>,
    /// Molecular formula, if reported.
    pub molecular_formula: Option<String>,
    /// Molecular weight, if reported and parseable as a floating-point number.
    pub molecular_weight: Option<f64>,
}
101
102impl PubChemCompound {
103 pub fn apply_to(&self, id: &mut SubstanceIdentifier) {
108 id.cid = Some(self.cid);
109 if id.smiles.is_none() {
110 id.smiles = self.canonical_smiles.clone();
111 }
112 if id.iupac_name.is_none() {
113 id.iupac_name = self.iupac_name.clone();
114 }
115 if id.inchi.is_none() {
116 id.inchi = self.inchi.clone();
117 }
118 if id.inchi_key.is_none() {
119 id.inchi_key = self.inchi_key.clone();
120 }
121 }
122}
123
/// Client for looking up compound properties in PubChem.
///
/// Bundles an HTTP client, a cache of fetched compounds keyed by CID and a
/// rate limiter that is awaited before each request. The limiter is held
/// behind an `Arc`, so clones of the client refer to the same limiter.
#[derive(Clone)]
pub struct PubChemClient {
    /// HTTP client used to send requests.
    http: reqwest::Client,
    /// Previously fetched compounds, keyed by CID.
    cache: Cache<u64, Arc<PubChemCompound>>,
    /// Rate limiter awaited before every outgoing request.
    limiter: Arc<DefaultDirectRateLimiter>,
    /// Base URL that request paths are appended to.
    base_url: String,
}
138
139impl std::fmt::Debug for PubChemClient {
140 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
141 f.debug_struct("PubChemClient")
142 .field("base_url", &self.base_url)
143 .finish_non_exhaustive()
144 }
145}
146
147impl Default for PubChemClient {
148 fn default() -> Self {
149 Self::new()
150 }
151}
152
153impl PubChemClient {
154 pub fn new() -> Self {
156 Self::builder().build()
157 }
158
159 pub fn builder() -> PubChemClientBuilder {
161 PubChemClientBuilder::default()
162 }
163
164 pub async fn lookup(&self, id: &SubstanceIdentifier) -> Result<PubChemCompound> {
179 if let Some(cid) = id.cid {
181 if let Some(cached) = self.cache.get(&cid).await {
182 return Ok((*cached).clone());
183 }
184 }
185
186 let (namespace, input) = Self::pick_namespace(id)
187 .ok_or(PubChemError::NoUsableIdentifier)?;
188
189 self.fetch(namespace, &input).await
190 }
191
192 pub async fn enrich(&self, id: &mut SubstanceIdentifier) -> Result<()> {
198 match self.lookup(id).await {
199 Ok(compound) => {
200 compound.apply_to(id);
201 Ok(())
202 }
203 Err(HsPredictError::PubChem(PubChemError::NotFound { .. }))
204 | Err(HsPredictError::PubChem(PubChemError::NoUsableIdentifier)) => Ok(()),
205 Err(e) => Err(e),
206 }
207 }
208
209 fn pick_namespace(id: &SubstanceIdentifier) -> Option<(&'static str, String)> {
213 if let Some(ref cas) = id.cas {
214 return Some(("name", cas.clone()));
215 }
216 if let Some(ref key) = id.inchi_key {
217 return Some(("inchikey", key.clone()));
218 }
219 if let Some(ref inchi) = id.inchi {
220 return Some(("inchi", inchi.clone()));
221 }
222 if let Some(ref smiles) = id.smiles {
223 return Some(("smiles", smiles.clone()));
224 }
225 if let Some(ref name) = id.iupac_name {
226 return Some(("name", name.clone()));
227 }
228 None
229 }
230
231 async fn fetch(&self, namespace: &str, input: &str) -> Result<PubChemCompound> {
233 self.limiter.until_ready().await;
235
236 let url = format!(
237 "{base}/compound/{ns}/{enc}/property/{props}/JSON",
238 base = self.base_url,
239 ns = namespace,
240 enc = encode(input),
241 props = PROPERTIES,
242 );
243
244 let resp = self
245 .http
246 .get(&url)
247 .send()
248 .await
249 .map_err(|e| PubChemError::Http(e.to_string()))?;
250
251 match resp.status().as_u16() {
252 200 => {}
253 404 => return Err(PubChemError::NotFound { input: input.to_string() }.into()),
254 429 => return Err(PubChemError::RateLimitExceeded.into()),
255 code => {
256 return Err(PubChemError::Http(format!("HTTP {code}")).into());
257 }
258 }
259
260 let body: PugPropertyResponse = resp
261 .json()
262 .await
263 .map_err(|e| PubChemError::Parse(e.to_string()))?;
264
265 let props = body
266 .property_table
267 .properties
268 .into_iter()
269 .next()
270 .ok_or_else(|| PubChemError::NotFound { input: input.to_string() })?;
271
272 let compound = Arc::new(PubChemCompound {
273 cid: props.cid,
274 iupac_name: props.iupac_name,
275 canonical_smiles: props.canonical_smiles,
276 inchi: props.in_chi,
277 inchi_key: props.in_chi_key,
278 molecular_formula: props.molecular_formula,
279 molecular_weight: props.molecular_weight.as_deref().and_then(|s| s.parse().ok()),
280 });
281
282 self.cache.insert(compound.cid, Arc::clone(&compound)).await;
284
285 Ok((*compound).clone())
286 }
287}
288
/// Builder for [`PubChemClient`].
///
/// Obtain one via `PubChemClientBuilder::default()`, adjust it with the
/// chained setter methods and finish with `build()`.
#[derive(Debug, Clone)]
pub struct PubChemClientBuilder {
    /// Maximum number of requests per second; kept at 1 or above.
    requests_per_second: u32,
    /// Maximum capacity of the compound cache.
    cache_capacity: u64,
    /// Time-to-live of cached compounds.
    cache_ttl: Duration,
    /// Base URL that request paths are appended to.
    base_url: String,
    /// Value of the `User-Agent` header sent with requests.
    user_agent: String,
}
299
300impl Default for PubChemClientBuilder {
301 fn default() -> Self {
302 Self {
303 requests_per_second: 5,
304 cache_capacity: 1_000,
305 cache_ttl: Duration::from_secs(24 * 3600),
306 base_url: BASE_URL.to_string(),
307 user_agent: format!(
308 "hs-predict/{} ({})",
309 env!("CARGO_PKG_VERSION"),
310 env!("CARGO_PKG_REPOSITORY")
311 ),
312 }
313 }
314}
315
316impl PubChemClientBuilder {
317 pub fn requests_per_second(mut self, n: u32) -> Self {
319 self.requests_per_second = n.max(1);
320 self
321 }
322
323 pub fn cache_capacity(mut self, n: u64) -> Self {
325 self.cache_capacity = n;
326 self
327 }
328
329 pub fn cache_ttl(mut self, ttl: Duration) -> Self {
331 self.cache_ttl = ttl;
332 self
333 }
334
335 pub fn base_url(mut self, url: impl Into<String>) -> Self {
337 self.base_url = url.into();
338 self
339 }
340
341 pub fn build(self) -> PubChemClient {
343 let quota = Quota::per_second(
344 NonZeroU32::new(self.requests_per_second)
345 .expect("requests_per_second must be ≥ 1"),
346 );
347
348 PubChemClient {
349 http: reqwest::Client::builder()
350 .user_agent(self.user_agent)
351 .build()
352 .expect("failed to build reqwest::Client"),
353 cache: Cache::builder()
354 .max_capacity(self.cache_capacity)
355 .time_to_live(self.cache_ttl)
356 .build(),
357 limiter: Arc::new(RateLimiter::direct(quota)),
358 base_url: self.base_url,
359 }
360 }
361}
362
363#[derive(Deserialize)]
366struct PugPropertyResponse {
367 #[serde(rename = "PropertyTable")]
368 property_table: PropertyTable,
369}
370
371#[derive(Deserialize)]
372struct PropertyTable {
373 #[serde(rename = "Properties")]
374 properties: Vec<CompoundProperty>,
375}
376
377#[derive(Deserialize)]
378struct CompoundProperty {
379 #[serde(rename = "CID")]
380 cid: u64,
381 #[serde(rename = "IUPACName")]
382 iupac_name: Option<String>,
383 #[serde(rename = "CanonicalSMILES")]
384 canonical_smiles: Option<String>,
385 #[serde(rename = "InChI")]
386 in_chi: Option<String>,
387 #[serde(rename = "InChIKey")]
388 in_chi_key: Option<String>,
389 #[serde(rename = "MolecularFormula")]
390 molecular_formula: Option<String>,
391 #[serde(rename = "MolecularWeight")]
393 molecular_weight: Option<String>,
394}
395
#[cfg(test)]
mod tests {
    use super::*;

    // Sodium hydroxide reference data shared by several tests.
    const NAOH_CAS: &str = "1310-73-2";
    const NAOH_CID: u64 = 14798;
    const NAOH_SMILES: &str = "[Na+].[OH-]";
    const NAOH_INCHI_KEY: &str = "HEMHJVSKTPXQMS-UHFFFAOYSA-M";
    const NAOH_NAME: &str = "sodium hydroxide";

    #[test]
    fn client_builds_with_defaults() {
        assert_eq!(PubChemClient::new().base_url, BASE_URL);
    }

    #[test]
    fn builder_overrides_base_url() {
        let custom = "http://localhost:8080";
        let client = PubChemClient::builder().base_url(custom).build();
        assert_eq!(client.base_url, custom);
    }

    #[test]
    fn pick_namespace_cas_first() {
        // CAS wins even though a SMILES string is present as well.
        let id = SubstanceIdentifier {
            cas: Some(NAOH_CAS.to_string()),
            smiles: Some(NAOH_SMILES.to_string()),
            ..Default::default()
        };

        let picked = PubChemClient::pick_namespace(&id);

        assert_eq!(picked, Some(("name", NAOH_CAS.to_string())));
    }

    #[test]
    fn pick_namespace_inchikey_when_no_cas() {
        let id = SubstanceIdentifier {
            inchi_key: Some(NAOH_INCHI_KEY.to_string()),
            ..Default::default()
        };

        let picked = PubChemClient::pick_namespace(&id);

        assert_eq!(picked, Some(("inchikey", NAOH_INCHI_KEY.to_string())));
    }

    #[test]
    fn pick_namespace_returns_none_for_empty_id() {
        let empty = SubstanceIdentifier::default();
        assert_eq!(PubChemClient::pick_namespace(&empty), None);
    }

    #[test]
    fn apply_to_fills_missing_fields_only() {
        let naoh = PubChemCompound {
            cid: NAOH_CID,
            iupac_name: Some(NAOH_NAME.to_string()),
            canonical_smiles: Some(NAOH_SMILES.to_string()),
            inchi: Some("InChI=1S/Na.H2O/h;1H/q+1;/p-1".to_string()),
            inchi_key: Some(NAOH_INCHI_KEY.to_string()),
            molecular_formula: Some("HNaO".to_string()),
            molecular_weight: Some(39.997),
        };
        let mut target = SubstanceIdentifier {
            cas: Some(NAOH_CAS.to_string()),
            smiles: Some("existing".to_string()),
            ..Default::default()
        };

        naoh.apply_to(&mut target);

        assert_eq!(target.cid, Some(NAOH_CID));
        // A value that was already present must survive untouched.
        assert_eq!(target.smiles.as_deref(), Some("existing"));
        // Fields that were empty are filled from the compound.
        assert_eq!(target.iupac_name.as_deref(), Some(NAOH_NAME));
        assert_eq!(target.inchi_key.as_deref(), Some(NAOH_INCHI_KEY));
    }

    #[tokio::test]
    #[ignore = "requires internet access"]
    async fn integration_lookup_naoh_by_cas() {
        let id = SubstanceIdentifier::from_cas(NAOH_CAS);

        let found = PubChemClient::new().lookup(&id).await.unwrap();

        assert_eq!(found.cid, NAOH_CID);
        assert_eq!(found.canonical_smiles.as_deref(), Some(NAOH_SMILES));
        assert_eq!(found.iupac_name.as_deref(), Some(NAOH_NAME));
    }

    #[tokio::test]
    #[ignore = "requires internet access"]
    async fn integration_enrich_fills_smiles() {
        let mut id = SubstanceIdentifier::from_cas("67-64-1");

        PubChemClient::new().enrich(&mut id).await.unwrap();

        assert!(id.smiles.is_some());
        assert!(id.cid.is_some());
        assert!(id.iupac_name.is_some());
    }
}