1pub use papers_datalab::ProcessingMode;
2use papers_datalab::{DatalabClient, MarkerRequest, OutputFormat};
3use papers_openalex::{GetParams, OpenAlexClient, Work};
4use papers_zotero::{ItemListParams, ZoteroClient};
5use serde::Serialize;
6use std::path::PathBuf;
7
/// Where the PDF bytes (or extracted text) ultimately came from.
/// Serialized with an internal `"type"` tag in snake_case.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum PdfSource {
    /// Read from the local Zotero storage directory at `path`.
    ZoteroLocal { path: String },
    /// Downloaded through the Zotero web API for attachment `item_key`.
    ZoteroRemote { item_key: String },
    /// Downloaded directly from a whitelisted open-access URL.
    DirectUrl { url: String },
    /// Fetched from the OpenAlex content endpoint.
    OpenAlexContent,
    /// Text produced by the DataLab Marker conversion service.
    DataLab,
}
18
/// Extracted full text of a work plus provenance metadata.
#[derive(Debug, Clone, Serialize)]
pub struct WorkTextResult {
    /// The extracted text (plain text, or Markdown when DataLab was used).
    pub text: String,
    /// Which source supplied the PDF/text.
    pub source: PdfSource,
    /// OpenAlex work ID as returned by the API (URL form).
    pub work_id: String,
    /// Work title, when known.
    pub title: Option<String>,
    /// DOI as returned by OpenAlex (URL form), when known.
    pub doi: Option<String>,
}
28
/// Errors produced while locating a PDF and extracting a work's text.
#[derive(Debug, thiserror::Error)]
pub enum WorkTextError {
    /// Failure talking to the OpenAlex API.
    #[error("OpenAlex error: {0}")]
    OpenAlex(#[from] papers_openalex::OpenAlexError),

    /// Error propagated from `crate::filter`.
    #[error("Filter error: {0}")]
    Filter(#[from] crate::filter::FilterError),

    /// Failure talking to the Zotero API.
    #[error("Zotero error: {0}")]
    Zotero(#[from] papers_zotero::ZoteroError),

    /// Underlying HTTP transport failure (reqwest).
    #[error("HTTP error: {0}")]
    Http(#[from] reqwest::Error),

    /// Local PDF-to-text extraction failed; carries the message from
    /// `pdf_extract` (or a local-file read failure).
    #[error("PDF extraction error: {0}")]
    PdfExtract(String),

    /// Error from the DataLab conversion service, shown verbatim.
    #[error(transparent)]
    DataLab(#[from] papers_datalab::DatalabError),

    /// No PDF could be located from any source.
    /// The message appends " (<title>)" when a title is known.
    #[error("No PDF found for work {work_id}{}", title.as_ref().map(|t| format!(" ({})", t)).unwrap_or_default())]
    NoPdfFound {
        work_id: String,
        title: Option<String>,
        doi: Option<String>,
    },
}
57
/// Domains from which PDF URLs are fetched directly. Open-access hosts
/// that serve actual PDFs (rather than HTML landing pages or paywalls);
/// URLs on other domains are skipped by `try_direct_urls`.
const DIRECT_PDF_DOMAINS: &[&str] = &[
    "arxiv.org",
    "europepmc.org",
    "biorxiv.org",
    "medrxiv.org",
    "ncbi.nlm.nih.gov",
    "peerj.com",
    "mdpi.com",
    "frontiersin.org",
    "plos.org",
];
70
/// Public wrapper around [`extract_text`]: extracts plain text from
/// in-memory PDF bytes.
///
/// # Errors
/// Returns `WorkTextError::PdfExtract` when the PDF cannot be parsed.
pub fn extract_text_bytes(pdf_bytes: &[u8]) -> Result<String, WorkTextError> {
    extract_text(pdf_bytes)
}
75
76fn extract_text(pdf_bytes: &[u8]) -> Result<String, WorkTextError> {
77 pdf_extract::extract_text_from_mem(pdf_bytes)
78 .map_err(|e| WorkTextError::PdfExtract(e.to_string()))
79}
80
/// Strips the `https://doi.org/` prefix from a DOI URL, yielding the bare
/// DOI. Inputs without that prefix are returned unchanged.
fn bare_doi(doi: &str) -> &str {
    match doi.strip_prefix("https://doi.org/") {
        Some(bare) => bare,
        None => doi,
    }
}
85
/// Reduces a full OpenAlex ID URL (`https://openalex.org/W…`) to its short
/// form (`W…`); already-short IDs pass through untouched.
fn short_openalex_id(full_id: &str) -> &str {
    if let Some(short) = full_id.strip_prefix("https://openalex.org/") {
        short
    } else {
        full_id
    }
}
92
93fn is_whitelisted_url(url: &str) -> bool {
95 DIRECT_PDF_DOMAINS
96 .iter()
97 .any(|domain| url.contains(domain))
98}
99
100fn zotero_data_dir() -> Option<PathBuf> {
102 if let Ok(dir) = std::env::var("ZOTERO_DATA_DIR") {
103 return Some(PathBuf::from(dir));
104 }
105 dirs::home_dir().map(|h| h.join("Zotero"))
106}
107
108fn collect_pdf_urls(work: &Work) -> Vec<String> {
110 let mut urls = Vec::new();
111
112 if let Some(loc) = &work.best_oa_location {
113 if let Some(url) = &loc.pdf_url {
114 urls.push(url.clone());
115 }
116 }
117 if let Some(loc) = &work.primary_location {
118 if let Some(url) = &loc.pdf_url {
119 if !urls.contains(url) {
120 urls.push(url.clone());
121 }
122 }
123 }
124 if let Some(locations) = &work.locations {
125 for loc in locations {
126 if let Some(url) = &loc.pdf_url {
127 if !urls.contains(url) {
128 urls.push(url.clone());
129 }
130 }
131 }
132 }
133
134 urls
135}
136
/// Summary of a Zotero library item matched to an OpenAlex work.
#[derive(Debug, Clone, Serialize)]
pub struct ZoteroItemInfo {
    /// Zotero item key.
    pub key: String,
    /// Zotero item type (e.g. journal article).
    pub item_type: String,
    /// Tag names attached to the item.
    pub tags: Vec<String>,
    /// True when the item has a stored ("imported_*") PDF attachment.
    pub has_pdf: bool,
    /// Date the item was added to the library, when reported.
    pub date_added: Option<String>,
    /// `zotero://select/...` URI for jumping to the item in Zotero.
    pub uri: String,
}
147
148pub async fn find_work_in_zotero(
153 zotero: &ZoteroClient,
154 work: &papers_openalex::Work,
155) -> Result<Option<ZoteroItemInfo>, papers_zotero::ZoteroError> {
156 let doi = match &work.doi {
157 Some(d) => bare_doi(d),
158 None => return Ok(None),
159 };
160 let title = work.display_name.as_deref().or(work.title.as_deref());
161
162 let t_search = std::time::Instant::now();
166 let items: Vec<papers_zotero::Item> = if let Some(t) = title {
167 let title_params = ItemListParams::builder().q(t).build();
168 let res = zotero.list_top_items(&title_params).await?;
169 eprintln!("[timing] zotero title search ({} results): {:?}", res.items.len(), t_search.elapsed());
170 res.items
171 } else {
172 eprintln!("[timing] zotero: no title, skipping search");
173 return Ok(None);
174 };
175
176 for item in &items {
177 let item_doi = match &item.data.doi {
178 Some(d) => d,
179 None => continue,
180 };
181 if !item_doi.eq_ignore_ascii_case(doi) {
182 continue;
183 }
184
185 let t_children = std::time::Instant::now();
186 let children = zotero
187 .list_item_children(&item.key, &ItemListParams::default())
188 .await?;
189 eprintln!("[timing] zotero list_item_children: {:?}", t_children.elapsed());
190 let has_pdf = children.items.iter().any(|child| {
191 child.data.content_type.as_deref() == Some("application/pdf")
192 && matches!(
193 child.data.link_mode.as_deref(),
194 Some("imported_file" | "imported_url")
195 )
196 });
197
198 let tags: Vec<String> = item.data.tags.iter().map(|t| t.tag.clone()).collect();
199 let uri = format!("zotero://select/library/items/{}", item.key);
200 return Ok(Some(ZoteroItemInfo {
201 key: item.key.clone(),
202 item_type: item.data.item_type.clone(),
203 tags,
204 has_pdf,
205 date_added: item.data.date_added.clone(),
206 uri,
207 }));
208 }
209
210 Ok(None)
211}
212
/// Searches the Zotero library for an item matching `doi` and returns the
/// bytes of its first usable PDF attachment, tagged with its source.
///
/// Queries Zotero full-text search ("everything" mode) by `title` (when
/// given) and then by the DOI string; only items whose DOI field equals
/// `doi` (case-insensitively) are considered. For each PDF attachment the
/// local Zotero storage directory is tried first, falling back to a
/// download through the Zotero API. Returns `Ok(None)` when nothing
/// suitable is found.
pub async fn try_zotero(
    zotero: &ZoteroClient,
    doi: &str,
    title: Option<&str>,
) -> Result<Option<(Vec<u8>, PdfSource)>, WorkTextError> {
    // Title query first (better recall), then the DOI as a second query.
    let mut candidate_queries: Vec<String> = Vec::new();
    if let Some(t) = title {
        candidate_queries.push(t.to_string());
    }
    candidate_queries.push(doi.to_string());

    for query in &candidate_queries {
        let params = ItemListParams::builder()
            .q(query.as_str())
            .qmode("everything")
            .build();

        let results = zotero.list_top_items(&params).await?;
        if results.items.is_empty() {
            continue;
        }

        for item in &results.items {
            // The search is fuzzy; only trust exact DOI matches.
            let item_doi = match &item.data.doi {
                Some(d) => d,
                None => continue,
            };
            if !item_doi.eq_ignore_ascii_case(doi) {
                continue;
            }

            let children = zotero
                .list_item_children(&item.key, &ItemListParams::default())
                .await?;

            for child in &children.items {
                let is_pdf = child
                    .data
                    .content_type
                    .as_deref()
                    == Some("application/pdf");
                // "imported_*" link modes mean Zotero stores the file
                // itself, as opposed to a mere link attachment.
                let has_local_file = matches!(
                    child.data.link_mode.as_deref(),
                    Some("imported_file" | "imported_url")
                );

                if !is_pdf || !has_local_file {
                    continue;
                }

                // Fast path: read straight from Zotero's on-disk layout,
                // <data_dir>/storage/<attachment_key>/<filename>.
                if let Some(filename) = &child.data.filename {
                    if let Some(data_dir) = zotero_data_dir() {
                        let local_path = data_dir
                            .join("storage")
                            .join(&child.key)
                            .join(filename);
                        if local_path.exists() {
                            let bytes = tokio::fs::read(&local_path)
                                .await
                                .map_err(|e| WorkTextError::PdfExtract(format!("Failed to read local file: {e}")))?;
                            return Ok(Some((
                                bytes,
                                PdfSource::ZoteroLocal {
                                    path: local_path.to_string_lossy().into_owned(),
                                },
                            )));
                        }
                    }
                }

                // Slow path: download via the Zotero API. Errors and empty
                // payloads fall through to the next attachment.
                match zotero.download_item_file(&child.key).await {
                    Ok(bytes) if !bytes.is_empty() => {
                        return Ok(Some((
                            bytes,
                            PdfSource::ZoteroRemote {
                                item_key: child.key.clone(),
                            },
                        )));
                    }
                    _ => continue,
                }
            }
        }
    }

    Ok(None)
}
309
310async fn try_direct_urls(
312 http: &reqwest::Client,
313 urls: &[String],
314) -> Result<Option<(Vec<u8>, PdfSource)>, WorkTextError> {
315 for url in urls {
316 if !is_whitelisted_url(url) {
317 continue;
318 }
319
320 let resp = http
321 .get(url)
322 .header(
323 "User-Agent",
324 "papers-mcp/0.1 (https://github.com/mmgeorge/papers; mailto:papers@example.com)",
325 )
326 .send()
327 .await;
328
329 let resp = match resp {
330 Ok(r) if r.status().is_success() => r,
331 _ => continue,
332 };
333
334 let is_pdf = resp
336 .headers()
337 .get("content-type")
338 .and_then(|v| v.to_str().ok())
339 .is_some_and(|ct| ct.contains("application/pdf"));
340
341 if !is_pdf {
342 continue;
343 }
344
345 let bytes = resp.bytes().await?.to_vec();
346 if !bytes.is_empty() {
347 return Ok(Some((
348 bytes,
349 PdfSource::DirectUrl { url: url.clone() },
350 )));
351 }
352 }
353
354 Ok(None)
355}
356
357async fn try_openalex_content(
359 http: &reqwest::Client,
360 work: &Work,
361) -> Result<Option<(Vec<u8>, PdfSource)>, WorkTextError> {
362 let has_pdf = work
363 .has_content
364 .as_ref()
365 .and_then(|hc| hc.pdf)
366 .unwrap_or(false);
367
368 if !has_pdf {
369 return Ok(None);
370 }
371
372 let api_key = match std::env::var("OPENALEX_API_KEY") {
373 Ok(key) if !key.is_empty() => key,
374 _ => return Ok(None),
375 };
376
377 let short_id = short_openalex_id(&work.id);
378 let url = format!(
379 "https://content.openalex.org/works/{}.pdf?api_key={}",
380 short_id, api_key
381 );
382
383 let resp = http.get(&url).send().await;
384
385 let resp = match resp {
386 Ok(r) if r.status().is_success() => r,
387 _ => return Ok(None),
388 };
389
390 let bytes = resp.bytes().await?.to_vec();
391 if !bytes.is_empty() {
392 return Ok(Some((bytes, PdfSource::OpenAlexContent)));
393 }
394
395 Ok(None)
396}
397
398async fn do_extract(
400 pdf_bytes: Vec<u8>,
401 short_id: &str,
402 datalab: Option<(&DatalabClient, ProcessingMode)>,
403 source: &mut PdfSource,
404) -> Result<String, WorkTextError> {
405 if let Some((dl, mode)) = datalab {
406 let dl_result = dl
407 .convert_document(MarkerRequest {
408 file: Some(pdf_bytes),
409 filename: Some(format!("{}.pdf", short_id)),
410 output_format: OutputFormat::Markdown,
411 mode,
412 ..Default::default()
413 })
414 .await?;
415 *source = PdfSource::DataLab;
416 Ok(dl_result.markdown.unwrap_or_default())
417 } else {
418 extract_text(&pdf_bytes)
419 }
420}
421
/// Fetches the full text of an OpenAlex work, trying PDF sources in
/// priority order: the Zotero library, whitelisted direct OA URLs, then
/// the OpenAlex content endpoint.
///
/// When `datalab` is supplied, PDF bytes are converted to Markdown via the
/// DataLab Marker service; otherwise text is extracted locally.
///
/// # Errors
/// `WorkTextError::NoPdfFound` when every source comes up empty, or the
/// first fetch/extraction error encountered along the way.
pub async fn work_text(
    openalex: &OpenAlexClient,
    zotero: Option<&ZoteroClient>,
    datalab: Option<(&DatalabClient, ProcessingMode)>,
    work_id: &str,
) -> Result<WorkTextResult, WorkTextError> {
    let work = crate::api::work_get(openalex, work_id, &GetParams::default()).await?;

    let title = work.title.clone().or_else(|| work.display_name.clone());
    let doi_raw = work.doi.as_deref();
    let doi = doi_raw.map(bare_doi);
    let short_id = short_openalex_id(&work.id);

    let http = reqwest::Client::new();

    // Source 1: the user's Zotero library (requires a DOI to match on).
    if let (Some(zotero), Some(doi)) = (zotero, doi) {
        if let Some((bytes, mut source)) = try_zotero(zotero, doi, title.as_deref()).await? {
            let text = do_extract(bytes, short_id, datalab, &mut source).await?;
            return Ok(WorkTextResult {
                text,
                source,
                work_id: work.id.clone(),
                title,
                doi: doi_raw.map(String::from),
            });
        }
    }

    // Source 2: open-access PDF URLs advertised by OpenAlex, limited to
    // whitelisted domains known to serve PDFs directly.
    let pdf_urls = collect_pdf_urls(&work);
    if let Some((bytes, mut source)) = try_direct_urls(&http, &pdf_urls).await? {
        let text = do_extract(bytes, short_id, datalab, &mut source).await?;
        return Ok(WorkTextResult {
            text,
            source,
            work_id: work.id.clone(),
            title,
            doi: doi_raw.map(String::from),
        });
    }

    // Source 3: the API-key-gated OpenAlex content endpoint.
    if let Some((bytes, mut source)) = try_openalex_content(&http, &work).await? {
        let text = do_extract(bytes, short_id, datalab, &mut source).await?;
        return Ok(WorkTextResult {
            text,
            source,
            work_id: work.id.clone(),
            title,
            doi: doi_raw.map(String::from),
        });
    }

    Err(WorkTextError::NoPdfFound {
        work_id: work.id.clone(),
        title,
        doi: doi_raw.map(String::from),
    })
}
495
496pub async fn poll_zotero_for_work(
501 zotero: &ZoteroClient,
502 work_id: &str,
503 title: Option<&str>,
504 doi: &str,
505) -> Result<WorkTextResult, WorkTextError> {
506 tokio::time::sleep(std::time::Duration::from_secs(5)).await;
508
509 for _ in 0..55 {
510 if let Some((bytes, source)) = try_zotero(zotero, doi, title).await? {
511 let text = extract_text(&bytes)?;
512 return Ok(WorkTextResult {
513 text,
514 source,
515 work_id: work_id.to_string(),
516 title: title.map(String::from),
517 doi: Some(doi.to_string()),
518 });
519 }
520 tokio::time::sleep(std::time::Duration::from_secs(2)).await;
521 }
522
523 Err(WorkTextError::NoPdfFound {
524 work_id: work_id.to_string(),
525 title: title.map(String::from),
526 doi: Some(doi.to_string()),
527 })
528}
529
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bare_doi() {
        let cases = [
            ("https://doi.org/10.1234/test", "10.1234/test"),
            ("10.1234/test", "10.1234/test"),
        ];
        for (input, expected) in cases {
            assert_eq!(bare_doi(input), expected);
        }
    }

    #[test]
    fn test_short_openalex_id() {
        let cases = [
            ("https://openalex.org/W2741809807", "W2741809807"),
            ("W2741809807", "W2741809807"),
        ];
        for (input, expected) in cases {
            assert_eq!(short_openalex_id(input), expected);
        }
    }

    #[test]
    fn test_is_whitelisted_url() {
        let accepted = [
            "https://arxiv.org/pdf/2301.12345",
            "https://europepmc.org/articles/PMC123/pdf",
            "https://www.biorxiv.org/content/pdf",
            "https://www.mdpi.com/some/pdf",
        ];
        for url in accepted {
            assert!(is_whitelisted_url(url), "expected whitelist hit: {url}");
        }

        let rejected = ["https://evil.com/pdf", "https://publisher.com/paper.pdf"];
        for url in rejected {
            assert!(!is_whitelisted_url(url), "expected whitelist miss: {url}");
        }
    }

    #[test]
    fn test_collect_pdf_urls_empty() {
        let bare: Work = serde_json::from_str(r#"{"id": "https://openalex.org/W1"}"#).unwrap();
        assert!(collect_pdf_urls(&bare).is_empty());
    }

    #[test]
    fn test_collect_pdf_urls_deduplicates() {
        let work: Work = serde_json::from_value(serde_json::json!({
            "id": "https://openalex.org/W1",
            "best_oa_location": { "pdf_url": "https://arxiv.org/pdf/1234" },
            "primary_location": { "pdf_url": "https://arxiv.org/pdf/1234" },
            "locations": [
                { "pdf_url": "https://arxiv.org/pdf/1234" },
                { "pdf_url": "https://europepmc.org/pdf/5678" }
            ]
        }))
        .unwrap();
        assert_eq!(
            collect_pdf_urls(&work),
            vec![
                "https://arxiv.org/pdf/1234".to_string(),
                "https://europepmc.org/pdf/5678".to_string(),
            ]
        );
    }
}