1pub use papers_datalab::ProcessingMode;
2use base64::Engine as _;
3use papers_datalab::{DatalabClient, MarkerRequest, OutputFormat};
4use papers_openalex::{GetParams, OpenAlexClient, Work};
5use papers_zotero::{ItemListParams, ZoteroClient};
6use serde::{Deserialize, Serialize};
7use std::path::PathBuf;
8
/// Where the PDF bytes (or extracted text) for a work were obtained.
///
/// Internally tagged for serialization, e.g.
/// `{"type": "zotero_local", "path": "..."}`.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum PdfSource {
    /// PDF read directly from the local Zotero storage directory.
    ZoteroLocal { path: String },
    /// PDF downloaded from the Zotero web API; `item_key` is the
    /// attachment (child) item key.
    ZoteroRemote { item_key: String },
    /// PDF fetched over HTTP from a whitelisted open-access URL.
    DirectUrl { url: String },
    /// PDF fetched from the OpenAlex content API.
    OpenAlexContent,
    /// Text produced by a DataLab (marker) extraction rather than a local
    /// PDF parse; set by `do_extract` when DataLab output is returned.
    DataLab,
}
19
/// Full text of a work plus provenance and identifying metadata.
#[derive(Debug, Clone, Serialize)]
pub struct WorkTextResult {
    /// Extracted text (plain text or DataLab markdown, per `source`).
    pub text: String,
    /// Where the underlying PDF / extraction came from.
    pub source: PdfSource,
    /// Full OpenAlex work id (URL form, e.g. `https://openalex.org/W…`).
    pub work_id: String,
    pub title: Option<String>,
    /// DOI as reported by OpenAlex (URL form, not stripped).
    pub doi: Option<String>,
}
29
/// Errors produced while locating, downloading, or extracting a work's text.
#[derive(Debug, thiserror::Error)]
pub enum WorkTextError {
    #[error("OpenAlex error: {0}")]
    OpenAlex(#[from] papers_openalex::OpenAlexError),

    #[error("Filter error: {0}")]
    Filter(#[from] crate::filter::FilterError),

    #[error("Zotero error: {0}")]
    Zotero(#[from] papers_zotero::ZoteroError),

    #[error("HTTP error: {0}")]
    Http(#[from] reqwest::Error),

    /// Local PDF parsing or cache I/O failure (message is free-form).
    #[error("PDF extraction error: {0}")]
    PdfExtract(String),

    #[error(transparent)]
    DataLab(#[from] papers_datalab::DatalabError),

    /// Every PDF source (Zotero, direct URLs, OpenAlex content) came up
    /// empty; the title, when present, is appended in parentheses.
    #[error("No PDF found for work {work_id}{}", title.as_ref().map(|t| format!(" ({})", t)).unwrap_or_default())]
    NoPdfFound {
        work_id: String,
        title: Option<String>,
        doi: Option<String>,
    },

    /// The id is not an 8-character uppercase-alphanumeric Zotero key
    /// (see `is_valid_zotero_key`).
    #[error("Invalid Zotero item key: {0}")]
    InvalidZoteroKey(String),
}
61
/// Domains trusted for direct PDF download (open-access publishers and
/// preprint servers). Consulted by `is_whitelisted_url` to gate which
/// OpenAlex `pdf_url` candidates may be fetched directly.
const DIRECT_PDF_DOMAINS: &[&str] = &[
    "arxiv.org",
    "europepmc.org",
    "biorxiv.org",
    "medrxiv.org",
    "ncbi.nlm.nih.gov",
    "peerj.com",
    "mdpi.com",
    "frontiersin.org",
    "plos.org",
];
74
75pub fn extract_text_bytes(pdf_bytes: &[u8]) -> Result<String, WorkTextError> {
77 extract_text(pdf_bytes)
78}
79
80fn extract_text(pdf_bytes: &[u8]) -> Result<String, WorkTextError> {
81 pdf_extract::extract_text_from_mem(pdf_bytes)
82 .map_err(|e| WorkTextError::PdfExtract(e.to_string()))
83}
84
/// Reduces a DOI URL (`https://doi.org/10.x/y`) to the bare identifier;
/// anything without that prefix is returned unchanged.
fn bare_doi(doi: &str) -> &str {
    match doi.strip_prefix("https://doi.org/") {
        Some(bare) => bare,
        None => doi,
    }
}
89
/// Reduces a full OpenAlex id URL (`https://openalex.org/W…`) to the short
/// form (`W…`); ids already in short form pass through untouched.
fn short_openalex_id(full_id: &str) -> &str {
    match full_id.strip_prefix("https://openalex.org/") {
        Some(short) => short,
        None => full_id,
    }
}
96
97fn is_whitelisted_url(url: &str) -> bool {
99 DIRECT_PDF_DOMAINS
100 .iter()
101 .any(|domain| url.contains(domain))
102}
103
104fn zotero_data_dir() -> Option<PathBuf> {
106 if let Ok(dir) = std::env::var("ZOTERO_DATA_DIR") {
107 return Some(PathBuf::from(dir));
108 }
109 dirs::home_dir().map(|h| h.join("Zotero"))
110}
111
112fn datalab_cache_dir(short_id: &str) -> Option<PathBuf> {
113 if let Ok(base) = std::env::var("PAPERS_DATALAB_CACHE_DIR") {
114 return Some(PathBuf::from(base).join(short_id));
115 }
116 dirs::cache_dir().map(|d| d.join("papers").join("datalab").join(short_id))
117}
118
/// Uploads the locally cached DataLab extraction for `item_key` to Zotero
/// as a `papers_extract_<key>.zip` attachment on that item.
///
/// # Errors
/// Returns `PdfExtract` when the cache directory cannot be determined or
/// when no cached markdown exists for the item; otherwise propagates any
/// Zotero error from `upload_papers_zip`.
pub async fn upload_extraction_to_zotero(
    zc: &ZoteroClient,
    item_key: &str,
) -> Result<(), WorkTextError> {
    let dir = datalab_cache_dir(item_key)
        .ok_or_else(|| WorkTextError::PdfExtract("cannot determine cache directory".into()))?;
    // Only upload when a completed extraction (the markdown file) exists.
    if !dir.join(format!("{item_key}.md")).exists() {
        return Err(WorkTextError::PdfExtract(format!("no local cache for {item_key}")));
    }
    // The item key doubles as both the attachment parent and the cache id.
    upload_papers_zip(zc, item_key, &dir, item_key).await
}
137
/// Downloads the `papers_extract` zip attachment `att_key` from Zotero and
/// unpacks it into the local DataLab cache directory for `item_key`.
///
/// # Errors
/// Propagates Zotero download errors; returns `PdfExtract` for an empty
/// download, an unresolvable cache directory, or an unzip failure.
pub async fn download_extraction_from_zotero(
    zc: &ZoteroClient,
    att_key: &str,
    item_key: &str,
) -> Result<(), WorkTextError> {
    let zip_bytes = zc.download_item_file(att_key).await?;
    // An empty body means the attachment has no stored file content.
    if zip_bytes.is_empty() {
        return Err(WorkTextError::PdfExtract(format!("empty download for {item_key}")));
    }
    let dir = datalab_cache_dir(item_key)
        .ok_or_else(|| WorkTextError::PdfExtract("cannot determine cache directory".into()))?;
    unzip_to_cache_dir(&zip_bytes, &dir).map_err(|e| WorkTextError::PdfExtract(e.to_string()))
}
155
156pub fn datalab_cached_markdown(cache_id: &str) -> Option<String> {
158 let dir = datalab_cache_dir(cache_id)?;
159 std::fs::read_to_string(dir.join(format!("{cache_id}.md"))).ok()
160}
161
162pub fn datalab_cached_item_keys() -> Vec<String> {
167 let base = if let Ok(base_str) = std::env::var("PAPERS_DATALAB_CACHE_DIR") {
168 PathBuf::from(base_str)
169 } else {
170 match dirs::cache_dir() {
171 Some(d) => d.join("papers").join("datalab"),
172 None => return vec![],
173 }
174 };
175 if !base.is_dir() {
176 return vec![];
177 }
178 let mut keys = Vec::new();
179 if let Ok(entries) = std::fs::read_dir(&base) {
180 for entry in entries.flatten() {
181 let key = match entry.file_name().to_str() {
182 Some(k) => k.to_string(),
183 None => continue,
184 };
185 if entry.path().join(format!("{key}.md")).exists() {
186 keys.push(key);
187 }
188 }
189 }
190 keys
191}
192
193pub fn datalab_cached_json(cache_id: &str) -> Option<String> {
195 let dir = datalab_cache_dir(cache_id)?;
196 std::fs::read_to_string(dir.join(format!("{cache_id}.json"))).ok()
197}
198
/// Public accessor for the per-item DataLab cache directory path
/// (see `datalab_cache_dir`); does not create the directory.
pub fn datalab_cache_dir_path(cache_id: &str) -> Option<std::path::PathBuf> {
    datalab_cache_dir(cache_id)
}
203
/// Metadata sidecar (`meta.json`) written next to a DataLab extraction.
///
/// Bibliographic fields are copied from the Zotero item when a client is
/// available at extraction time; everything except `item_key` is optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionMeta {
    /// Zotero item key (or OpenAlex short id) used as the cache id.
    pub item_key: String,
    /// Value of `ZOTERO_USER_ID` at extraction time, when set.
    pub zotero_user_id: Option<String>,
    pub title: Option<String>,
    /// Author display names ("First Last"), authors only — other creator
    /// types are excluded.
    pub authors: Option<Vec<String>>,
    pub item_type: Option<String>,
    pub date: Option<String>,
    pub doi: Option<String>,
    pub url: Option<String>,
    pub publication_title: Option<String>,
    /// ISO-8601 UTC timestamp of the extraction (see `iso_now`).
    pub extracted_at: Option<String>,
    /// DataLab `ProcessingMode`, serialized to its string form.
    pub processing_mode: Option<String>,
    /// Serialized `PdfSource` describing where the PDF came from.
    pub pdf_source: Option<serde_json::Value>,
}
223
224pub fn read_extraction_meta(cache_id: &str) -> Option<ExtractionMeta> {
226 let dir = datalab_cache_dir(cache_id)?;
227 let bytes = std::fs::read(dir.join("meta.json")).ok()?;
228 serde_json::from_slice(&bytes).ok()
229}
230
/// Formats the current system time as an ISO-8601 UTC timestamp
/// (`YYYY-MM-DDTHH:MM:SSZ`) without a date-time dependency.
///
/// The days→calendar conversion follows Howard Hinnant's `civil_from_days`
/// algorithm (proleptic Gregorian, 400-year eras). Leap seconds are
/// ignored; a clock before the Unix epoch degrades to the epoch itself
/// (the `duration_since` error maps to 0 seconds).
fn iso_now() -> String {
    let secs = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_secs())
        .unwrap_or(0);
    let days = (secs / 86400) as i64;
    // Shift epoch from 1970-01-01 to 0000-03-01 so leap days fall at the
    // end of the internal year.
    let z = days + 719_468;
    // era: 400-year Gregorian cycle index; doe: day-of-era [0, 146096].
    let era = if z >= 0 { z } else { z - 146_096 } / 146_097;
    let doe = (z - era * 146_097) as u64;
    // yoe: year-of-era [0, 399], correcting for 4/100/400-year leap rules.
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365;
    let y = yoe as i64 + era * 400;
    // doy: day-of-year in the March-based year; mp: March-based month.
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100);
    let mp = (5 * doy + 2) / 153;
    let d = doy - (153 * mp + 2) / 5 + 1;
    // Convert March-based month (0 = March) to the civil month (1-12).
    let m = if mp < 10 { mp + 3 } else { mp - 9 };
    // January/February belong to the next civil year.
    let y = if m <= 2 { y + 1 } else { y };
    let hh = (secs % 86_400) / 3_600;
    let mm = (secs % 3_600) / 60;
    let ss = secs % 60;
    format!("{y:04}-{m:02}-{d:02}T{hh:02}:{mm:02}:{ss:02}Z")
}
254
/// Writes `meta.json` into the extraction cache directory (best effort).
///
/// Always records the cache id, extraction timestamp, processing mode and
/// original PDF source; when a Zotero client is supplied, bibliographic
/// fields (title, authors, DOI, …) are copied from the Zotero item. All
/// failures — Zotero lookup, serialization, file write — are silently
/// ignored, since the metadata is advisory only.
async fn write_extraction_meta(
    dir: &std::path::Path,
    item_key: &str,
    zotero: Option<&ZoteroClient>,
    mode_str: Option<&str>,
    pdf_source: Option<&PdfSource>,
) {
    let mut meta = ExtractionMeta {
        item_key: item_key.to_string(),
        zotero_user_id: std::env::var("ZOTERO_USER_ID").ok(),
        title: None,
        authors: None,
        item_type: None,
        date: None,
        doi: None,
        url: None,
        publication_title: None,
        extracted_at: Some(iso_now()),
        processing_mode: mode_str.map(String::from),
        pdf_source: pdf_source.and_then(|s| serde_json::to_value(s).ok()),
    };

    // Enrich with bibliographic data when the Zotero item is reachable.
    if let Some(zc) = zotero {
        if let Ok(item) = zc.get_item(item_key).await {
            meta.title = item.data.title;
            meta.item_type = Some(item.data.item_type);
            meta.date = item.data.date;
            meta.doi = item.data.doi;
            meta.url = item.data.url;
            meta.publication_title = item.data.publication_title;
            // Authors only: format "First Last", falling back to the last
            // name alone, then to the single `name` field.
            let authors: Vec<String> = item
                .data
                .creators
                .iter()
                .filter(|c| c.creator_type == "author")
                .map(|c| match (&c.first_name, &c.last_name, &c.name) {
                    (Some(f), Some(l), _) if !l.is_empty() => format!("{f} {l}"),
                    (_, Some(l), _) if !l.is_empty() => l.clone(),
                    (_, _, Some(n)) if !n.is_empty() => n.clone(),
                    _ => String::new(),
                })
                .filter(|s| !s.is_empty())
                .collect();
            if !authors.is_empty() {
                meta.authors = Some(authors);
            }
        }
    }

    if let Ok(json) = serde_json::to_string_pretty(&meta) {
        let _ = std::fs::write(dir.join("meta.json"), json);
    }
}
312
313fn collect_pdf_urls(work: &Work) -> Vec<String> {
315 let mut urls = Vec::new();
316
317 if let Some(loc) = &work.best_oa_location {
318 if let Some(url) = &loc.pdf_url {
319 urls.push(url.clone());
320 }
321 }
322 if let Some(loc) = &work.primary_location {
323 if let Some(url) = &loc.pdf_url {
324 if !urls.contains(url) {
325 urls.push(url.clone());
326 }
327 }
328 }
329 if let Some(locations) = &work.locations {
330 for loc in locations {
331 if let Some(url) = &loc.pdf_url {
332 if !urls.contains(url) {
333 urls.push(url.clone());
334 }
335 }
336 }
337 }
338
339 urls
340}
341
/// Summary of a Zotero library item matched to an OpenAlex work.
#[derive(Debug, Clone, Serialize)]
pub struct ZoteroItemInfo {
    /// Zotero item key of the top-level item.
    pub key: String,
    pub item_type: String,
    pub tags: Vec<String>,
    /// True when the item has a stored PDF attachment
    /// (`imported_file`/`imported_url` link mode).
    pub has_pdf: bool,
    pub date_added: Option<String>,
    /// `zotero://select/...` URI for opening the item in the Zotero app.
    pub uri: String,
}
352
/// Looks up `work` in the user's Zotero library, matching by DOI.
///
/// Quick-searches top-level items by the work's display name (or title),
/// then scans the hits for an item whose DOI equals the work's DOI,
/// case-insensitively. Returns `Ok(None)` when the work has no DOI, no
/// title, or no matching item. For a match, the item's children determine
/// `has_pdf` (a stored PDF attachment).
///
/// Timing diagnostics are written to stderr.
pub async fn find_work_in_zotero(
    zotero: &ZoteroClient,
    work: &papers_openalex::Work,
) -> Result<Option<ZoteroItemInfo>, papers_zotero::ZoteroError> {
    // Without a DOI we cannot reliably confirm a match, so bail early.
    let doi = match &work.doi {
        Some(d) => bare_doi(d),
        None => return Ok(None),
    };
    let title = work.display_name.as_deref().or(work.title.as_deref());

    let t_search = std::time::Instant::now();
    let items: Vec<papers_zotero::Item> = if let Some(t) = title {
        let title_params = ItemListParams::builder().q(t).build();
        let res = zotero.list_top_items(&title_params).await?;
        eprintln!("[timing] zotero title search ({} results): {:?}", res.items.len(), t_search.elapsed());
        res.items
    } else {
        eprintln!("[timing] zotero: no title, skipping search");
        return Ok(None);
    };

    for item in &items {
        // Only candidates whose DOI matches case-insensitively count.
        let item_doi = match &item.data.doi {
            Some(d) => d,
            None => continue,
        };
        if !item_doi.eq_ignore_ascii_case(doi) {
            continue;
        }

        let t_children = std::time::Instant::now();
        let children = zotero
            .list_item_children(&item.key, &ItemListParams::default())
            .await?;
        eprintln!("[timing] zotero list_item_children: {:?}", t_children.elapsed());
        // A usable PDF is an attachment stored inside Zotero, not a link
        // to an external file or URL.
        let has_pdf = children.items.iter().any(|child| {
            child.data.content_type.as_deref() == Some("application/pdf")
                && matches!(
                    child.data.link_mode.as_deref(),
                    Some("imported_file" | "imported_url")
                )
        });

        let tags: Vec<String> = item.data.tags.iter().map(|t| t.tag.clone()).collect();
        let uri = format!("zotero://select/library/items/{}", item.key);
        return Ok(Some(ZoteroItemInfo {
            key: item.key.clone(),
            item_type: item.data.item_type.clone(),
            tags,
            has_pdf,
            date_added: item.data.date_added.clone(),
            uri,
        }));
    }

    Ok(None)
}
417
/// Tries to obtain the PDF for `doi` from the user's Zotero library.
///
/// Quick-searches (`qmode=everything`) first by `title` (when given), then
/// by the DOI string. For each hit whose DOI matches case-insensitively,
/// its child attachments are scanned for a stored PDF:
/// 1. prefer reading the file from the local Zotero storage directory;
/// 2. otherwise download the attachment through the Zotero web API.
///
/// On success returns the PDF bytes, the [`PdfSource`] describing where
/// they came from, and the parent item's key; `Ok(None)` when nothing
/// suitable is found.
pub async fn try_zotero(
    zotero: &ZoteroClient,
    doi: &str,
    title: Option<&str>,
) -> Result<Option<(Vec<u8>, PdfSource, String)>, WorkTextError> {
    // Title search first; the DOI string is the fallback query.
    let mut candidate_queries: Vec<String> = Vec::new();
    if let Some(t) = title {
        candidate_queries.push(t.to_string());
    }
    candidate_queries.push(doi.to_string());

    for query in &candidate_queries {
        let params = ItemListParams::builder()
            .q(query.as_str())
            .qmode("everything")
            .build();

        let results = zotero.list_top_items(&params).await?;
        if results.items.is_empty() {
            continue;
        }

        for item in &results.items {
            // Confirm the hit really is this work via DOI equality.
            let item_doi = match &item.data.doi {
                Some(d) => d,
                None => continue,
            };
            if !item_doi.eq_ignore_ascii_case(doi) {
                continue;
            }

            let children = zotero
                .list_item_children(&item.key, &ItemListParams::default())
                .await?;

            for child in &children.items {
                let is_pdf = child
                    .data
                    .content_type
                    .as_deref()
                    == Some("application/pdf");
                // Linked attachments point outside Zotero storage; only
                // imported ones carry retrievable file content.
                let has_local_file = matches!(
                    child.data.link_mode.as_deref(),
                    Some("imported_file" | "imported_url")
                );

                if !is_pdf || !has_local_file {
                    continue;
                }

                // Fast path: read directly from the local Zotero data dir
                // (storage/<attachment key>/<filename>).
                if let Some(filename) = &child.data.filename {
                    if let Some(data_dir) = zotero_data_dir() {
                        let local_path = data_dir
                            .join("storage")
                            .join(&child.key)
                            .join(filename);
                        if local_path.exists() {
                            let bytes = tokio::fs::read(&local_path)
                                .await
                                .map_err(|e| WorkTextError::PdfExtract(format!("Failed to read local file: {e}")))?;
                            return Ok(Some((
                                bytes,
                                PdfSource::ZoteroLocal {
                                    path: local_path.to_string_lossy().into_owned(),
                                },
                                item.key.clone(),
                            )));
                        }
                    }
                }

                // Slow path: download via the web API; skip this child on
                // failure or an empty payload and keep scanning.
                match zotero.download_item_file(&child.key).await {
                    Ok(bytes) if !bytes.is_empty() => {
                        return Ok(Some((
                            bytes,
                            PdfSource::ZoteroRemote {
                                item_key: child.key.clone(),
                            },
                            item.key.clone(),
                        )));
                    }
                    _ => continue,
                }
            }
        }
    }

    Ok(None)
}
519
/// Downloads a PDF from the first candidate URL that works.
///
/// Only URLs passing `is_whitelisted_url` are attempted. A candidate is
/// accepted when the request succeeds, the `Content-Type` header contains
/// `application/pdf`, and the body is non-empty; any other outcome moves
/// on to the next URL. Per-URL network errors are swallowed (best effort),
/// but a failure while reading an accepted response body is propagated.
async fn try_direct_urls(
    http: &reqwest::Client,
    urls: &[String],
) -> Result<Option<(Vec<u8>, PdfSource)>, WorkTextError> {
    for url in urls {
        if !is_whitelisted_url(url) {
            continue;
        }

        // Identify ourselves per polite-crawling convention.
        let resp = http
            .get(url)
            .header(
                "User-Agent",
                "papers-mcp/0.1 (https://github.com/mmgeorge/papers; mailto:papers@example.com)",
            )
            .send()
            .await;

        let resp = match resp {
            Ok(r) if r.status().is_success() => r,
            _ => continue,
        };

        // Guard against landing/abstract pages served with 200: require a
        // PDF content type before downloading the body.
        let is_pdf = resp
            .headers()
            .get("content-type")
            .and_then(|v| v.to_str().ok())
            .is_some_and(|ct| ct.contains("application/pdf"));

        if !is_pdf {
            continue;
        }

        let bytes = resp.bytes().await?.to_vec();
        if !bytes.is_empty() {
            return Ok(Some((
                bytes,
                PdfSource::DirectUrl { url: url.clone() },
            )));
        }
    }

    Ok(None)
}
566
/// Attempts to fetch the work's PDF from the OpenAlex content API.
///
/// Requires both `work.has_content.pdf == Some(true)` and a non-empty
/// `OPENALEX_API_KEY` environment variable; otherwise returns `Ok(None)`.
/// Request failures and empty bodies also yield `Ok(None)` (best effort);
/// only a failure while reading an accepted body is propagated.
///
/// NOTE(review): the API key is sent as a URL query parameter, where it
/// can end up in proxy/server logs — confirm whether a header-based
/// credential is supported instead.
async fn try_openalex_content(
    http: &reqwest::Client,
    work: &Work,
) -> Result<Option<(Vec<u8>, PdfSource)>, WorkTextError> {
    let has_pdf = work
        .has_content
        .as_ref()
        .and_then(|hc| hc.pdf)
        .unwrap_or(false);

    if !has_pdf {
        return Ok(None);
    }

    let api_key = match std::env::var("OPENALEX_API_KEY") {
        Ok(key) if !key.is_empty() => key,
        _ => return Ok(None),
    };

    let short_id = short_openalex_id(&work.id);
    let url = format!(
        "https://content.openalex.org/works/{}.pdf?api_key={}",
        short_id, api_key
    );

    let resp = http.get(&url).send().await;

    let resp = match resp {
        Ok(r) if r.status().is_success() => r,
        _ => return Ok(None),
    };

    let bytes = resp.bytes().await?.to_vec();
    if !bytes.is_empty() {
        return Ok(Some((bytes, PdfSource::OpenAlexContent)));
    }

    Ok(None)
}
607
/// Checks whether `key` has the shape of a Zotero item key:
/// exactly 8 ASCII characters, each an uppercase letter or a digit.
fn is_valid_zotero_key(key: &str) -> bool {
    if key.len() != 8 {
        return false;
    }
    key.chars()
        .all(|c| c.is_ascii_uppercase() || c.is_ascii_digit())
}
612
613fn is_zotero_write_denied(e: &WorkTextError) -> bool {
616 matches!(
617 e,
618 WorkTextError::Zotero(papers_zotero::ZoteroError::Api { status: 403, .. })
619 )
620}
621
/// Builds the canonical zip-backup attachment filename for a parent item:
/// `papers_extract_<parent_key>.zip`.
fn papers_extract_filename(parent_key: &str) -> String {
    let mut name = String::with_capacity(15 + parent_key.len() + 4);
    name.push_str("papers_extract_");
    name.push_str(parent_key);
    name.push_str(".zip");
    name
}
628
629async fn find_papers_zip_key(
631 zc: &ZoteroClient,
632 parent_key: &str,
633) -> Result<Option<String>, WorkTextError> {
634 let expected = papers_extract_filename(parent_key);
635 let children = zc
636 .list_item_children(parent_key, &ItemListParams::default())
637 .await?;
638 for child in &children.items {
639 if child.data.filename.as_deref() == Some(&expected)
640 && child.data.link_mode.as_deref() == Some("imported_file")
641 {
642 return Ok(Some(child.key.clone()));
643 }
644 }
645 Ok(None)
646}
647
/// Packs a cache directory's extraction artifacts into an in-memory zip.
///
/// Includes, when present: `<id>.md`, `<id>.json`, `meta.json`, and every
/// regular file under `images/` (stored under `images/<name>`). Entries
/// are Deflate-compressed. Returns the finished archive bytes.
///
/// # Errors
/// Propagates file-read and zip-writing I/O errors.
fn zip_cache_dir(dir: &std::path::Path, id: &str) -> std::io::Result<Vec<u8>> {
    use std::io::Write as _;
    let buf = Vec::new();
    let cursor = std::io::Cursor::new(buf);
    let mut zip = zip::ZipWriter::new(cursor);
    let opts = zip::write::SimpleFileOptions::default()
        .compression_method(zip::CompressionMethod::Deflated);

    // Markdown extraction output.
    let md_path = dir.join(format!("{id}.md"));
    if md_path.exists() {
        zip.start_file(format!("{id}.md"), opts)?;
        zip.write_all(&std::fs::read(&md_path)?)?;
    }

    // Structured (JSON) extraction output.
    let json_path = dir.join(format!("{id}.json"));
    if json_path.exists() {
        zip.start_file(format!("{id}.json"), opts)?;
        zip.write_all(&std::fs::read(&json_path)?)?;
    }

    // Extraction metadata sidecar.
    let meta_path = dir.join("meta.json");
    if meta_path.exists() {
        zip.start_file("meta.json", opts)?;
        zip.write_all(&std::fs::read(&meta_path)?)?;
    }

    // Extracted figures; only top-level regular files with UTF-8 names.
    let img_dir = dir.join("images");
    if img_dir.is_dir() {
        if let Ok(entries) = std::fs::read_dir(&img_dir) {
            for entry in entries.flatten() {
                let path = entry.path();
                if path.is_file() {
                    if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                        zip.start_file(format!("images/{name}"), opts)?;
                        zip.write_all(&std::fs::read(&path)?)?;
                    }
                }
            }
        }
    }

    // finish() writes the central directory and returns the cursor.
    let cursor = zip.finish()?;
    Ok(cursor.into_inner())
}
697
698fn unzip_to_cache_dir(zip_bytes: &[u8], dir: &std::path::Path) -> std::io::Result<()> {
700 std::fs::create_dir_all(dir)?;
701 let cursor = std::io::Cursor::new(zip_bytes);
702 let mut archive = zip::ZipArchive::new(cursor)
703 .map_err(|e| std::io::Error::other(e.to_string()))?;
704 for i in 0..archive.len() {
705 let mut file = archive
706 .by_index(i)
707 .map_err(|e| std::io::Error::other(e.to_string()))?;
708 let out_path = dir.join(file.name());
709 if let Some(parent) = out_path.parent() {
710 std::fs::create_dir_all(parent)?;
711 }
712 let mut out = std::fs::File::create(&out_path)?;
713 std::io::copy(&mut file, &mut out)?;
714 }
715 Ok(())
716}
717
/// Zips the extraction cache directory for `id` and uploads it to Zotero
/// as a `papers_extract_<id>.zip` attachment under `parent_key`.
///
/// # Errors
/// Returns `PdfExtract` when zipping fails; otherwise propagates Zotero
/// attachment-creation or file-upload errors.
async fn upload_papers_zip(
    zc: &ZoteroClient,
    parent_key: &str,
    dir: &std::path::Path,
    id: &str,
) -> Result<(), WorkTextError> {
    let filename = papers_extract_filename(id);
    let zip_bytes = zip_cache_dir(dir, id).map_err(|e| WorkTextError::PdfExtract(e.to_string()))?;
    // Two-step Zotero upload: register the attachment item, then push bytes.
    let att_key = zc
        .create_imported_attachment(parent_key, &filename, "application/zip")
        .await?;
    zc.upload_attachment_file(&att_key, &filename, zip_bytes)
        .await?;
    Ok(())
}
734
/// Converts PDF bytes to text, preferring a cached or remotely backed-up
/// DataLab extraction over re-processing.
///
/// With `datalab` configured the resolution order is:
/// 1. markdown already present in the local cache directory for
///    `zotero_id` (a background task backfills the Zotero zip backup when
///    it is missing);
/// 2. a `papers_extract_<id>.zip` attachment downloaded from Zotero and
///    unpacked into the local cache;
/// 3. a fresh DataLab conversion, whose outputs (markdown, JSON, images,
///    `meta.json`) are cached locally and uploaded back to Zotero.
///
/// Without `datalab`, falls back to local `pdf_extract` text extraction.
/// `source` is overwritten with `PdfSource::DataLab` whenever DataLab
/// output (cached or fresh) is returned; its original value is recorded in
/// the extraction metadata.
pub async fn do_extract(
    pdf_bytes: Vec<u8>,
    zotero_id: &str,
    zotero: Option<&ZoteroClient>,
    datalab: Option<(&DatalabClient, ProcessingMode)>,
    source: &mut PdfSource,
) -> Result<String, WorkTextError> {
    if let Some((dl, mode)) = datalab {
        // Key validation only matters when we may write back to Zotero;
        // the client itself is not used at this point.
        if let Some(zc) = zotero {
            if !is_valid_zotero_key(zotero_id) {
                return Err(WorkTextError::InvalidZoteroKey(zotero_id.to_string()));
            }
            let _ = zc;
        }

        let cache_dir = datalab_cache_dir(zotero_id);

        // 1. Local cache hit: return the stored markdown immediately.
        if let Some(ref dir) = cache_dir {
            let md_path = dir.join(format!("{zotero_id}.md"));
            if let Ok(text) = std::fs::read_to_string(&md_path) {
                *source = PdfSource::DataLab;
                // Backfill the Zotero zip backup off the critical path;
                // failures are logged (except expected 403s) and ignored.
                if let Some(zc) = zotero {
                    let zc = zc.clone();
                    let dir = dir.clone();
                    let id = zotero_id.to_string();
                    tokio::spawn(async move {
                        match find_papers_zip_key(&zc, &id).await {
                            Ok(None) => {
                                if let Err(e) = upload_papers_zip(&zc, &id, &dir, &id).await {
                                    if !is_zotero_write_denied(&e) {
                                        eprintln!("[papers] Zotero backup upload failed: {e}");
                                    }
                                }
                            }
                            // Backup already exists — nothing to do.
                            Ok(Some(_)) => {}
                            Err(e) => {
                                if !is_zotero_write_denied(&e) {
                                    eprintln!("[papers] Zotero children check failed: {e}");
                                }
                            }
                        }
                    });
                }
                return Ok(text);
            }
        }

        // 2. Remote backup: restore the cache from the Zotero zip, if any.
        if let Some(zc) = zotero {
            if let Ok(Some(att_key)) = find_papers_zip_key(zc, zotero_id).await {
                match zc.download_item_file(&att_key).await {
                    Ok(zip_bytes) if !zip_bytes.is_empty() => {
                        if let Some(ref dir) = cache_dir {
                            if unzip_to_cache_dir(&zip_bytes, dir).is_ok() {
                                let md_path = dir.join(format!("{zotero_id}.md"));
                                if let Ok(text) = std::fs::read_to_string(&md_path) {
                                    *source = PdfSource::DataLab;
                                    return Ok(text);
                                }
                            }
                        }
                    }
                    Err(e) => {
                        return Err(WorkTextError::Zotero(e));
                    }
                    // Empty download: fall through to a fresh conversion.
                    _ => {}
                }
            }
        }

        // 3. Fresh DataLab conversion.
        // ProcessingMode serializes to a plain string; capture it for the
        // metadata sidecar.
        let mode_str_opt = serde_json::to_value(&mode)
            .ok()
            .and_then(|v| v.as_str().map(String::from));
        // Remember where the PDF bytes came from before `source` is
        // rewritten to DataLab below.
        let original_source = source.clone();
        let dl_result = dl
            .convert_document(MarkerRequest {
                file: Some(pdf_bytes),
                filename: Some(format!("{zotero_id}.pdf")),
                output_format: vec![OutputFormat::Markdown, OutputFormat::Json],
                mode,
                ..Default::default()
            })
            .await?;

        *source = PdfSource::DataLab;
        let markdown = dl_result.markdown.clone().unwrap_or_default();

        // Persist all outputs; cache writes are best effort throughout.
        if let Some(ref dir) = cache_dir {
            let _ = std::fs::create_dir_all(dir);

            let md_path = dir.join(format!("{zotero_id}.md"));
            let _ = std::fs::write(&md_path, &markdown);

            if let Some(ref json_val) = dl_result.json {
                let json_path = dir.join(format!("{zotero_id}.json"));
                let _ = std::fs::write(&json_path, json_val.to_string());
            }

            // Images arrive base64-encoded, optionally as data URLs with a
            // `…;base64,` prefix that must be stripped before decoding.
            if let Some(ref images) = dl_result.images {
                if !images.is_empty() {
                    let img_dir = dir.join("images");
                    let _ = std::fs::create_dir_all(&img_dir);
                    for (name, data) in images {
                        let b64 = if let Some(pos) = data.find(";base64,") {
                            &data[pos + 8..]
                        } else {
                            data.as_str()
                        };
                        if let Ok(bytes) = base64::engine::general_purpose::STANDARD.decode(b64) {
                            let img_path = img_dir.join(name);
                            let _ = std::fs::write(&img_path, bytes);
                        }
                    }
                }
            }

            write_extraction_meta(
                dir,
                zotero_id,
                zotero,
                mode_str_opt.as_deref(),
                Some(&original_source),
            )
            .await;

            // Upload the zip backup synchronously (the cache was just
            // written, so the background backfill path will not run).
            if let Some(zc) = zotero {
                if let Err(e) = upload_papers_zip(zc, zotero_id, dir, zotero_id).await {
                    if !is_zotero_write_denied(&e) {
                        eprintln!("[papers] Zotero backup upload failed: {e}");
                    }
                }
            }
        }

        Ok(markdown)
    } else {
        // No DataLab client configured: plain local text extraction.
        extract_text(&pdf_bytes)
    }
}
887
/// Fetches a work from OpenAlex and returns its full text.
///
/// PDF acquisition is tried in order:
/// 1. the user's Zotero library (when a client and a DOI are available);
/// 2. whitelisted open-access `pdf_url`s reported by OpenAlex;
/// 3. the OpenAlex content API.
///
/// The bytes are then converted to text by [`do_extract`] (DataLab
/// markdown when configured, local `pdf_extract` otherwise). The cache id
/// is the Zotero item key in case 1 and the short OpenAlex id otherwise.
///
/// # Errors
/// Fails with `NoPdfFound` when every source comes up empty; otherwise
/// propagates OpenAlex, Zotero, HTTP, and extraction errors.
pub async fn work_text(
    openalex: &OpenAlexClient,
    zotero: Option<&ZoteroClient>,
    datalab: Option<(&DatalabClient, ProcessingMode)>,
    work_id: &str,
) -> Result<WorkTextResult, WorkTextError> {
    let work = crate::api::work_get(openalex, work_id, &GetParams::default()).await?;

    let title = work.title.clone().or_else(|| work.display_name.clone());
    let doi_raw = work.doi.as_deref();
    let doi = doi_raw.map(bare_doi);
    let short_id = short_openalex_id(&work.id);

    let http = reqwest::Client::new();

    // 1. Zotero library (requires both a client and a DOI to match on).
    if let (Some(zotero), Some(doi)) = (zotero, doi) {
        if let Some((bytes, mut source, zotero_key)) = try_zotero(zotero, doi, title.as_deref()).await? {
            let text = do_extract(bytes, &zotero_key, Some(zotero), datalab, &mut source).await?;
            return Ok(WorkTextResult {
                text,
                source,
                work_id: work.id.clone(),
                title,
                doi: doi_raw.map(String::from),
            });
        }
    }

    // 2. Direct download from whitelisted open-access URLs.
    let pdf_urls = collect_pdf_urls(&work);
    if let Some((bytes, mut source)) = try_direct_urls(&http, &pdf_urls).await? {
        let text = do_extract(bytes, short_id, None, datalab, &mut source).await?;
        return Ok(WorkTextResult {
            text,
            source,
            work_id: work.id.clone(),
            title,
            doi: doi_raw.map(String::from),
        });
    }

    // 3. OpenAlex content API (requires an API key).
    if let Some((bytes, mut source)) = try_openalex_content(&http, &work).await? {
        let text = do_extract(bytes, short_id, None, datalab, &mut source).await?;
        return Ok(WorkTextResult {
            text,
            source,
            work_id: work.id.clone(),
            title,
            doi: doi_raw.map(String::from),
        });
    }

    Err(WorkTextError::NoPdfFound {
        work_id: work.id.clone(),
        title,
        doi: doi_raw.map(String::from),
    })
}
961
/// Polls Zotero until a PDF for `doi` appears, then extracts its text.
///
/// Intended for flows where the user was just asked to save the paper into
/// Zotero: waits 5 s for the save to land, then retries [`try_zotero`] up
/// to 55 times with 2 s pauses (roughly two minutes total, plus request
/// time). Text is extracted locally with `pdf_extract`; DataLab is not
/// used on this path.
///
/// # Errors
/// `NoPdfFound` when the item never appears; otherwise propagates Zotero
/// and extraction errors.
pub async fn poll_zotero_for_work(
    zotero: &ZoteroClient,
    work_id: &str,
    title: Option<&str>,
    doi: &str,
) -> Result<WorkTextResult, WorkTextError> {
    // Initial grace period for the Zotero save/sync to complete.
    tokio::time::sleep(std::time::Duration::from_secs(5)).await;

    for _ in 0..55 {
        if let Some((bytes, source, _zotero_key)) = try_zotero(zotero, doi, title).await? {
            let text = extract_text(&bytes)?;
            return Ok(WorkTextResult {
                text,
                source,
                work_id: work_id.to_string(),
                title: title.map(String::from),
                doi: Some(doi.to_string()),
            });
        }
        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
    }

    Err(WorkTextError::NoPdfFound {
        work_id: work_id.to_string(),
        title: title.map(String::from),
        doi: Some(doi.to_string()),
    })
}
995
// Unit tests for the pure, stdlib-only helpers in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // DOI URLs are reduced to the bare identifier; bare DOIs pass through.
    #[test]
    fn test_bare_doi() {
        assert_eq!(bare_doi("https://doi.org/10.1234/test"), "10.1234/test");
        assert_eq!(bare_doi("10.1234/test"), "10.1234/test");
    }

    // Full OpenAlex id URLs collapse to the short `W…` form; short ids
    // are unchanged.
    #[test]
    fn test_short_openalex_id() {
        assert_eq!(
            short_openalex_id("https://openalex.org/W2741809807"),
            "W2741809807"
        );
        assert_eq!(short_openalex_id("W2741809807"), "W2741809807");
    }

    // Whitelisted open-access hosts (including subdomains) are accepted;
    // unknown hosts are rejected.
    #[test]
    fn test_is_whitelisted_url() {
        assert!(is_whitelisted_url("https://arxiv.org/pdf/2301.12345"));
        assert!(is_whitelisted_url(
            "https://europepmc.org/articles/PMC123/pdf"
        ));
        assert!(is_whitelisted_url("https://www.biorxiv.org/content/pdf"));
        assert!(is_whitelisted_url("https://www.mdpi.com/some/pdf"));
        assert!(!is_whitelisted_url("https://evil.com/pdf"));
        assert!(!is_whitelisted_url("https://publisher.com/paper.pdf"));
    }

    // A work with no locations at all yields no candidate URLs.
    #[test]
    fn test_collect_pdf_urls_empty() {
        let work: Work = serde_json::from_str(r#"{"id": "https://openalex.org/W1"}"#).unwrap();
        assert!(collect_pdf_urls(&work).is_empty());
    }

    // Duplicate pdf_urls across best OA / primary / other locations are
    // collapsed while preserving priority order.
    #[test]
    fn test_collect_pdf_urls_deduplicates() {
        let work: Work = serde_json::from_value(serde_json::json!({
            "id": "https://openalex.org/W1",
            "best_oa_location": { "pdf_url": "https://arxiv.org/pdf/1234" },
            "primary_location": { "pdf_url": "https://arxiv.org/pdf/1234" },
            "locations": [
                { "pdf_url": "https://arxiv.org/pdf/1234" },
                { "pdf_url": "https://europepmc.org/pdf/5678" }
            ]
        }))
        .unwrap();
        let urls = collect_pdf_urls(&work);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0], "https://arxiv.org/pdf/1234");
        assert_eq!(urls[1], "https://europepmc.org/pdf/5678");
    }
}