//! Full-text extraction pipeline (`papers_core/text.rs`).
1pub use papers_datalab::ProcessingMode;
2use base64::Engine as _;
3use papers_datalab::{DatalabClient, MarkerRequest, OutputFormat};
4use papers_openalex::{GetParams, OpenAlexClient, Work};
5use papers_zotero::{ItemListParams, ZoteroClient};
6use serde::{Deserialize, Serialize};
7use std::path::PathBuf;
8
/// Where the PDF was obtained from.
///
/// Serialized with a snake_case `type` tag (e.g. `{"type": "zotero_local", ...}`)
/// so it can be embedded in results and cache metadata.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum PdfSource {
    /// PDF read from the local Zotero storage directory; `path` is the file read.
    ZoteroLocal { path: String },
    /// PDF downloaded via the Zotero API; `item_key` is the attachment item's key.
    ZoteroRemote { item_key: String },
    /// PDF fetched directly from a whitelisted URL.
    DirectUrl { url: String },
    /// PDF fetched from the OpenAlex Content API.
    OpenAlexContent,
    /// Text produced by the DataLab Marker service (markdown, not a local PDF parse).
    DataLab,
}
19
/// Result of extracting text from a work's PDF.
#[derive(Debug, Clone, Serialize)]
pub struct WorkTextResult {
    /// The extracted text (plain text or DataLab markdown).
    pub text: String,
    /// Which pipeline step produced the PDF/text.
    pub source: PdfSource,
    /// OpenAlex work identifier.
    pub work_id: String,
    /// Work title, when known.
    pub title: Option<String>,
    /// Work DOI, when known.
    pub doi: Option<String>,
}
29
/// Errors from the work_text pipeline.
#[derive(Debug, thiserror::Error)]
pub enum WorkTextError {
    /// Failure from the OpenAlex API client.
    #[error("OpenAlex error: {0}")]
    OpenAlex(#[from] papers_openalex::OpenAlexError),

    /// Failure from the crate's filter layer.
    #[error("Filter error: {0}")]
    Filter(#[from] crate::filter::FilterError),

    /// Failure from the Zotero API client.
    #[error("Zotero error: {0}")]
    Zotero(#[from] papers_zotero::ZoteroError),

    /// Low-level HTTP failure (direct PDF downloads, content API).
    #[error("HTTP error: {0}")]
    Http(#[from] reqwest::Error),

    /// Local PDF/text handling failure (pdf-extract, cache I/O, zipping).
    #[error("PDF extraction error: {0}")]
    PdfExtract(String),

    /// Failure from the DataLab Marker client, passed through verbatim.
    #[error(transparent)]
    DataLab(#[from] papers_datalab::DatalabError),

    /// Every download source was exhausted without producing a PDF.
    /// The title, when present, is appended to the message in parentheses.
    #[error("No PDF found for work {work_id}{}", title.as_ref().map(|t| format!(" ({})", t)).unwrap_or_default())]
    NoPdfFound {
        work_id: String,
        title: Option<String>,
        doi: Option<String>,
    },

    /// The supplied cache ID is not an 8-char uppercase/digit Zotero key,
    /// but Zotero sync was requested.
    #[error("Invalid Zotero item key: {0}")]
    InvalidZoteroKey(String),
}
61
/// Whitelisted domains for direct PDF download.
///
/// Only URLs matching one of these hosts are fetched directly (see
/// `is_whitelisted_url`); presumably chosen as open-access publishers that
/// serve PDFs without paywalls — extend with care.
const DIRECT_PDF_DOMAINS: &[&str] = &[
    "arxiv.org",
    "europepmc.org",
    "biorxiv.org",
    "medrxiv.org",
    "ncbi.nlm.nih.gov",
    "peerj.com",
    "mdpi.com",
    "frontiersin.org",
    "plos.org",
];
74
/// Extract text from PDF bytes using pdf-extract.
///
/// Public wrapper over the private [`extract_text`] helper so callers outside
/// this module can run the local (non-DataLab) extraction path directly.
pub fn extract_text_bytes(pdf_bytes: &[u8]) -> Result<String, WorkTextError> {
    extract_text(pdf_bytes)
}
79
80fn extract_text(pdf_bytes: &[u8]) -> Result<String, WorkTextError> {
81    pdf_extract::extract_text_from_mem(pdf_bytes)
82        .map_err(|e| WorkTextError::PdfExtract(e.to_string()))
83}
84
/// Strip the `https://doi.org/` prefix from a DOI URL, returning the bare DOI.
fn bare_doi(doi: &str) -> &str {
    match doi.strip_prefix("https://doi.org/") {
        Some(rest) => rest,
        None => doi,
    }
}
89
/// Extract the short OpenAlex ID (e.g. `W12345`) from a full URL.
fn short_openalex_id(full_id: &str) -> &str {
    match full_id.strip_prefix("https://openalex.org/") {
        Some(short) => short,
        None => full_id,
    }
}
96
97/// Check if a URL's host matches one of the whitelisted domains.
98fn is_whitelisted_url(url: &str) -> bool {
99    DIRECT_PDF_DOMAINS
100        .iter()
101        .any(|domain| url.contains(domain))
102}
103
104/// Get the Zotero data directory path.
105fn zotero_data_dir() -> Option<PathBuf> {
106    if let Ok(dir) = std::env::var("ZOTERO_DATA_DIR") {
107        return Some(PathBuf::from(dir));
108    }
109    dirs::home_dir().map(|h| h.join("Zotero"))
110}
111
112fn datalab_cache_dir(short_id: &str) -> Option<PathBuf> {
113    if let Ok(base) = std::env::var("PAPERS_DATALAB_CACHE_DIR") {
114        return Some(PathBuf::from(base).join(short_id));
115    }
116    dirs::cache_dir().map(|d| d.join("papers").join("datalab").join(short_id))
117}
118
119/// Upload the local DataLab cache for `item_key` to Zotero as
120/// `papers_extract_{item_key}.zip` attached to that same item.
121///
122/// The caller is responsible for ensuring the Zotero item exists before calling
123/// this function (callers should check item existence and skip if absent).
124///
125/// Returns an error if there is no local cache for the key or the upload fails.
126pub async fn upload_extraction_to_zotero(
127    zc: &ZoteroClient,
128    item_key: &str,
129) -> Result<(), WorkTextError> {
130    let dir = datalab_cache_dir(item_key)
131        .ok_or_else(|| WorkTextError::PdfExtract("cannot determine cache directory".into()))?;
132    if !dir.join(format!("{item_key}.md")).exists() {
133        return Err(WorkTextError::PdfExtract(format!("no local cache for {item_key}")));
134    }
135    upload_papers_zip(zc, item_key, &dir, item_key).await
136}
137
138/// Download `papers_extract_{item_key}.zip` from Zotero (identified by `att_key`)
139/// and restore it to the local cache directory.
140///
141/// `att_key` is the Zotero key of the attachment item itself (not the parent).
142pub async fn download_extraction_from_zotero(
143    zc: &ZoteroClient,
144    att_key: &str,
145    item_key: &str,
146) -> Result<(), WorkTextError> {
147    let zip_bytes = zc.download_item_file(att_key).await?;
148    if zip_bytes.is_empty() {
149        return Err(WorkTextError::PdfExtract(format!("empty download for {item_key}")));
150    }
151    let dir = datalab_cache_dir(item_key)
152        .ok_or_else(|| WorkTextError::PdfExtract("cannot determine cache directory".into()))?;
153    unzip_to_cache_dir(&zip_bytes, &dir).map_err(|e| WorkTextError::PdfExtract(e.to_string()))
154}
155
156/// Return the cached markdown for `cache_id` if it exists, otherwise `None`.
157pub fn datalab_cached_markdown(cache_id: &str) -> Option<String> {
158    let dir = datalab_cache_dir(cache_id)?;
159    std::fs::read_to_string(dir.join(format!("{cache_id}.md"))).ok()
160}
161
162/// Return the keys of all locally cached DataLab extractions.
163///
164/// Scans the DataLab cache base directory and returns the name of every
165/// subdirectory that contains a `{key}.md` file.
166pub fn datalab_cached_item_keys() -> Vec<String> {
167    let base = if let Ok(base_str) = std::env::var("PAPERS_DATALAB_CACHE_DIR") {
168        PathBuf::from(base_str)
169    } else {
170        match dirs::cache_dir() {
171            Some(d) => d.join("papers").join("datalab"),
172            None => return vec![],
173        }
174    };
175    if !base.is_dir() {
176        return vec![];
177    }
178    let mut keys = Vec::new();
179    if let Ok(entries) = std::fs::read_dir(&base) {
180        for entry in entries.flatten() {
181            let key = match entry.file_name().to_str() {
182                Some(k) => k.to_string(),
183                None => continue,
184            };
185            if entry.path().join(format!("{key}.md")).exists() {
186                keys.push(key);
187            }
188        }
189    }
190    keys
191}
192
193/// Return the cached JSON for `cache_id` if it exists, otherwise `None`.
194pub fn datalab_cached_json(cache_id: &str) -> Option<String> {
195    let dir = datalab_cache_dir(cache_id)?;
196    std::fs::read_to_string(dir.join(format!("{cache_id}.json"))).ok()
197}
198
/// Return the local cache directory path for `cache_id` if determinable.
///
/// Public re-export of the private `datalab_cache_dir` resolution so callers
/// outside this module can locate cache entries (e.g. images) on disk.
pub fn datalab_cache_dir_path(cache_id: &str) -> Option<std::path::PathBuf> {
    datalab_cache_dir(cache_id)
}
203
/// Metadata written alongside each DataLab extraction cache entry as `meta.json`.
///
/// All fields except `item_key` are `Option` so that the struct can be read
/// from older cache entries that may be missing fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionMeta {
    /// Zotero parent item key (or OpenAlex short ID) used as the cache ID.
    pub item_key: String,
    /// Value of `ZOTERO_USER_ID` at extraction time, when set.
    pub zotero_user_id: Option<String>,
    /// Work title, from Zotero item metadata when available.
    pub title: Option<String>,
    /// Author display names ("First Last" style) from Zotero creators.
    pub authors: Option<Vec<String>>,
    /// Zotero item type (e.g. journalArticle).
    pub item_type: Option<String>,
    /// Publication date string as stored in Zotero.
    pub date: Option<String>,
    /// DOI from Zotero item metadata.
    pub doi: Option<String>,
    /// URL from Zotero item metadata.
    pub url: Option<String>,
    /// Journal/venue title from Zotero item metadata.
    pub publication_title: Option<String>,
    /// ISO 8601 UTC timestamp of when the extraction ran.
    pub extracted_at: Option<String>,
    /// DataLab processing mode used, serialized as a string.
    pub processing_mode: Option<String>,
    /// JSON-serialized [`PdfSource`] describing where the PDF came from.
    pub pdf_source: Option<serde_json::Value>,
}
223
224/// Read the `meta.json` for `cache_id` from the local DataLab cache, if present.
225pub fn read_extraction_meta(cache_id: &str) -> Option<ExtractionMeta> {
226    let dir = datalab_cache_dir(cache_id)?;
227    let bytes = std::fs::read(dir.join("meta.json")).ok()?;
228    serde_json::from_slice(&bytes).ok()
229}
230
/// Return an ISO 8601 UTC timestamp for the current moment (no external deps).
fn iso_now() -> String {
    let secs = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_secs())
        .unwrap_or(0);
    iso_from_unix_secs(secs)
}

/// Format `secs` (seconds since the Unix epoch) as `YYYY-MM-DDThh:mm:ssZ`.
///
/// Split out of `iso_now` so the date math is testable with fixed inputs.
/// Civil date derived with Howard Hinnant's `civil_from_days` algorithm;
/// leap seconds are ignored (standard Unix-time behavior).
fn iso_from_unix_secs(secs: u64) -> String {
    // Days since epoch, shifted so the era starts on 0000-03-01.
    let days = (secs / 86_400) as i64;
    let z = days + 719_468;
    let era = if z >= 0 { z } else { z - 146_096 } / 146_097;
    let doe = (z - era * 146_097) as u64; // day-of-era [0, 146096]
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365; // year-of-era [0, 399]
    let y = yoe as i64 + era * 400;
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day-of-year, March-based
    let mp = (5 * doy + 2) / 153; // month index with March = 0
    let d = doy - (153 * mp + 2) / 5 + 1;
    let m = if mp < 10 { mp + 3 } else { mp - 9 };
    // Jan/Feb belong to the next civil year in the March-based calendar.
    let y = if m <= 2 { y + 1 } else { y };
    let hh = (secs % 86_400) / 3_600;
    let mm = (secs % 3_600) / 60;
    let ss = secs % 60;
    format!("{y:04}-{m:02}-{d:02}T{hh:02}:{mm:02}:{ss:02}Z")
}
254
/// Write `meta.json` into `dir` for the given extraction.
///
/// Fetches Zotero item metadata when `zotero` is provided (best-effort; never
/// returns an error — failures are silently ignored).
///
/// `mode_str` is the DataLab processing mode as a string; `pdf_source` is
/// serialized into the metadata so later readers know where the PDF came from.
async fn write_extraction_meta(
    dir: &std::path::Path,
    item_key: &str,
    zotero: Option<&ZoteroClient>,
    mode_str: Option<&str>,
    pdf_source: Option<&PdfSource>,
) {
    // Start with what is known locally; Zotero-derived fields are filled below.
    let mut meta = ExtractionMeta {
        item_key: item_key.to_string(),
        zotero_user_id: std::env::var("ZOTERO_USER_ID").ok(),
        title: None,
        authors: None,
        item_type: None,
        date: None,
        doi: None,
        url: None,
        publication_title: None,
        extracted_at: Some(iso_now()),
        processing_mode: mode_str.map(String::from),
        pdf_source: pdf_source.and_then(|s| serde_json::to_value(s).ok()),
    };

    if let Some(zc) = zotero {
        // Best-effort enrichment: a failed fetch simply leaves the fields None.
        if let Ok(item) = zc.get_item(item_key).await {
            meta.title = item.data.title;
            meta.item_type = Some(item.data.item_type);
            meta.date = item.data.date;
            meta.doi = item.data.doi;
            meta.url = item.data.url;
            meta.publication_title = item.data.publication_title;
            // Prefer "First Last"; fall back to last name alone, then to the
            // combined `name` field (used e.g. for single-field creators).
            let authors: Vec<String> = item
                .data
                .creators
                .iter()
                .filter(|c| c.creator_type == "author")
                .map(|c| match (&c.first_name, &c.last_name, &c.name) {
                    (Some(f), Some(l), _) if !l.is_empty() => format!("{f} {l}"),
                    (_, Some(l), _) if !l.is_empty() => l.clone(),
                    (_, _, Some(n)) if !n.is_empty() => n.clone(),
                    _ => String::new(),
                })
                .filter(|s| !s.is_empty())
                .collect();
            if !authors.is_empty() {
                meta.authors = Some(authors);
            }
        }
    }

    // Serialization or write failures are intentionally swallowed (best-effort).
    if let Ok(json) = serde_json::to_string_pretty(&meta) {
        let _ = std::fs::write(dir.join("meta.json"), json);
    }
}
312
313/// Collect all pdf_url values from an OpenAlex Work's locations.
314fn collect_pdf_urls(work: &Work) -> Vec<String> {
315    let mut urls = Vec::new();
316
317    if let Some(loc) = &work.best_oa_location {
318        if let Some(url) = &loc.pdf_url {
319            urls.push(url.clone());
320        }
321    }
322    if let Some(loc) = &work.primary_location {
323        if let Some(url) = &loc.pdf_url {
324            if !urls.contains(url) {
325                urls.push(url.clone());
326            }
327        }
328    }
329    if let Some(locations) = &work.locations {
330        for loc in locations {
331            if let Some(url) = &loc.pdf_url {
332                if !urls.contains(url) {
333                    urls.push(url.clone());
334                }
335            }
336        }
337    }
338
339    urls
340}
341
/// Brief Zotero library info for a work matched by DOI.
#[derive(Debug, Clone, Serialize)]
pub struct ZoteroItemInfo {
    /// Zotero key of the matched bibliographic item.
    pub key: String,
    /// Zotero item type (e.g. journalArticle).
    pub item_type: String,
    /// Tag names attached to the item.
    pub tags: Vec<String>,
    /// Whether a stored (imported) PDF attachment exists on the item.
    pub has_pdf: bool,
    /// When the item was added to the library, as reported by Zotero.
    pub date_added: Option<String>,
    /// `zotero://select/...` URI for jumping to the item.
    pub uri: String,
}
352
/// Check if a work exists in the Zotero library, matched by DOI.
///
/// Returns `Ok(Some(...))` with brief metadata if found, `Ok(None)` if the
/// work has no DOI or is not in the library, or an error on API failure.
///
/// Emits `[timing]` diagnostics on stderr for the two Zotero API calls.
pub async fn find_work_in_zotero(
    zotero: &ZoteroClient,
    work: &papers_openalex::Work,
) -> Result<Option<ZoteroItemInfo>, papers_zotero::ZoteroError> {
    // Without a DOI there is nothing reliable to match against.
    let doi = match &work.doi {
        Some(d) => bare_doi(d),
        None => return Ok(None),
    };
    let title = work.display_name.as_deref().or(work.title.as_deref());

    // Search by title using the default q mode (title/creator/year only — fast).
    // qmode("everything") would search full-text of attached PDFs, which is very slow.
    // DOI validation is done below on the returned item's metadata, not via full-text search.
    let t_search = std::time::Instant::now();
    let items: Vec<papers_zotero::Item> = if let Some(t) = title {
        let title_params = ItemListParams::builder().q(t).build();
        let res = zotero.list_top_items(&title_params).await?;
        eprintln!("[timing] zotero title search ({} results): {:?}", res.items.len(), t_search.elapsed());
        res.items
    } else {
        eprintln!("[timing] zotero: no title, skipping search");
        return Ok(None);
    };

    for item in &items {
        // Candidates must carry a DOI that matches the work's, case-insensitively;
        // title search alone is not trusted.
        let item_doi = match &item.data.doi {
            Some(d) => d,
            None => continue,
        };
        if !item_doi.eq_ignore_ascii_case(doi) {
            continue;
        }

        // Fetch child attachments to check for a stored PDF. Only imported
        // link modes count — those are files actually stored with the item.
        let t_children = std::time::Instant::now();
        let children = zotero
            .list_item_children(&item.key, &ItemListParams::default())
            .await?;
        eprintln!("[timing] zotero list_item_children: {:?}", t_children.elapsed());
        let has_pdf = children.items.iter().any(|child| {
            child.data.content_type.as_deref() == Some("application/pdf")
                && matches!(
                    child.data.link_mode.as_deref(),
                    Some("imported_file" | "imported_url")
                )
        });

        let tags: Vec<String> = item.data.tags.iter().map(|t| t.tag.clone()).collect();
        let uri = format!("zotero://select/library/items/{}", item.key);
        // First DOI match wins; remaining candidates are ignored.
        return Ok(Some(ZoteroItemInfo {
            key: item.key.clone(),
            item_type: item.data.item_type.clone(),
            tags,
            has_pdf,
            date_added: item.data.date_added.clone(),
            uri,
        }));
    }

    Ok(None)
}
417
418/// Try to find and download a PDF from Zotero (local storage first, then remote API).
419///
420/// Returns `(pdf_bytes, source, zotero_item_key)` where `zotero_item_key` is the
421/// parent bibliographic item key (e.g. `U9PRIZJ7`), suitable for use as a cache ID.
422pub async fn try_zotero(
423    zotero: &ZoteroClient,
424    doi: &str,
425    title: Option<&str>,
426) -> Result<Option<(Vec<u8>, PdfSource, String)>, WorkTextError> {
427    // Zotero API's `q` parameter only searches title, creator, year, and full-text
428    // content — it does NOT search metadata fields like DOI (per Zotero docs:
429    // "Searching of other fields will be possible in the future").
430    // Search by title first, then fall back to DOI (which may match full-text content).
431    let mut candidate_queries: Vec<String> = Vec::new();
432    if let Some(t) = title {
433        candidate_queries.push(t.to_string());
434    }
435    candidate_queries.push(doi.to_string());
436
437    for query in &candidate_queries {
438        let params = ItemListParams::builder()
439            .q(query.as_str())
440            .qmode("everything")
441            .build();
442
443        let results = zotero.list_top_items(&params).await?;
444        if results.items.is_empty() {
445            continue;
446        }
447
448        for item in &results.items {
449            // Check that this item's DOI actually matches
450            let item_doi = match &item.data.doi {
451                Some(d) => d,
452                None => continue,
453            };
454            if !item_doi.eq_ignore_ascii_case(doi) {
455                continue;
456            }
457
458        // Get children to find PDF attachment
459        let children = zotero
460            .list_item_children(&item.key, &ItemListParams::default())
461            .await?;
462
463        for child in &children.items {
464            let is_pdf = child
465                .data
466                .content_type
467                .as_deref()
468                == Some("application/pdf");
469            let has_local_file = matches!(
470                child.data.link_mode.as_deref(),
471                Some("imported_file" | "imported_url")
472            );
473
474            if !is_pdf || !has_local_file {
475                continue;
476            }
477
478            // Try local file first
479            if let Some(filename) = &child.data.filename {
480                if let Some(data_dir) = zotero_data_dir() {
481                    let local_path = data_dir
482                        .join("storage")
483                        .join(&child.key)
484                        .join(filename);
485                    if local_path.exists() {
486                        let bytes = tokio::fs::read(&local_path)
487                            .await
488                            .map_err(|e| WorkTextError::PdfExtract(format!("Failed to read local file: {e}")))?;
489                        return Ok(Some((
490                            bytes,
491                            PdfSource::ZoteroLocal {
492                                path: local_path.to_string_lossy().into_owned(),
493                            },
494                            item.key.clone(),
495                        )));
496                    }
497                }
498            }
499
500            // Try remote download
501            match zotero.download_item_file(&child.key).await {
502                Ok(bytes) if !bytes.is_empty() => {
503                    return Ok(Some((
504                        bytes,
505                        PdfSource::ZoteroRemote {
506                            item_key: child.key.clone(),
507                        },
508                        item.key.clone(),
509                    )));
510                }
511                _ => continue,
512            }
513        }
514        }
515    }
516
517    Ok(None)
518}
519
520/// Try downloading a PDF from direct URLs (whitelisted domains only).
521async fn try_direct_urls(
522    http: &reqwest::Client,
523    urls: &[String],
524) -> Result<Option<(Vec<u8>, PdfSource)>, WorkTextError> {
525    for url in urls {
526        if !is_whitelisted_url(url) {
527            continue;
528        }
529
530        let resp = http
531            .get(url)
532            .header(
533                "User-Agent",
534                "papers-mcp/0.1 (https://github.com/mmgeorge/papers; mailto:papers@example.com)",
535            )
536            .send()
537            .await;
538
539        let resp = match resp {
540            Ok(r) if r.status().is_success() => r,
541            _ => continue,
542        };
543
544        // Verify content type
545        let is_pdf = resp
546            .headers()
547            .get("content-type")
548            .and_then(|v| v.to_str().ok())
549            .is_some_and(|ct| ct.contains("application/pdf"));
550
551        if !is_pdf {
552            continue;
553        }
554
555        let bytes = resp.bytes().await?.to_vec();
556        if !bytes.is_empty() {
557            return Ok(Some((
558                bytes,
559                PdfSource::DirectUrl { url: url.clone() },
560            )));
561        }
562    }
563
564    Ok(None)
565}
566
567/// Try downloading from the OpenAlex Content API.
568async fn try_openalex_content(
569    http: &reqwest::Client,
570    work: &Work,
571) -> Result<Option<(Vec<u8>, PdfSource)>, WorkTextError> {
572    let has_pdf = work
573        .has_content
574        .as_ref()
575        .and_then(|hc| hc.pdf)
576        .unwrap_or(false);
577
578    if !has_pdf {
579        return Ok(None);
580    }
581
582    let api_key = match std::env::var("OPENALEX_API_KEY") {
583        Ok(key) if !key.is_empty() => key,
584        _ => return Ok(None),
585    };
586
587    let short_id = short_openalex_id(&work.id);
588    let url = format!(
589        "https://content.openalex.org/works/{}.pdf?api_key={}",
590        short_id, api_key
591    );
592
593    let resp = http.get(&url).send().await;
594
595    let resp = match resp {
596        Ok(r) if r.status().is_success() => r,
597        _ => return Ok(None),
598    };
599
600    let bytes = resp.bytes().await?.to_vec();
601    if !bytes.is_empty() {
602        return Ok(Some((bytes, PdfSource::OpenAlexContent)));
603    }
604
605    Ok(None)
606}
607
/// Returns true if `key` is a valid Zotero item key (8 ASCII uppercase letters or digits).
fn is_valid_zotero_key(key: &str) -> bool {
    if key.len() != 8 {
        return false;
    }
    key.chars()
        .all(|c| c.is_ascii_uppercase() || c.is_ascii_digit())
}
612
613/// Returns true if the error is a 403 Forbidden from the Zotero API.
614/// Used to silently skip upload attempts when only a read-only API key is available.
615fn is_zotero_write_denied(e: &WorkTextError) -> bool {
616    matches!(
617        e,
618        WorkTextError::Zotero(papers_zotero::ZoteroError::Api { status: 403, .. })
619    )
620}
621
/// The filename used for DataLab extraction backups stored in Zotero.
/// The parent item key is embedded so `extract list` can find all extractions
/// with a single `q=papers_extract` query instead of paginating all attachments.
fn papers_extract_filename(parent_key: &str) -> String {
    ["papers_extract_", parent_key, ".zip"].concat()
}
628
629/// Find the attachment key of the `papers_extract_{parent_key}.zip` child on `parent_key`.
630async fn find_papers_zip_key(
631    zc: &ZoteroClient,
632    parent_key: &str,
633) -> Result<Option<String>, WorkTextError> {
634    let expected = papers_extract_filename(parent_key);
635    let children = zc
636        .list_item_children(parent_key, &ItemListParams::default())
637        .await?;
638    for child in &children.items {
639        if child.data.filename.as_deref() == Some(&expected)
640            && child.data.link_mode.as_deref() == Some("imported_file")
641        {
642            return Ok(Some(child.key.clone()));
643        }
644    }
645    Ok(None)
646}
647
648/// Create an in-memory ZIP of the DataLab cache directory for `id`.
649fn zip_cache_dir(dir: &std::path::Path, id: &str) -> std::io::Result<Vec<u8>> {
650    use std::io::Write as _;
651    let buf = Vec::new();
652    let cursor = std::io::Cursor::new(buf);
653    let mut zip = zip::ZipWriter::new(cursor);
654    let opts = zip::write::SimpleFileOptions::default()
655        .compression_method(zip::CompressionMethod::Deflated);
656
657    // Add {id}.md
658    let md_path = dir.join(format!("{id}.md"));
659    if md_path.exists() {
660        zip.start_file(format!("{id}.md"), opts)?;
661        zip.write_all(&std::fs::read(&md_path)?)?;
662    }
663
664    // Add {id}.json (if present)
665    let json_path = dir.join(format!("{id}.json"));
666    if json_path.exists() {
667        zip.start_file(format!("{id}.json"), opts)?;
668        zip.write_all(&std::fs::read(&json_path)?)?;
669    }
670
671    // Add meta.json (if present)
672    let meta_path = dir.join("meta.json");
673    if meta_path.exists() {
674        zip.start_file("meta.json", opts)?;
675        zip.write_all(&std::fs::read(&meta_path)?)?;
676    }
677
678    // Add images/ (if present)
679    let img_dir = dir.join("images");
680    if img_dir.is_dir() {
681        if let Ok(entries) = std::fs::read_dir(&img_dir) {
682            for entry in entries.flatten() {
683                let path = entry.path();
684                if path.is_file() {
685                    if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
686                        zip.start_file(format!("images/{name}"), opts)?;
687                        zip.write_all(&std::fs::read(&path)?)?;
688                    }
689                }
690            }
691        }
692    }
693
694    let cursor = zip.finish()?;
695    Ok(cursor.into_inner())
696}
697
698/// Extract a ZIP archive into `dir`, creating it first if needed.
699fn unzip_to_cache_dir(zip_bytes: &[u8], dir: &std::path::Path) -> std::io::Result<()> {
700    std::fs::create_dir_all(dir)?;
701    let cursor = std::io::Cursor::new(zip_bytes);
702    let mut archive = zip::ZipArchive::new(cursor)
703        .map_err(|e| std::io::Error::other(e.to_string()))?;
704    for i in 0..archive.len() {
705        let mut file = archive
706            .by_index(i)
707            .map_err(|e| std::io::Error::other(e.to_string()))?;
708        let out_path = dir.join(file.name());
709        if let Some(parent) = out_path.parent() {
710            std::fs::create_dir_all(parent)?;
711        }
712        let mut out = std::fs::File::create(&out_path)?;
713        std::io::copy(&mut file, &mut out)?;
714    }
715    Ok(())
716}
717
718/// Best-effort: zip the cache dir for `id` and upload it as `papers_extract_{id}.zip` under `parent_key`.
719async fn upload_papers_zip(
720    zc: &ZoteroClient,
721    parent_key: &str,
722    dir: &std::path::Path,
723    id: &str,
724) -> Result<(), WorkTextError> {
725    let filename = papers_extract_filename(id);
726    let zip_bytes = zip_cache_dir(dir, id).map_err(|e| WorkTextError::PdfExtract(e.to_string()))?;
727    let att_key = zc
728        .create_imported_attachment(parent_key, &filename, "application/zip")
729        .await?;
730    zc.upload_attachment_file(&att_key, &filename, zip_bytes)
731        .await?;
732    Ok(())
733}
734
/// Extract text from PDF bytes, routing through DataLab if `datalab` is `Some`.
///
/// `zotero_id` is the Zotero parent item key (or OpenAlex short ID for non-Zotero sources)
/// used as the on-disk cache ID. When `zotero` is `Some`, the DataLab result is also
/// backed up to/restored from a `Papers.zip` attachment on the parent Zotero item.
///
/// Resolution order with DataLab enabled:
/// 1. local cache (`{zotero_id}.md` in the DataLab cache dir)
/// 2. the `papers_extract_{id}.zip` backup attached to the Zotero item
/// 3. a fresh DataLab Marker call, whose result is then cached and backed up
///
/// Without DataLab, falls through to local `pdf_extract`-based extraction.
/// `source` is overwritten with `PdfSource::DataLab` whenever DataLab output
/// (cached or fresh) is returned.
pub async fn do_extract(
    pdf_bytes: Vec<u8>,
    zotero_id: &str,
    zotero: Option<&ZoteroClient>,
    datalab: Option<(&DatalabClient, ProcessingMode)>,
    source: &mut PdfSource,
) -> Result<String, WorkTextError> {
    if let Some((dl, mode)) = datalab {
        // Validate key if Zotero sync is requested
        if let Some(zc) = zotero {
            if !is_valid_zotero_key(zotero_id) {
                return Err(WorkTextError::InvalidZoteroKey(zotero_id.to_string()));
            }
            let _ = zc; // used below
        }

        let cache_dir = datalab_cache_dir(zotero_id);

        // --- local cache check ---
        if let Some(ref dir) = cache_dir {
            let md_path = dir.join(format!("{zotero_id}.md"));
            if let Ok(text) = std::fs::read_to_string(&md_path) {
                *source = PdfSource::DataLab;
                // Best-effort: upload to Zotero if no Papers.zip exists yet.
                // Runs detached via tokio::spawn so the cached text returns
                // immediately; the task owns clones of everything it needs.
                if let Some(zc) = zotero {
                    let zc = zc.clone();
                    let dir = dir.clone();
                    let id = zotero_id.to_string();
                    tokio::spawn(async move {
                        match find_papers_zip_key(&zc, &id).await {
                            Ok(None) => {
                                // 403 (read-only API key) is expected and stays quiet.
                                if let Err(e) = upload_papers_zip(&zc, &id, &dir, &id).await {
                                    if !is_zotero_write_denied(&e) {
                                        eprintln!("[papers] Zotero backup upload failed: {e}");
                                    }
                                }
                            }
                            Ok(Some(_)) => {} // already present
                            Err(e) => {
                                if !is_zotero_write_denied(&e) {
                                    eprintln!("[papers] Zotero children check failed: {e}");
                                }
                            }
                        }
                    });
                }
                return Ok(text);
            }
        }

        // --- Zotero cache check (Papers.zip) ---
        // Restore the remote backup into the local cache, then read it back.
        if let Some(zc) = zotero {
            if let Ok(Some(att_key)) = find_papers_zip_key(zc, zotero_id).await {
                match zc.download_item_file(&att_key).await {
                    Ok(zip_bytes) if !zip_bytes.is_empty() => {
                        if let Some(ref dir) = cache_dir {
                            if unzip_to_cache_dir(&zip_bytes, dir).is_ok() {
                                let md_path = dir.join(format!("{zotero_id}.md"));
                                if let Ok(text) = std::fs::read_to_string(&md_path) {
                                    *source = PdfSource::DataLab;
                                    return Ok(text);
                                }
                            }
                        }
                    }
                    Err(e) => {
                        return Err(WorkTextError::Zotero(e));
                    }
                    // Empty download: fall through to a fresh DataLab call.
                    _ => {}
                }
            }
        }

        // --- DataLab API call ---
        // Capture mode string and original source before they are moved/overwritten.
        let mode_str_opt = serde_json::to_value(&mode)
            .ok()
            .and_then(|v| v.as_str().map(String::from));
        let original_source = source.clone();
        let dl_result = dl
            .convert_document(MarkerRequest {
                file: Some(pdf_bytes),
                filename: Some(format!("{zotero_id}.pdf")),
                output_format: vec![OutputFormat::Markdown, OutputFormat::Json],
                mode,
                ..Default::default()
            })
            .await?;

        *source = PdfSource::DataLab;
        let markdown = dl_result.markdown.clone().unwrap_or_default();

        // --- write local cache (best-effort) ---
        // All filesystem errors below are deliberately ignored; caching must
        // never fail an extraction that already succeeded.
        if let Some(ref dir) = cache_dir {
            let _ = std::fs::create_dir_all(dir);

            let md_path = dir.join(format!("{zotero_id}.md"));
            let _ = std::fs::write(&md_path, &markdown);

            if let Some(ref json_val) = dl_result.json {
                let json_path = dir.join(format!("{zotero_id}.json"));
                let _ = std::fs::write(&json_path, json_val.to_string());
            }

            // Images arrive as (name, base64 data) pairs, optionally with a
            // data-URL prefix ending in ";base64," that must be stripped.
            if let Some(ref images) = dl_result.images {
                if !images.is_empty() {
                    let img_dir = dir.join("images");
                    let _ = std::fs::create_dir_all(&img_dir);
                    for (name, data) in images {
                        let b64 = if let Some(pos) = data.find(";base64,") {
                            &data[pos + 8..]
                        } else {
                            data.as_str()
                        };
                        if let Ok(bytes) = base64::engine::general_purpose::STANDARD.decode(b64) {
                            let img_path = img_dir.join(name);
                            let _ = std::fs::write(&img_path, bytes);
                        }
                    }
                }
            }

            // Write meta.json (best-effort, before upload so it's included in the ZIP)
            write_extraction_meta(
                dir,
                zotero_id,
                zotero,
                mode_str_opt.as_deref(),
                Some(&original_source),
            )
            .await;

            // Best-effort: upload Papers.zip to Zotero (silently skip on 403)
            if let Some(zc) = zotero {
                if let Err(e) = upload_papers_zip(zc, zotero_id, dir, zotero_id).await {
                    if !is_zotero_write_denied(&e) {
                        eprintln!("[papers] Zotero backup upload failed: {e}");
                    }
                }
            }
        }

        Ok(markdown)
    } else {
        extract_text(&pdf_bytes)
    }
}
887
888/// Download and extract the full text of a scholarly work.
889///
890/// Tries multiple sources in priority order:
891/// 1. Local Zotero storage (filesystem)
892/// 2. Remote Zotero API (if credentials available)
893/// 3. Direct PDF URLs from OpenAlex locations (whitelisted domains)
894/// 4. OpenAlex Content API (requires `OPENALEX_API_KEY`)
895///
896/// When `datalab` is `Some`, the final extraction step uses the DataLab Marker
897/// API instead of local pdfium extraction, producing higher-quality markdown.
898/// The `ProcessingMode` controls quality vs. speed: `Fast` < `Balanced` < `Accurate`.
899pub async fn work_text(
900    openalex: &OpenAlexClient,
901    zotero: Option<&ZoteroClient>,
902    datalab: Option<(&DatalabClient, ProcessingMode)>,
903    work_id: &str,
904) -> Result<WorkTextResult, WorkTextError> {
905    // 1. Fetch work metadata from OpenAlex
906    let work = crate::api::work_get(openalex, work_id, &GetParams::default()).await?;
907
908    let title = work.title.clone().or_else(|| work.display_name.clone());
909    let doi_raw = work.doi.as_deref();
910    let doi = doi_raw.map(bare_doi);
911    let short_id = short_openalex_id(&work.id);
912
913    let http = reqwest::Client::new();
914
915    // 2. Try Zotero (local then remote)
916    if let (Some(zotero), Some(doi)) = (zotero, doi) {
917        if let Some((bytes, mut source, zotero_key)) = try_zotero(zotero, doi, title.as_deref()).await? {
918            let text = do_extract(bytes, &zotero_key, Some(zotero), datalab, &mut source).await?;
919            return Ok(WorkTextResult {
920                text,
921                source,
922                work_id: work.id.clone(),
923                title,
924                doi: doi_raw.map(String::from),
925            });
926        }
927    }
928
929    // 3. Try direct PDF URLs from OpenAlex locations
930    let pdf_urls = collect_pdf_urls(&work);
931    if let Some((bytes, mut source)) = try_direct_urls(&http, &pdf_urls).await? {
932        let text = do_extract(bytes, short_id, None, datalab, &mut source).await?;
933        return Ok(WorkTextResult {
934            text,
935            source,
936            work_id: work.id.clone(),
937            title,
938            doi: doi_raw.map(String::from),
939        });
940    }
941
942    // 4. Try OpenAlex Content API
943    if let Some((bytes, mut source)) = try_openalex_content(&http, &work).await? {
944        let text = do_extract(bytes, short_id, None, datalab, &mut source).await?;
945        return Ok(WorkTextResult {
946            text,
947            source,
948            work_id: work.id.clone(),
949            title,
950            doi: doi_raw.map(String::from),
951        });
952    }
953
954    // 5. No PDF found
955    Err(WorkTextError::NoPdfFound {
956        work_id: work.id.clone(),
957        title,
958        doi: doi_raw.map(String::from),
959    })
960}
961
962/// Poll Zotero for a work by DOI. Waits 5s initially, then polls every 2s for up to ~2 min.
963///
964/// This is used by callers (CLI prompt, MCP elicitation) after asking the user to add a paper
965/// to Zotero. Returns the extracted text if the paper appears in Zotero within the timeout.
966pub async fn poll_zotero_for_work(
967    zotero: &ZoteroClient,
968    work_id: &str,
969    title: Option<&str>,
970    doi: &str,
971) -> Result<WorkTextResult, WorkTextError> {
972    // Initial wait to give user time to save
973    tokio::time::sleep(std::time::Duration::from_secs(5)).await;
974
975    for _ in 0..55 {
976        if let Some((bytes, source, _zotero_key)) = try_zotero(zotero, doi, title).await? {
977            let text = extract_text(&bytes)?;
978            return Ok(WorkTextResult {
979                text,
980                source,
981                work_id: work_id.to_string(),
982                title: title.map(String::from),
983                doi: Some(doi.to_string()),
984            });
985        }
986        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
987    }
988
989    Err(WorkTextError::NoPdfFound {
990        work_id: work_id.to_string(),
991        title: title.map(String::from),
992        doi: Some(doi.to_string()),
993    })
994}
995
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bare_doi() {
        // Both the full URL form and the already-bare form normalize to the bare DOI.
        let cases = [
            ("https://doi.org/10.1234/test", "10.1234/test"),
            ("10.1234/test", "10.1234/test"),
        ];
        for (input, expected) in cases {
            assert_eq!(bare_doi(input), expected);
        }
    }

    #[test]
    fn test_short_openalex_id() {
        let cases = [
            ("https://openalex.org/W2741809807", "W2741809807"),
            ("W2741809807", "W2741809807"),
        ];
        for (input, expected) in cases {
            assert_eq!(short_openalex_id(input), expected);
        }
    }

    #[test]
    fn test_is_whitelisted_url() {
        // Whitelisted hosts (including www. subdomains) are accepted.
        let allowed = [
            "https://arxiv.org/pdf/2301.12345",
            "https://europepmc.org/articles/PMC123/pdf",
            "https://www.biorxiv.org/content/pdf",
            "https://www.mdpi.com/some/pdf",
        ];
        for url in allowed {
            assert!(is_whitelisted_url(url), "expected whitelisted: {url}");
        }

        // Everything else is rejected, even if it looks like a PDF link.
        let denied = ["https://evil.com/pdf", "https://publisher.com/paper.pdf"];
        for url in denied {
            assert!(!is_whitelisted_url(url), "expected rejected: {url}");
        }
    }

    #[test]
    fn test_collect_pdf_urls_empty() {
        // A work with no locations at all yields no candidate URLs.
        let bare_work: Work =
            serde_json::from_str(r#"{"id": "https://openalex.org/W1"}"#).unwrap();
        assert!(collect_pdf_urls(&bare_work).is_empty());
    }

    #[test]
    fn test_collect_pdf_urls_deduplicates() {
        // The same URL appearing in best_oa / primary / locations is kept once,
        // preserving first-seen order.
        let work: Work = serde_json::from_value(serde_json::json!({
            "id": "https://openalex.org/W1",
            "best_oa_location": { "pdf_url": "https://arxiv.org/pdf/1234" },
            "primary_location": { "pdf_url": "https://arxiv.org/pdf/1234" },
            "locations": [
                { "pdf_url": "https://arxiv.org/pdf/1234" },
                { "pdf_url": "https://europepmc.org/pdf/5678" }
            ]
        }))
        .unwrap();

        let urls = collect_pdf_urls(&work);
        assert_eq!(
            urls,
            vec![
                "https://arxiv.org/pdf/1234".to_string(),
                "https://europepmc.org/pdf/5678".to_string(),
            ]
        );
    }
}
1050}