// papers_core/text.rs
1pub use papers_datalab::ProcessingMode;
2use papers_datalab::{DatalabClient, MarkerRequest, OutputFormat};
3use papers_openalex::{GetParams, OpenAlexClient, Work};
4use papers_zotero::{ItemListParams, ZoteroClient};
5use serde::Serialize;
6use std::path::PathBuf;
7
/// Where the PDF was obtained from.
///
/// Serialized with a `"type"` tag in `snake_case` (e.g. `"zotero_local"`),
/// so consumers of [`WorkTextResult`] can tell how the text was sourced.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum PdfSource {
    /// PDF read directly from the local Zotero storage directory.
    ZoteroLocal { path: String },
    /// PDF downloaded through the Zotero web API, keyed by attachment item.
    ZoteroRemote { item_key: String },
    /// PDF downloaded from a whitelisted publisher URL.
    DirectUrl { url: String },
    /// PDF fetched from the OpenAlex Content API.
    OpenAlexContent,
    /// Text produced by the DataLab Marker API (set in `do_extract`, replacing
    /// the original download source when DataLab routing is enabled).
    DataLab,
}
18
/// Result of extracting text from a work's PDF.
#[derive(Debug, Clone, Serialize)]
pub struct WorkTextResult {
    /// Extracted plain text (or markdown, when DataLab extraction was used).
    pub text: String,
    /// Where the PDF (or extracted text) came from.
    pub source: PdfSource,
    /// Full OpenAlex work ID URL (e.g. `https://openalex.org/W...`).
    pub work_id: String,
    /// Work title, if OpenAlex provided one.
    pub title: Option<String>,
    /// DOI as returned by OpenAlex (full `https://doi.org/...` URL form).
    pub doi: Option<String>,
}
28
/// Errors from the work_text pipeline.
#[derive(Debug, thiserror::Error)]
pub enum WorkTextError {
    /// Failure fetching work metadata from the OpenAlex API.
    #[error("OpenAlex error: {0}")]
    OpenAlex(#[from] papers_openalex::OpenAlexError),

    /// Failure from the crate's filter layer (propagated from `work_get`).
    #[error("Filter error: {0}")]
    Filter(#[from] crate::filter::FilterError),

    /// Failure talking to the Zotero API.
    #[error("Zotero error: {0}")]
    Zotero(#[from] papers_zotero::ZoteroError),

    /// Transport-level failure downloading a PDF.
    #[error("HTTP error: {0}")]
    Http(#[from] reqwest::Error),

    /// Local PDF parsing/extraction failed (also used for local-file read errors).
    #[error("PDF extraction error: {0}")]
    PdfExtract(String),

    /// Failure from the DataLab Marker API (message passed through verbatim).
    #[error(transparent)]
    DataLab(#[from] papers_datalab::DatalabError),

    /// Every source was tried and none produced a PDF.
    /// The display string appends the title in parentheses when known.
    #[error("No PDF found for work {work_id}{}", title.as_ref().map(|t| format!(" ({})", t)).unwrap_or_default())]
    NoPdfFound {
        work_id: String,
        title: Option<String>,
        doi: Option<String>,
    },
}
57
/// Whitelisted domains for direct PDF download.
///
/// Only URLs on these hosts are fetched directly by `try_direct_urls`.
/// NOTE(review): presumably chosen as open-access hosts that tolerate
/// programmatic downloads — confirm before extending the list.
const DIRECT_PDF_DOMAINS: &[&str] = &[
    "arxiv.org",
    "europepmc.org",
    "biorxiv.org",
    "medrxiv.org",
    "ncbi.nlm.nih.gov",
    "peerj.com",
    "mdpi.com",
    "frontiersin.org",
    "plos.org",
];
70
/// Extract text from PDF bytes using pdf-extract.
///
/// Public wrapper around the private `extract_text` helper. Parse failures
/// are surfaced as [`WorkTextError::PdfExtract`] with the underlying message.
pub fn extract_text_bytes(pdf_bytes: &[u8]) -> Result<String, WorkTextError> {
    extract_text(pdf_bytes)
}
75
76fn extract_text(pdf_bytes: &[u8]) -> Result<String, WorkTextError> {
77    pdf_extract::extract_text_from_mem(pdf_bytes)
78        .map_err(|e| WorkTextError::PdfExtract(e.to_string()))
79}
80
/// Strip the `https://doi.org/` prefix from a DOI URL, returning the bare DOI.
///
/// Input that does not carry the prefix is returned unchanged.
fn bare_doi(doi: &str) -> &str {
    match doi.strip_prefix("https://doi.org/") {
        Some(stripped) => stripped,
        None => doi,
    }
}
85
/// Extract the short OpenAlex ID (e.g. `W12345`) from a full URL.
///
/// IDs that are already in short form pass through unchanged.
fn short_openalex_id(full_id: &str) -> &str {
    match full_id.strip_prefix("https://openalex.org/") {
        Some(short) => short,
        None => full_id,
    }
}
92
93/// Check if a URL's host matches one of the whitelisted domains.
94fn is_whitelisted_url(url: &str) -> bool {
95    DIRECT_PDF_DOMAINS
96        .iter()
97        .any(|domain| url.contains(domain))
98}
99
100/// Get the Zotero data directory path.
101fn zotero_data_dir() -> Option<PathBuf> {
102    if let Ok(dir) = std::env::var("ZOTERO_DATA_DIR") {
103        return Some(PathBuf::from(dir));
104    }
105    dirs::home_dir().map(|h| h.join("Zotero"))
106}
107
108/// Collect all pdf_url values from an OpenAlex Work's locations.
109fn collect_pdf_urls(work: &Work) -> Vec<String> {
110    let mut urls = Vec::new();
111
112    if let Some(loc) = &work.best_oa_location {
113        if let Some(url) = &loc.pdf_url {
114            urls.push(url.clone());
115        }
116    }
117    if let Some(loc) = &work.primary_location {
118        if let Some(url) = &loc.pdf_url {
119            if !urls.contains(url) {
120                urls.push(url.clone());
121            }
122        }
123    }
124    if let Some(locations) = &work.locations {
125        for loc in locations {
126            if let Some(url) = &loc.pdf_url {
127                if !urls.contains(url) {
128                    urls.push(url.clone());
129                }
130            }
131        }
132    }
133
134    urls
135}
136
/// Brief Zotero library info for a work matched by DOI.
#[derive(Debug, Clone, Serialize)]
pub struct ZoteroItemInfo {
    /// Zotero item key of the matched top-level item.
    pub key: String,
    /// Zotero item type (e.g. `journalArticle`).
    pub item_type: String,
    /// Names of all tags attached to the item.
    pub tags: Vec<String>,
    /// Whether the item has a locally-stored PDF attachment
    /// (link mode `imported_file` or `imported_url`).
    pub has_pdf: bool,
    /// Timestamp the item was added to the library, as reported by Zotero.
    pub date_added: Option<String>,
    /// `zotero://select/...` URI for opening the item in the Zotero app.
    pub uri: String,
}
147
/// Check if a work exists in the Zotero library, matched by DOI.
///
/// Returns `Ok(Some(...))` with brief metadata if found, `Ok(None)` if the
/// work has no DOI or is not in the library, or an error on API failure.
///
/// Strategy: search top-level items by title (fast, metadata-only `q` mode),
/// then confirm a true match by comparing the candidate's DOI field
/// case-insensitively against the work's bare DOI.
///
/// NOTE(review): the `eprintln!` calls emit timing diagnostics to stderr on
/// every invocation — presumably deliberate instrumentation; confirm before
/// shipping in a quiet library context.
pub async fn find_work_in_zotero(
    zotero: &ZoteroClient,
    work: &papers_openalex::Work,
) -> Result<Option<ZoteroItemInfo>, papers_zotero::ZoteroError> {
    // No DOI → no reliable match key; bail early.
    let doi = match &work.doi {
        Some(d) => bare_doi(d),
        None => return Ok(None),
    };
    let title = work.display_name.as_deref().or(work.title.as_deref());

    // Search by title using the default q mode (title/creator/year only — fast).
    // qmode("everything") would search full-text of attached PDFs, which is very slow.
    // DOI validation is done below on the returned item's metadata, not via full-text search.
    let t_search = std::time::Instant::now();
    let items: Vec<papers_zotero::Item> = if let Some(t) = title {
        let title_params = ItemListParams::builder().q(t).build();
        let res = zotero.list_top_items(&title_params).await?;
        eprintln!("[timing] zotero title search ({} results): {:?}", res.items.len(), t_search.elapsed());
        res.items
    } else {
        // Without a title there is nothing to search on.
        eprintln!("[timing] zotero: no title, skipping search");
        return Ok(None);
    };

    for item in &items {
        // Only accept candidates whose DOI field matches exactly (ignoring case).
        let item_doi = match &item.data.doi {
            Some(d) => d,
            None => continue,
        };
        if !item_doi.eq_ignore_ascii_case(doi) {
            continue;
        }

        // Inspect the item's attachments to see if a stored PDF exists.
        let t_children = std::time::Instant::now();
        let children = zotero
            .list_item_children(&item.key, &ItemListParams::default())
            .await?;
        eprintln!("[timing] zotero list_item_children: {:?}", t_children.elapsed());
        // "Stored" means an imported attachment, not a linked URL.
        let has_pdf = children.items.iter().any(|child| {
            child.data.content_type.as_deref() == Some("application/pdf")
                && matches!(
                    child.data.link_mode.as_deref(),
                    Some("imported_file" | "imported_url")
                )
        });

        let tags: Vec<String> = item.data.tags.iter().map(|t| t.tag.clone()).collect();
        let uri = format!("zotero://select/library/items/{}", item.key);
        // First DOI match wins; Zotero libraries rarely hold duplicates.
        return Ok(Some(ZoteroItemInfo {
            key: item.key.clone(),
            item_type: item.data.item_type.clone(),
            tags,
            has_pdf,
            date_added: item.data.date_added.clone(),
            uri,
        }));
    }

    Ok(None)
}
212
/// Try to find and download a PDF from Zotero (local storage first, then remote API).
///
/// Search order: by title (when available), then by DOI. Each candidate item
/// is validated by an exact case-insensitive DOI comparison before its
/// attachments are considered. For each PDF attachment, the local Zotero
/// storage directory is checked first; the remote file download is the
/// fallback.
///
/// Returns `Ok(None)` when no matching item with a retrievable PDF exists.
pub async fn try_zotero(
    zotero: &ZoteroClient,
    doi: &str,
    title: Option<&str>,
) -> Result<Option<(Vec<u8>, PdfSource)>, WorkTextError> {
    // Zotero API's `q` parameter only searches title, creator, year, and full-text
    // content — it does NOT search metadata fields like DOI (per Zotero docs:
    // "Searching of other fields will be possible in the future").
    // Search by title first, then fall back to DOI (which may match full-text content).
    let mut candidate_queries: Vec<String> = Vec::new();
    if let Some(t) = title {
        candidate_queries.push(t.to_string());
    }
    candidate_queries.push(doi.to_string());

    for query in &candidate_queries {
        // qmode("everything") includes attachment full-text, so a DOI printed
        // inside a PDF can still surface the parent item.
        let params = ItemListParams::builder()
            .q(query.as_str())
            .qmode("everything")
            .build();

        let results = zotero.list_top_items(&params).await?;
        if results.items.is_empty() {
            continue;
        }

        // NOTE(review): the loop body below is under-indented relative to this
        // `for` — run rustfmt; the brace structure itself is correct.
        for item in &results.items {
            // Check that this item's DOI actually matches
            let item_doi = match &item.data.doi {
                Some(d) => d,
                None => continue,
            };
            if !item_doi.eq_ignore_ascii_case(doi) {
                continue;
            }

        // Get children to find PDF attachment
        let children = zotero
            .list_item_children(&item.key, &ItemListParams::default())
            .await?;

        for child in &children.items {
            let is_pdf = child
                .data
                .content_type
                .as_deref()
                == Some("application/pdf");
            // Only imported attachments have a file Zotero can serve/store.
            let has_local_file = matches!(
                child.data.link_mode.as_deref(),
                Some("imported_file" | "imported_url")
            );

            if !is_pdf || !has_local_file {
                continue;
            }

            // Try local file first: Zotero stores attachments under
            // <data_dir>/storage/<attachment_key>/<filename>.
            if let Some(filename) = &child.data.filename {
                if let Some(data_dir) = zotero_data_dir() {
                    let local_path = data_dir
                        .join("storage")
                        .join(&child.key)
                        .join(filename);
                    if local_path.exists() {
                        let bytes = tokio::fs::read(&local_path)
                            .await
                            .map_err(|e| WorkTextError::PdfExtract(format!("Failed to read local file: {e}")))?;
                        return Ok(Some((
                            bytes,
                            PdfSource::ZoteroLocal {
                                path: local_path.to_string_lossy().into_owned(),
                            },
                        )));
                    }
                }
            }

            // Try remote download; a failed or empty download falls through to
            // the next attachment rather than aborting the whole search.
            match zotero.download_item_file(&child.key).await {
                Ok(bytes) if !bytes.is_empty() => {
                    return Ok(Some((
                        bytes,
                        PdfSource::ZoteroRemote {
                            item_key: child.key.clone(),
                        },
                    )));
                }
                _ => continue,
            }
        }
        }
    }

    Ok(None)
}
309
310/// Try downloading a PDF from direct URLs (whitelisted domains only).
311async fn try_direct_urls(
312    http: &reqwest::Client,
313    urls: &[String],
314) -> Result<Option<(Vec<u8>, PdfSource)>, WorkTextError> {
315    for url in urls {
316        if !is_whitelisted_url(url) {
317            continue;
318        }
319
320        let resp = http
321            .get(url)
322            .header(
323                "User-Agent",
324                "papers-mcp/0.1 (https://github.com/mmgeorge/papers; mailto:papers@example.com)",
325            )
326            .send()
327            .await;
328
329        let resp = match resp {
330            Ok(r) if r.status().is_success() => r,
331            _ => continue,
332        };
333
334        // Verify content type
335        let is_pdf = resp
336            .headers()
337            .get("content-type")
338            .and_then(|v| v.to_str().ok())
339            .is_some_and(|ct| ct.contains("application/pdf"));
340
341        if !is_pdf {
342            continue;
343        }
344
345        let bytes = resp.bytes().await?.to_vec();
346        if !bytes.is_empty() {
347            return Ok(Some((
348                bytes,
349                PdfSource::DirectUrl { url: url.clone() },
350            )));
351        }
352    }
353
354    Ok(None)
355}
356
357/// Try downloading from the OpenAlex Content API.
358async fn try_openalex_content(
359    http: &reqwest::Client,
360    work: &Work,
361) -> Result<Option<(Vec<u8>, PdfSource)>, WorkTextError> {
362    let has_pdf = work
363        .has_content
364        .as_ref()
365        .and_then(|hc| hc.pdf)
366        .unwrap_or(false);
367
368    if !has_pdf {
369        return Ok(None);
370    }
371
372    let api_key = match std::env::var("OPENALEX_API_KEY") {
373        Ok(key) if !key.is_empty() => key,
374        _ => return Ok(None),
375    };
376
377    let short_id = short_openalex_id(&work.id);
378    let url = format!(
379        "https://content.openalex.org/works/{}.pdf?api_key={}",
380        short_id, api_key
381    );
382
383    let resp = http.get(&url).send().await;
384
385    let resp = match resp {
386        Ok(r) if r.status().is_success() => r,
387        _ => return Ok(None),
388    };
389
390    let bytes = resp.bytes().await?.to_vec();
391    if !bytes.is_empty() {
392        return Ok(Some((bytes, PdfSource::OpenAlexContent)));
393    }
394
395    Ok(None)
396}
397
398/// Extract text from PDF bytes, routing through DataLab if `datalab` is `Some`.
399async fn do_extract(
400    pdf_bytes: Vec<u8>,
401    short_id: &str,
402    datalab: Option<(&DatalabClient, ProcessingMode)>,
403    source: &mut PdfSource,
404) -> Result<String, WorkTextError> {
405    if let Some((dl, mode)) = datalab {
406        let dl_result = dl
407            .convert_document(MarkerRequest {
408                file: Some(pdf_bytes),
409                filename: Some(format!("{}.pdf", short_id)),
410                output_format: OutputFormat::Markdown,
411                mode,
412                ..Default::default()
413            })
414            .await?;
415        *source = PdfSource::DataLab;
416        Ok(dl_result.markdown.unwrap_or_default())
417    } else {
418        extract_text(&pdf_bytes)
419    }
420}
421
422/// Download and extract the full text of a scholarly work.
423///
424/// Tries multiple sources in priority order:
425/// 1. Local Zotero storage (filesystem)
426/// 2. Remote Zotero API (if credentials available)
427/// 3. Direct PDF URLs from OpenAlex locations (whitelisted domains)
428/// 4. OpenAlex Content API (requires `OPENALEX_API_KEY`)
429///
430/// When `datalab` is `Some`, the final extraction step uses the DataLab Marker
431/// API instead of local pdfium extraction, producing higher-quality markdown.
432/// The `ProcessingMode` controls quality vs. speed: `Fast` < `Balanced` < `Accurate`.
433pub async fn work_text(
434    openalex: &OpenAlexClient,
435    zotero: Option<&ZoteroClient>,
436    datalab: Option<(&DatalabClient, ProcessingMode)>,
437    work_id: &str,
438) -> Result<WorkTextResult, WorkTextError> {
439    // 1. Fetch work metadata from OpenAlex
440    let work = crate::api::work_get(openalex, work_id, &GetParams::default()).await?;
441
442    let title = work.title.clone().or_else(|| work.display_name.clone());
443    let doi_raw = work.doi.as_deref();
444    let doi = doi_raw.map(bare_doi);
445    let short_id = short_openalex_id(&work.id);
446
447    let http = reqwest::Client::new();
448
449    // 2. Try Zotero (local then remote)
450    if let (Some(zotero), Some(doi)) = (zotero, doi) {
451        if let Some((bytes, mut source)) = try_zotero(zotero, doi, title.as_deref()).await? {
452            let text = do_extract(bytes, short_id, datalab, &mut source).await?;
453            return Ok(WorkTextResult {
454                text,
455                source,
456                work_id: work.id.clone(),
457                title,
458                doi: doi_raw.map(String::from),
459            });
460        }
461    }
462
463    // 3. Try direct PDF URLs from OpenAlex locations
464    let pdf_urls = collect_pdf_urls(&work);
465    if let Some((bytes, mut source)) = try_direct_urls(&http, &pdf_urls).await? {
466        let text = do_extract(bytes, short_id, datalab, &mut source).await?;
467        return Ok(WorkTextResult {
468            text,
469            source,
470            work_id: work.id.clone(),
471            title,
472            doi: doi_raw.map(String::from),
473        });
474    }
475
476    // 4. Try OpenAlex Content API
477    if let Some((bytes, mut source)) = try_openalex_content(&http, &work).await? {
478        let text = do_extract(bytes, short_id, datalab, &mut source).await?;
479        return Ok(WorkTextResult {
480            text,
481            source,
482            work_id: work.id.clone(),
483            title,
484            doi: doi_raw.map(String::from),
485        });
486    }
487
488    // 5. No PDF found
489    Err(WorkTextError::NoPdfFound {
490        work_id: work.id.clone(),
491        title,
492        doi: doi_raw.map(String::from),
493    })
494}
495
/// Poll Zotero for a work by DOI. Waits 5s initially, then polls every 2s for up to ~2 min.
///
/// This is used by callers (CLI prompt, MCP elicitation) after asking the user to add a paper
/// to Zotero. Returns the extracted text if the paper appears in Zotero within the timeout.
///
/// Total budget: 5s initial sleep + 55 polls with a 2s sleep after each miss
/// (≈ 2 minutes, excluding the search calls themselves).
/// Note: extraction here always uses the local pdf-extract path — DataLab
/// routing is not available through this entry point.
pub async fn poll_zotero_for_work(
    zotero: &ZoteroClient,
    work_id: &str,
    title: Option<&str>,
    doi: &str,
) -> Result<WorkTextResult, WorkTextError> {
    // Initial wait to give user time to save
    tokio::time::sleep(std::time::Duration::from_secs(5)).await;

    for _ in 0..55 {
        if let Some((bytes, source)) = try_zotero(zotero, doi, title).await? {
            let text = extract_text(&bytes)?;
            return Ok(WorkTextResult {
                text,
                source,
                work_id: work_id.to_string(),
                title: title.map(String::from),
                doi: Some(doi.to_string()),
            });
        }
        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
    }

    // Timed out without the paper appearing in Zotero.
    Err(WorkTextError::NoPdfFound {
        work_id: work_id.to_string(),
        title: title.map(String::from),
        doi: Some(doi.to_string()),
    })
}
529
#[cfg(test)]
mod tests {
    use super::*;

    // `bare_doi` strips only the canonical https prefix; bare input passes through.
    #[test]
    fn test_bare_doi() {
        assert_eq!(bare_doi("https://doi.org/10.1234/test"), "10.1234/test");
        assert_eq!(bare_doi("10.1234/test"), "10.1234/test");
    }

    // Short IDs pass through `short_openalex_id` unchanged.
    #[test]
    fn test_short_openalex_id() {
        assert_eq!(
            short_openalex_id("https://openalex.org/W2741809807"),
            "W2741809807"
        );
        assert_eq!(short_openalex_id("W2741809807"), "W2741809807");
    }

    // Whitelisted domains (including www. subdomains) pass; others don't.
    #[test]
    fn test_is_whitelisted_url() {
        assert!(is_whitelisted_url("https://arxiv.org/pdf/2301.12345"));
        assert!(is_whitelisted_url(
            "https://europepmc.org/articles/PMC123/pdf"
        ));
        assert!(is_whitelisted_url("https://www.biorxiv.org/content/pdf"));
        assert!(is_whitelisted_url("https://www.mdpi.com/some/pdf"));
        assert!(!is_whitelisted_url("https://evil.com/pdf"));
        assert!(!is_whitelisted_url("https://publisher.com/paper.pdf"));
    }

    // A work with no locations at all yields no URLs.
    #[test]
    fn test_collect_pdf_urls_empty() {
        let work: Work = serde_json::from_str(r#"{"id": "https://openalex.org/W1"}"#).unwrap();
        assert!(collect_pdf_urls(&work).is_empty());
    }

    // The same URL appearing in multiple locations is emitted once,
    // preserving first-seen order.
    #[test]
    fn test_collect_pdf_urls_deduplicates() {
        let work: Work = serde_json::from_value(serde_json::json!({
            "id": "https://openalex.org/W1",
            "best_oa_location": { "pdf_url": "https://arxiv.org/pdf/1234" },
            "primary_location": { "pdf_url": "https://arxiv.org/pdf/1234" },
            "locations": [
                { "pdf_url": "https://arxiv.org/pdf/1234" },
                { "pdf_url": "https://europepmc.org/pdf/5678" }
            ]
        }))
        .unwrap();
        let urls = collect_pdf_urls(&work);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0], "https://arxiv.org/pdf/1234");
        assert_eq!(urls[1], "https://europepmc.org/pdf/5678");
    }
}