doiget_core/sources/
arxiv.rs

1//! arXiv source — arXiv id → PDF + Atom-feed metadata.
2//!
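//! Spec: `docs/SOURCES.md` §4 arXiv. No auth; the API has a 3-second-per-request
//! rate guideline that doiget's 5/sec global + 200ms per-source backoff
//! comfortably respects (no extra source-specific tuning needed).
//! NOTE(review): a 200ms per-source spacing permits up to 5 requests/sec,
//! which is tighter than one request per 3 seconds — confirm the guideline
//! is actually met under sustained (batch) use.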
6//!
7//! # Fetch flow (full)
8//!
9//! 1. `can_serve` returns `true` only for `Ref::Arxiv(_)`; `Ref::Doi(_)` is
10//!    rejected up front.
11//! 2. `fetch` acquires a permit from the shared `RateLimiter`, then
12//!    best-effort fetches the Atom feed (`<base>/api/query?id_list=<id>`)
13//!    and parses it into a JSON metadata object via the private
14//!    `parse_atom_feed` helper. Atom failures degrade gracefully
15//!    (`metadata_json = None` + `tracing::warn!`) — the existing 1.0
16//!    PDF-leg semantics are preserved.
17//! 3. The PDF URL `<base>/pdf/<id>.pdf` is fetched via
18//!    [`crate::http::HttpClient::fetch_pdf`] which enforces the magic-byte
19//!    (`%PDF-`) check per `docs/SECURITY.md` §1.2.
20//! 4. ONE `LogEvent::Fetch` row is appended for the PDF leg. The Atom leg
21//!    does NOT emit its own row — the source-level audit unit is
22//!    "one fetch attempt = one row" and the Atom call is a supporting
23//!    leg of the same attempt.
24//!
25//! # Metadata-only path
26//!
27//! [`ArxivSource::fetch_metadata_only`] performs ONLY the Atom feed fetch
28//! and is the entry point for the `metadata_only` orchestrator
29//! (`crate::orchestrator::metadata_only`). It MUST NOT call
30//! [`crate::http::HttpClient::fetch_pdf`] — doing so would violate the
31//! `doiget_metadata_only` contract (`docs/MCP_TOOLS.md` §11). It emits
32//! one `LogEvent::Fetch` row under `Capability::Metadata` so the audit
33//! trail distinguishes metadata-only fetches from full fetches without
34//! breaking the schema (the `capability` field is the structured channel
35//! for this distinction; spec §3 documents it as one of `oa` / `metadata`
36//! / `tdm-*`).
37
38use async_trait::async_trait;
39use bytes::Bytes;
40use quick_xml::events::Event;
41use quick_xml::Reader;
42use serde_json::{json, Value};
43use url::Url;
44
45use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
46use crate::source::{FetchContext, FetchError, FetchResult, Source};
47use crate::{ArxivId, CapabilityProfile, Ref};
48
/// Default base URL (scheme + host, no path) for the PDF endpoint.
///
/// arXiv serves PDFs at `https://arxiv.org/pdf/<id>` (the trailing `.pdf`
/// is optional but most reliable to include). PDFs may redirect to
/// `cdn.arxiv.org` — the per-source allowlist in
/// `crate::http::tier_1_allowlist()` covers this via the `*.arxiv.org`
/// glob.
const PDF_BASE: &str = "https://arxiv.org";
55
/// Default base URL (scheme + host, no path) for the Atom metadata
/// endpoint.
///
/// arXiv serves the API at `https://export.arxiv.org/api/query` — a
/// DIFFERENT host from the PDF endpoint. Hitting `arxiv.org/api/query`
/// instead redirects and fails the metadata leg, so the two endpoints must
/// use separate bases. `export.arxiv.org` is covered by the `*.arxiv.org`
/// allowlist glob.
const META_BASE: &str = "https://export.arxiv.org";
62
/// arXiv [`Source`] impl. PDFs are served from `arxiv.org`; Atom metadata
/// from `export.arxiv.org` (the `metadata_url` builder).
///
/// The two bases are stored separately because production uses two
/// different hosts; `ArxivSource::with_base` sets both to the same URL.
#[derive(Clone, Debug)]
pub struct ArxivSource {
    /// PDF endpoint base (`arxiv.org` in production). `pdf_url` joins
    /// `/pdf/<id>.pdf` onto it.
    base: Url,
    /// Atom metadata endpoint base (`export.arxiv.org` in production).
    /// `metadata_url` joins `/api/query` onto it.
    meta_base: Url,
}
72
73impl ArxivSource {
74    /// Production constructor. PDFs from `arxiv.org`, Atom metadata from
75    /// `export.arxiv.org`.
76    pub fn new() -> Self {
77        // Both hard-coded constants are `'static` string literals known at
78        // compile time to be valid absolute URLs; the `expect`s can only
79        // fire if a constant regresses, which every `ArxivSource::new()`
80        // test exercises.
81        #[allow(clippy::expect_used)]
82        let base = Url::parse(PDF_BASE).expect("hard-coded PDF base URL is valid");
83        #[allow(clippy::expect_used)]
84        let meta_base = Url::parse(META_BASE).expect("hard-coded meta base URL is valid");
85        Self { base, meta_base }
86    }
87
88    /// Construct with an arbitrary base URL.
89    ///
90    /// The orchestrator (`doiget-cli::commands::fetch`) uses this to honor
91    /// the `DOIGET_ARXIV_BASE` env var, which lets integration tests point
92    /// the source at a wiremock origin without resorting to compile-time
93    /// gates. Both the PDF and metadata legs share the one override base
94    /// (a single wiremock origin serves both paths). Production callers
95    /// use [`ArxivSource::new`].
96    pub fn with_base(base: Url) -> Self {
97        Self {
98            meta_base: base.clone(),
99            base,
100        }
101    }
102
103    /// Build the PDF URL for a given arXiv id. arXiv accepts both
104    /// `/pdf/<id>` and `/pdf/<id>.pdf`; we use the trailing-`.pdf` form to
105    /// make the URL self-describing.
106    ///
107    /// Old-style ids (`cond-mat/9501001`) contain a `/` in the id itself;
108    /// the resulting path `/pdf/cond-mat/9501001.pdf` is the form arXiv
109    /// expects. Because the base URL has no path beyond `/`, `Url::join`
110    /// resolves the absolute reference `/pdf/<id>.pdf` to exactly that
111    /// path for both new-style (`2401.12345`) and old-style
112    /// (`cond-mat/9501001`) ids. The `arxiv_fetch_old_style_id_*` test
113    /// pins this behavior.
114    fn pdf_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
115        let path = format!("/pdf/{}.pdf", id.as_str());
116        self.base.join(&path).map_err(|e| FetchError::SourceSchema {
117            hint: format!("arxiv URL construction failed: {e}"),
118        })
119    }
120
121    /// Build the Atom-feed metadata URL for a given arXiv id.
122    ///
123    /// Production: `https://export.arxiv.org/api/query?id_list=<id>`. In
124    /// tests the base is the wiremock origin; the path is the same
125    /// (`/api/query?id_list=<id>`). The `export.arxiv.org` host is on the
126    /// `arxiv` redirect allowlist (per
127    /// `crate::http::tier_1_allowlist`) so the redirect closure does not
128    /// reject this leg.
129    ///
130    /// Old-style ids (`cond-mat/9501001`) contain a `/` which we
131    /// URL-encode via `query_pairs_mut().append_pair` so the wire form is
132    /// `id_list=cond-mat%2F9501001`.
133    fn metadata_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
134        let mut url = self
135            .meta_base
136            .join("/api/query")
137            .map_err(|e| FetchError::SourceSchema {
138                hint: format!("arxiv metadata URL construction failed: {e}"),
139            })?;
140        url.query_pairs_mut().append_pair("id_list", id.as_str());
141        Ok(url)
142    }
143
144    /// Fetch ONLY the Atom-feed metadata for the given arXiv id. Does NOT
145    /// touch the PDF endpoint — this is the entry point for the
146    /// `metadata_only` orchestrator (`docs/MCP_TOOLS.md` §11).
147    ///
148    /// Emits a single `LogEvent::Fetch` row under `Capability::Metadata`
149    /// so the audit trail distinguishes metadata-only attempts from full
150    /// (PDF) fetches.
151    ///
152    /// # Errors
153    ///
154    /// - [`FetchError::Http`] on transport / status / size-cap failures.
155    /// - [`FetchError::SourceSchema`] if the response body is not
156    ///   well-formed Atom XML.
157    /// - [`FetchError::Log`] if the provenance row write fails
158    ///   (fail-closed per `docs/PROVENANCE_LOG.md` §5).
159    pub async fn fetch_metadata_only(
160        &self,
161        id: &ArxivId,
162        ctx: &FetchContext,
163    ) -> Result<Value, FetchError> {
164        // Same politeness gate as the full fetch path.
165        let _permit = ctx.rate_limiter.acquire(self.name()).await;
166
167        let url = self.metadata_url(id)?;
168        let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
169        let metadata = parse_atom_feed(&body)?;
170
171        // ADR-0021 §1 canonical-digest under the "arxiv" resolver
172        // profile. version=None until a follow-up slice threads the
173        // Atom-feed-discovered version (`v2`, etc.) into this row.
174        let canonical =
175            crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), self.name(), None)
176                .digest_hex();
177        ctx.log.append(RowInput {
178            event: LogEvent::Fetch,
179            result: LogResult::Ok,
180            // Distinguish metadata-only from full (PDF) fetches via the
181            // structured `capability` channel rather than mangling the
182            // `source` string — `docs/PROVENANCE_LOG.md` §3 lists
183            // `metadata` as a first-class capability value.
184            capability: Capability::Metadata,
185            ref_: Some(id.as_str()),
186            source: Some(self.name()),
187            error_code: None,
188            size_bytes: Some(body.len() as u64),
189            license: Some("arxiv-default"),
190            store_path: None,
191            canonical_digest: Some(&canonical),
192        })?;
193
194        Ok(metadata)
195    }
196}
197
198impl Default for ArxivSource {
199    fn default() -> Self {
200        Self::new()
201    }
202}
203
204#[async_trait]
205impl Source for ArxivSource {
206    fn name(&self) -> &str {
207        "arxiv"
208    }
209
210    fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
211        matches!(ref_, Ref::Arxiv(_))
212    }
213
214    async fn fetch(
215        &self,
216        ref_: &Ref,
217        _profile: &CapabilityProfile,
218        ctx: &FetchContext,
219    ) -> Result<FetchResult, FetchError> {
220        // Eligibility gate. The orchestrator is expected to call
221        // `can_serve` first, but a runtime check here gives a clean error
222        // path if it does not.
223        let id = match ref_ {
224            Ref::Arxiv(a) => a,
225            Ref::Doi(_) => {
226                return Err(FetchError::NotEligible {
227                    source_key: "arxiv".into(),
228                });
229            }
230        };
231
232        // Hold the rate-limiter permit for the duration of the HTTP
233        // fetch. Drop happens at end of scope after the log append below.
234        let _permit = ctx.rate_limiter.acquire(self.name()).await;
235
236        // ----- Atom-feed metadata leg (best-effort) -------------------
237        //
238        // Fetched BEFORE the PDF so that `FetchResult::metadata_json` is
239        // populated for a single-pass fetch (the orchestrator does not
240        // need to re-issue a metadata-only call). Failures here degrade
241        // gracefully: we set `metadata_json = None`, emit a tracing
242        // warning, and proceed with the PDF leg unchanged. NO log row
243        // is emitted from this leg — the source-level audit unit is
244        // "one fetch attempt = one row" and the row comes from the PDF
245        // leg below. This is what preserves the 4-row sequence asserted
246        // by `crates/doiget-cli/tests/fetch_arxiv_e2e.rs`.
247        let metadata_json = match self.metadata_url(id) {
248            Ok(meta_url) => match ctx.http.fetch_bytes(self.name(), meta_url).await {
249                Ok((bytes, _final)) => match parse_atom_feed(&bytes) {
250                    Ok(v) => Some(v),
251                    Err(e) => {
252                        tracing::warn!(
253                            arxiv_id = %id.as_str(),
254                            error = %e,
255                            "arxiv Atom feed parse failed; continuing with PDF-only fetch"
256                        );
257                        None
258                    }
259                },
260                Err(e) => {
261                    tracing::warn!(
262                        arxiv_id = %id.as_str(),
263                        error = %e,
264                        "arxiv Atom feed fetch failed; continuing with PDF-only fetch"
265                    );
266                    None
267                }
268            },
269            Err(e) => {
270                tracing::warn!(
271                    arxiv_id = %id.as_str(),
272                    error = %e,
273                    "arxiv metadata URL construction failed; continuing with PDF-only fetch"
274                );
275                None
276            }
277        };
278
279        // ----- PDF leg -------------------------------------------------
280        let url = self.pdf_url(id)?;
281
282        // `fetch_pdf` enforces the magic-byte check (`%PDF-`) per
283        // `docs/SECURITY.md` §1.2 — non-PDF response surfaces as
284        // `HttpError::NotAPdf`, which `From` converts to `FetchError::Http`.
285        let (body, final_url): (Bytes, Url) = ctx.http.fetch_pdf(self.name(), url).await?;
286
287        // One `event=fetch` row per attempt, per `docs/ARCHITECTURE.md` §6
288        // and `docs/PROVENANCE_LOG.md` §3. Per `docs/SECURITY.md` §1.8 a
289        // log write failure is fail-closed — the `?` aborts the fetch.
290        // ADR-0021 §1 canonical-digest: build under the "arxiv" resolver
291        // profile. version=None in Slice 4 — a follow-up may surface
292        // the `vN` discriminator from the Atom-feed `id` element.
293        let canonical = ref_.promote(self.name(), None).digest_hex();
294        ctx.log.append(RowInput {
295            event: LogEvent::Fetch,
296            result: LogResult::Ok,
297            capability: Capability::Oa,
298            ref_: Some(id.as_str()),
299            source: Some(self.name()),
300            error_code: None,
301            size_bytes: Some(body.len() as u64),
302            // arXiv does not expose a per-item license string; the
303            // platform-wide license declaration lives at
304            // <https://info.arxiv.org/help/license/>. Phase 1 records
305            // `"arxiv-default"` so the value is informative without
306            // claiming a specific Creative Commons license.
307            license: Some("arxiv-default"),
308            store_path: None,
309            canonical_digest: Some(&canonical),
310        })?;
311
312        Ok(FetchResult {
313            source: self.name().to_string(),
314            license: "arxiv-default".into(),
315            pdf_bytes: Some(body),
316            final_url: Some(final_url),
317            metadata_json,
318        })
319    }
320}
321
322// ---------------------------------------------------------------------------
323// Atom-feed parser (B.1)
324// ---------------------------------------------------------------------------
325
326/// Parse the arXiv Atom-feed response body into a structured JSON
327/// metadata object.
328///
329/// Endpoint: `https://export.arxiv.org/api/query?id_list=<id>` (see
330/// arXiv API user manual §3.1). The response is an `<feed>` document
331/// containing one `<entry>` per requested id. We extract the fields
332/// listed in `docs/SOURCES.md` §4 arXiv (title, summary/abstract,
333/// authors, published, updated, categories) into the synthetic JSON
334/// shape:
335///
336/// ```jsonc
337/// {
338///   "title": "...",
339///   "abstract": "...",
340///   "authors": ["Family, Given", ...],
341///   "published": "YYYY-MM-DDTHH:MM:SSZ",  // RFC3339 UTC, passed through verbatim
342///   "updated":   "YYYY-MM-DDTHH:MM:SSZ",
343///   "categories": ["cs.LG", "stat.ML"]
344/// }
345/// ```
346///
347/// All fields are best-effort: any missing element is omitted from the
348/// JSON output (NOT serialized as `null`). The parser is a small
349/// `quick-xml` event walker — no DOM allocation. Only the FIRST `<entry>`
350/// element is consumed (we always query a single id).
351///
352/// # Errors
353///
354/// Returns [`FetchError::SourceSchema`] if the XML is malformed (parser
355/// reports a syntax error) or if no `<entry>` element is present (arXiv
356/// returns an empty `<feed>` on an unknown id).
357pub(crate) fn parse_atom_feed(xml: &[u8]) -> Result<Value, FetchError> {
358    let mut reader = Reader::from_reader(xml);
359    let config = reader.config_mut();
360    config.trim_text(true);
361
362    // Top-level state. `in_entry` tracks whether we are inside the first
363    // (and only) `<entry>` element; once we exit, we stop collecting.
364    let mut in_entry = false;
365    let mut saw_entry = false;
366    let mut depth = 0_i32; // depth WITHIN the entry; 0 = at <entry> root
367
368    // Accumulators. Per-author state is kept on a stack so a nested
369    // `<author><name>...</name></author>` populates the right slot.
370    let mut title: Option<String> = None;
371    let mut abstract_: Option<String> = None;
372    let mut published: Option<String> = None;
373    let mut updated: Option<String> = None;
374    let mut authors: Vec<String> = Vec::new();
375    let mut categories: Vec<String> = Vec::new();
376
377    // Current text-collection target — None when we are not inside a
378    // leaf element whose text we want.
379    #[derive(Clone, Copy)]
380    enum Target {
381        Title,
382        Summary,
383        Published,
384        Updated,
385        AuthorName,
386    }
387    let mut target: Option<Target> = None;
388    let mut in_author = false;
389    let mut buf: Vec<u8> = Vec::new();
390
391    loop {
392        match reader.read_event_into(&mut buf) {
393            Ok(Event::Start(e)) => {
394                let name_bytes = e.name();
395                let local = local_name(name_bytes.as_ref());
396                if !in_entry {
397                    if local == b"entry" {
398                        in_entry = true;
399                        saw_entry = true;
400                        depth = 0;
401                    }
402                    buf.clear();
403                    continue;
404                }
405                depth += 1;
406                // Depth==1 means a direct child of `<entry>`.
407                if depth == 1 {
408                    match local {
409                        b"title" => target = Some(Target::Title),
410                        b"summary" => target = Some(Target::Summary),
411                        b"published" => target = Some(Target::Published),
412                        b"updated" => target = Some(Target::Updated),
413                        b"author" => {
414                            in_author = true;
415                            authors.push(String::new());
416                        }
417                        _ => {}
418                    }
419                } else if depth == 2 && in_author && local == b"name" {
420                    target = Some(Target::AuthorName);
421                }
422                buf.clear();
423            }
424            Ok(Event::Empty(e)) => {
425                let name_bytes = e.name();
426                let local = local_name(name_bytes.as_ref());
427                if in_entry && depth == 0 && local == b"category" {
428                    // <category term="cs.LG" scheme="..."/> — extract `term`.
429                    for attr in e.attributes().flatten() {
430                        if attr.key.as_ref() == b"term" {
431                            // quick-xml 0.40: `unescape_value()` is
432                            // deprecated in favour of `normalized_value()`
433                            // (attribute-value normalization resolves the
434                            // same character/entity references). arXiv's
435                            // Atom feed is XML 1.0.
436                            if let Ok(v) = attr.normalized_value(quick_xml::XmlVersion::Explicit1_0)
437                            {
438                                categories.push(v.into_owned());
439                            }
440                        }
441                    }
442                }
443                buf.clear();
444            }
445            Ok(Event::Text(t)) => {
446                if let Some(tg) = target {
447                    // quick-xml 0.40 removed `BytesText::unescape`.
448                    // Reproduce the old behaviour: decode the bytes, then
449                    // unescape XML entities via `quick_xml::escape::unescape`.
450                    // Best-effort — skip the text on decode/unescape error.
451                    if let Some(s) = t.decode().ok().and_then(|raw| {
452                        quick_xml::escape::unescape(&raw)
453                            .ok()
454                            .map(|c| c.into_owned())
455                    }) {
456                        match tg {
457                            Target::Title => title.get_or_insert_with(String::new).push_str(&s),
458                            Target::Summary => {
459                                abstract_.get_or_insert_with(String::new).push_str(&s)
460                            }
461                            Target::Published => {
462                                published.get_or_insert_with(String::new).push_str(&s)
463                            }
464                            Target::Updated => updated.get_or_insert_with(String::new).push_str(&s),
465                            Target::AuthorName => {
466                                if let Some(last) = authors.last_mut() {
467                                    last.push_str(&s);
468                                }
469                            }
470                        }
471                    }
472                }
473                buf.clear();
474            }
475            Ok(Event::End(e)) => {
476                if !in_entry {
477                    buf.clear();
478                    continue;
479                }
480                let name_bytes = e.name();
481                let local = local_name(name_bytes.as_ref());
482                if depth == 0 && local == b"entry" {
483                    // Done with the first entry — stop. We deliberately
484                    // ignore any subsequent entries since the orchestrator
485                    // always queries a single id.
486                    break;
487                }
488                depth -= 1;
489                if depth == 0 {
490                    if local == b"author" {
491                        in_author = false;
492                        // Drop empty author names (defensive).
493                        if let Some(last) = authors.last() {
494                            if last.is_empty() {
495                                authors.pop();
496                            }
497                        }
498                    }
499                    target = None;
500                } else if depth == 1 && in_author && local == b"name" {
501                    target = None;
502                }
503                buf.clear();
504            }
505            Ok(Event::Eof) => break,
506            Err(e) => {
507                return Err(FetchError::SourceSchema {
508                    hint: format!("arxiv Atom XML parse error: {e}"),
509                });
510            }
511            // CDATA / Comment / Decl / PI / DocType — ignored.
512            _ => {
513                buf.clear();
514            }
515        }
516    }
517
518    if !saw_entry {
519        return Err(FetchError::SourceSchema {
520            hint: "arxiv Atom feed had no <entry> element (unknown id?)".into(),
521        });
522    }
523
524    // Build the JSON object, omitting empty optionals. `serde_json::Map`
525    // preserves insertion order so the output is stable.
526    let mut obj = serde_json::Map::new();
527    if let Some(t) = title {
528        let trimmed = t.trim().to_string();
529        if !trimmed.is_empty() {
530            obj.insert("title".into(), Value::String(trimmed));
531        }
532    }
533    if let Some(a) = abstract_ {
534        let trimmed = a.trim().to_string();
535        if !trimmed.is_empty() {
536            obj.insert("abstract".into(), Value::String(trimmed));
537        }
538    }
539    if !authors.is_empty() {
540        obj.insert(
541            "authors".into(),
542            Value::Array(authors.into_iter().map(Value::String).collect()),
543        );
544    }
545    if let Some(p) = published {
546        let trimmed = p.trim().to_string();
547        if !trimmed.is_empty() {
548            obj.insert("published".into(), Value::String(trimmed));
549        }
550    }
551    if let Some(u) = updated {
552        let trimmed = u.trim().to_string();
553        if !trimmed.is_empty() {
554            obj.insert("updated".into(), Value::String(trimmed));
555        }
556    }
557    if !categories.is_empty() {
558        obj.insert(
559            "categories".into(),
560            Value::Array(categories.into_iter().map(Value::String).collect()),
561        );
562    }
563    Ok(json!(obj))
564}
565
/// Return the local part of a possibly-prefixed XML qualified name:
/// everything after the last `:`, or the whole input when there is no
/// `:` at all (`b"atom:entry"` -> `b"entry"`, `b"entry"` -> `b"entry"`).
///
/// Atom uses the default namespace so most names arrive unprefixed; this
/// helper lets the parser accept either spelling without pulling in
/// quick-xml's namespace resolver (which would mean threading an
/// `NsReader` and explicit prefix bindings through every event).
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` walks the `:`-separated pieces from the right and always
    // yields at least one piece, so the first item is the local part.
    qname.rsplit(|&byte| byte == b':').next().unwrap_or(qname)
}
578
579// ---------------------------------------------------------------------------
580// Tests
581// ---------------------------------------------------------------------------
582
583#[cfg(test)]
584#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
585mod tests {
586    use super::*;
587
588    use std::sync::Arc;
589
590    use camino::Utf8PathBuf;
591    use tempfile::TempDir;
592    use wiremock::matchers::{method, path};
593    use wiremock::{Mock, MockServer, ResponseTemplate};
594
595    use crate::http::{HttpClient, HttpError};
596    use crate::provenance::{LogRow, ProvenanceLog};
597    use crate::rate_limiter::RateLimiter;
598    use crate::source::FetchContext;
599    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};
600
601    const TEST_SESSION_ID: &str = "01J0000000000000000000TEST";
602
603    /// Build a complete `FetchContext` against a wiremock host for use in
604    /// the source-level tests below.
605    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
606        let td = TempDir::new().expect("tempdir");
607        let log_dir =
608            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
609        let log_path = log_dir.join("test.jsonl");
610
611        let http = Arc::new(HttpClient::new_for_tests_allow_http("arxiv", wiremock_host));
612        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
613        let session_id = TEST_SESSION_ID.to_string();
614        let log = Arc::new(
615            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
616        );
617
618        (
619            td,
620            FetchContext {
621                http,
622                rate_limiter,
623                log,
624                session_id,
625                cache_root: None,
626            },
627        )
628    }
629
630    fn read_rows(path: &camino::Utf8Path) -> Vec<LogRow> {
631        let raw = std::fs::read_to_string(path).expect("read log");
632        raw.lines()
633            .filter(|l| !l.is_empty())
634            .map(|l| serde_json::from_str::<LogRow>(l).expect("valid LogRow"))
635            .collect()
636    }
637
    /// Capability profile for the tests, built via
    /// `CapabilityProfile::from_env`. `ArxivSource` ignores the profile
    /// in both `can_serve` and `fetch`, so its contents do not affect the
    /// assertions in this module.
    fn profile() -> CapabilityProfile {
        CapabilityProfile::from_env().expect("Phase 0 stub profile")
    }
641
642    // -----------------------------------------------------------------
643    // can_serve
644    // -----------------------------------------------------------------
645
646    #[test]
647    fn arxiv_can_serve_returns_true_for_arxiv() {
648        let s = ArxivSource::new();
649        let id = ArxivId::parse("2401.12345").expect("valid id");
650        let r = Ref::Arxiv(id);
651        assert!(s.can_serve(&profile(), &r));
652    }
653
654    #[test]
655    fn production_metadata_url_uses_export_host_pdf_uses_arxiv() {
656        // Regression guard: the Atom metadata leg MUST hit
657        // export.arxiv.org, while PDFs hit arxiv.org. Sending metadata to
658        // arxiv.org/api/query redirects and fails the resolve.
659        let s = ArxivSource::new();
660        let id = ArxivId::parse("1706.03762").expect("valid id");
661        let meta = s.metadata_url(&id).expect("meta url");
662        assert_eq!(meta.host_str(), Some("export.arxiv.org"));
663        assert_eq!(meta.path(), "/api/query");
664        let pdf = s.pdf_url(&id).expect("pdf url");
665        assert_eq!(pdf.host_str(), Some("arxiv.org"));
666    }
667
668    #[test]
669    fn with_base_shares_one_origin_for_both_legs() {
670        // The DOIGET_ARXIV_BASE override (wiremock) serves both paths from
671        // a single origin, so meta and PDF must resolve to the same host.
672        let s = ArxivSource::with_base("http://127.0.0.1:9999".parse().expect("url"));
673        let id = ArxivId::parse("2401.12345").expect("valid id");
674        assert_eq!(
675            s.metadata_url(&id).expect("meta").host_str(),
676            s.pdf_url(&id).expect("pdf").host_str()
677        );
678    }
679
680    #[test]
681    fn arxiv_can_serve_returns_false_for_doi() {
682        let s = ArxivSource::new();
683        let r = Ref::Doi(Doi("10.1234/example".to_string()));
684        assert!(!s.can_serve(&profile(), &r));
685    }
686
687    // -----------------------------------------------------------------
688    // fetch — happy paths
689    // -----------------------------------------------------------------
690
691    #[tokio::test]
692    async fn arxiv_fetch_new_style_id_returns_pdf_bytes() {
693        let server = MockServer::start().await;
694        let body = b"%PDF-1.7\n%fixture\n".to_vec();
695        Mock::given(method("GET"))
696            .and(path("/pdf/2401.12345.pdf"))
697            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
698            .mount(&server)
699            .await;
700
701        let host = server
702            .uri()
703            .parse::<Url>()
704            .unwrap()
705            .host_str()
706            .unwrap()
707            .to_string();
708        let (_td, ctx) = build_test_context(&host);
709        let s = ArxivSource::with_base(server.uri().parse().unwrap());
710
711        let id = ArxivId::parse("2401.12345").unwrap();
712        let r = Ref::Arxiv(id);
713        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
714
715        assert_eq!(res.source, "arxiv");
716        assert_eq!(res.license, "arxiv-default");
717        let bytes = res.pdf_bytes.expect("pdf bytes set");
718        assert!(
719            bytes.starts_with(b"%PDF-"),
720            "expected PDF magic prefix, got {:?}",
721            &bytes[..bytes.len().min(8)]
722        );
723        assert_eq!(&bytes[..], &body[..]);
724    }
725
726    #[tokio::test]
727    async fn arxiv_fetch_old_style_id_returns_pdf_bytes() {
728        // Old-style id contains `/` (`cond-mat/9501001`); the URL must
729        // become `/pdf/cond-mat/9501001.pdf`. This pins the URL-builder
730        // behavior across both id shapes.
731        let server = MockServer::start().await;
732        let body = b"%PDF-1.4\n%old-style fixture\n".to_vec();
733        Mock::given(method("GET"))
734            .and(path("/pdf/cond-mat/9501001.pdf"))
735            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
736            .mount(&server)
737            .await;
738
739        let host = server
740            .uri()
741            .parse::<Url>()
742            .unwrap()
743            .host_str()
744            .unwrap()
745            .to_string();
746        let (_td, ctx) = build_test_context(&host);
747        let s = ArxivSource::with_base(server.uri().parse().unwrap());
748
749        let id = ArxivId::parse("cond-mat/9501001").expect("old-style id");
750        let r = Ref::Arxiv(id);
751        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
752
753        let bytes = res.pdf_bytes.expect("pdf bytes set");
754        assert!(bytes.starts_with(b"%PDF-"));
755        assert_eq!(&bytes[..], &body[..]);
756    }
757
758    // -----------------------------------------------------------------
759    // fetch — error paths
760    // -----------------------------------------------------------------
761
762    #[tokio::test]
763    async fn arxiv_fetch_with_doi_ref_errors_not_eligible() {
764        let server = MockServer::start().await;
765        let host = server
766            .uri()
767            .parse::<Url>()
768            .unwrap()
769            .host_str()
770            .unwrap()
771            .to_string();
772        let (_td, ctx) = build_test_context(&host);
773        let s = ArxivSource::with_base(server.uri().parse().unwrap());
774
775        let r = Ref::Doi(Doi("10.1234/example".to_string()));
776        let err = s
777            .fetch(&r, &profile(), &ctx)
778            .await
779            .expect_err("doi ref must not be eligible");
780        match err {
781            FetchError::NotEligible { source_key } => {
782                assert_eq!(source_key, "arxiv");
783            }
784            other => panic!("expected NotEligible, got {:?}", other),
785        }
786    }
787
788    #[tokio::test]
789    async fn arxiv_fetch_writes_log_row_with_arxiv_default_license() {
790        let server = MockServer::start().await;
791        let body = b"%PDF-1.7\n%log-row fixture\n".to_vec();
792        Mock::given(method("GET"))
793            .and(path("/pdf/2401.12345.pdf"))
794            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
795            .mount(&server)
796            .await;
797        let host = server
798            .uri()
799            .parse::<Url>()
800            .unwrap()
801            .host_str()
802            .unwrap()
803            .to_string();
804        let (_td, ctx) = build_test_context(&host);
805        // Capture the log path before the fetch call for later read-back.
806        let log_path = ctx.log.path().to_path_buf();
807        let s = ArxivSource::with_base(server.uri().parse().unwrap());
808
809        let id = ArxivId::parse("2401.12345").unwrap();
810        let r = Ref::Arxiv(id);
811        let _ = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
812
813        let rows = read_rows(&log_path);
814        assert_eq!(rows.len(), 1, "exactly one fetch row expected");
815        let row = &rows[0];
816        assert_eq!(row.source.as_deref(), Some("arxiv"));
817        assert_eq!(row.ref_.as_deref(), Some("2401.12345"));
818        assert_eq!(row.license.as_deref(), Some("arxiv-default"));
819        assert_eq!(row.size_bytes, Some(body.len() as u64));
820        assert!(row.error_code.is_none());
821    }
822
823    #[tokio::test]
824    async fn arxiv_non_pdf_body_rejected() {
825        // Wiremock returns 200 with a non-PDF body. The magic-byte check
826        // inside `HttpClient::fetch_pdf` rejects it as `HttpError::NotAPdf`,
827        // surfacing as `FetchError::Http`.
828        let server = MockServer::start().await;
829        Mock::given(method("GET"))
830            .and(path("/pdf/2401.12345.pdf"))
831            .respond_with(
832                ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
833            )
834            .mount(&server)
835            .await;
836        let host = server
837            .uri()
838            .parse::<Url>()
839            .unwrap()
840            .host_str()
841            .unwrap()
842            .to_string();
843        let (_td, ctx) = build_test_context(&host);
844        let s = ArxivSource::with_base(server.uri().parse().unwrap());
845
846        let id = ArxivId::parse("2401.12345").unwrap();
847        let r = Ref::Arxiv(id);
848        let err = s
849            .fetch(&r, &profile(), &ctx)
850            .await
851            .expect_err("non-pdf body must be rejected");
852        match err {
853            FetchError::Http(HttpError::NotAPdf { got }) => {
854                assert_eq!(&got, b"<html");
855            }
856            other => panic!("expected FetchError::Http(NotAPdf), got {:?}", other),
857        }
858    }
859
860    #[tokio::test]
861    async fn arxiv_404_maps_to_http_error() {
862        let server = MockServer::start().await;
863        Mock::given(method("GET"))
864            .and(path("/pdf/2401.99999.pdf"))
865            .respond_with(ResponseTemplate::new(404))
866            .mount(&server)
867            .await;
868        let host = server
869            .uri()
870            .parse::<Url>()
871            .unwrap()
872            .host_str()
873            .unwrap()
874            .to_string();
875        let (_td, ctx) = build_test_context(&host);
876        let s = ArxivSource::with_base(server.uri().parse().unwrap());
877
878        let id = ArxivId::parse("2401.99999").unwrap();
879        let r = Ref::Arxiv(id);
880        let err = s
881            .fetch(&r, &profile(), &ctx)
882            .await
883            .expect_err("404 must surface");
884        match err {
885            FetchError::Http(HttpError::HttpStatus { status, .. }) => {
886                assert_eq!(status, 404);
887            }
888            other => panic!("expected FetchError::Http(HttpStatus), got {:?}", other),
889        }
890    }
891
892    // -----------------------------------------------------------------
893    // parse_atom_feed (B.1) — unit tests
894    // -----------------------------------------------------------------
895
    /// Synthetic Atom payload from the Slice 1 spec (deliverable B.3). Do
    /// not hit real arXiv from tests.
    ///
    /// The feed holds a single `<entry>` with an id, `updated` and
    /// `published` timestamps, a title, a summary, two authors
    /// (`Jane Doe`, `John Roe`) and two categories (`cs.LG`, `stat.ML`).
    /// The parser and fetch tests in this module assert against these
    /// exact values, so keep the literal and those assertions in sync.
    const SAMPLE_ATOM_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/2401.12345v1</id>
    <updated>2024-02-01T00:00:00Z</updated>
    <published>2024-01-15T00:00:00Z</published>
    <title>Example arXiv Paper Title</title>
    <summary>This is an example abstract.</summary>
    <author>
      <name>Jane Doe</name>
    </author>
    <author>
      <name>John Roe</name>
    </author>
    <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
    <category term="stat.ML" scheme="http://arxiv.org/schemas/atom"/>
  </entry>
</feed>"#;
916
917    #[test]
918    fn parse_atom_feed_extracts_all_fields() {
919        let v = parse_atom_feed(SAMPLE_ATOM_FEED.as_bytes()).expect("Atom parses");
920        assert_eq!(v["title"], serde_json::json!("Example arXiv Paper Title"));
921        assert_eq!(
922            v["abstract"],
923            serde_json::json!("This is an example abstract.")
924        );
925        assert_eq!(v["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
926        assert_eq!(v["published"], serde_json::json!("2024-01-15T00:00:00Z"));
927        assert_eq!(v["updated"], serde_json::json!("2024-02-01T00:00:00Z"));
928        assert_eq!(v["categories"], serde_json::json!(["cs.LG", "stat.ML"]));
929    }
930
931    #[test]
932    fn parse_atom_feed_empty_feed_errors_source_schema() {
933        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
934<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
935        let err = parse_atom_feed(xml.as_bytes()).expect_err("empty feed must error");
936        match err {
937            FetchError::SourceSchema { hint } => {
938                assert!(
939                    hint.contains("entry"),
940                    "expected mention of <entry>; got {hint}"
941                );
942            }
943            other => panic!("expected SourceSchema, got {other:?}"),
944        }
945    }
946
947    #[test]
948    fn parse_atom_feed_omits_missing_optional_fields() {
949        // An entry with only an id and title — abstract/authors/categories
950        // absent. The output must omit those keys entirely (not emit
951        // `null`).
952        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
953<feed xmlns="http://www.w3.org/2005/Atom">
954  <entry>
955    <id>http://arxiv.org/abs/2401.00001v1</id>
956    <title>Minimal Entry</title>
957  </entry>
958</feed>"#;
959        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
960        let obj = v.as_object().expect("object");
961        assert_eq!(
962            obj.get("title").and_then(Value::as_str),
963            Some("Minimal Entry")
964        );
965        assert!(
966            !obj.contains_key("abstract"),
967            "abstract should be omitted: {obj:?}"
968        );
969        assert!(
970            !obj.contains_key("authors"),
971            "authors should be omitted: {obj:?}"
972        );
973        assert!(
974            !obj.contains_key("categories"),
975            "categories should be omitted: {obj:?}"
976        );
977    }
978
979    // -----------------------------------------------------------------
980    // fetch_metadata_only — orchestrator entry point
981    // -----------------------------------------------------------------
982
983    #[tokio::test]
984    async fn arxiv_fetch_metadata_only_returns_atom_metadata() {
985        let server = MockServer::start().await;
986        Mock::given(method("GET"))
987            .and(path("/api/query"))
988            .respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED))
989            .mount(&server)
990            .await;
991        let host = server
992            .uri()
993            .parse::<Url>()
994            .unwrap()
995            .host_str()
996            .unwrap()
997            .to_string();
998        let (_td, ctx) = build_test_context(&host);
999        let s = ArxivSource::with_base(server.uri().parse().unwrap());
1000        let id = ArxivId::parse("2401.12345").unwrap();
1001
1002        let meta = s
1003            .fetch_metadata_only(&id, &ctx)
1004            .await
1005            .expect("metadata_only ok");
1006        assert_eq!(
1007            meta["title"],
1008            serde_json::json!("Example arXiv Paper Title")
1009        );
1010        assert_eq!(meta["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
1011    }
1012
1013    #[tokio::test]
1014    async fn arxiv_fetch_populates_metadata_json_when_atom_endpoint_mocked() {
1015        // Full Source::fetch with BOTH Atom and PDF endpoints mocked must
1016        // populate `metadata_json` from the Atom response.
1017        let server = MockServer::start().await;
1018        Mock::given(method("GET"))
1019            .and(path("/api/query"))
1020            .respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED))
1021            .mount(&server)
1022            .await;
1023        Mock::given(method("GET"))
1024            .and(path("/pdf/2401.12345.pdf"))
1025            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\n%fix\n".to_vec()))
1026            .mount(&server)
1027            .await;
1028        let host = server
1029            .uri()
1030            .parse::<Url>()
1031            .unwrap()
1032            .host_str()
1033            .unwrap()
1034            .to_string();
1035        let (_td, ctx) = build_test_context(&host);
1036        let s = ArxivSource::with_base(server.uri().parse().unwrap());
1037        let id = ArxivId::parse("2401.12345").unwrap();
1038        let r = Ref::Arxiv(id);
1039
1040        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
1041        let meta = res.metadata_json.expect("metadata_json populated");
1042        assert_eq!(
1043            meta["title"],
1044            serde_json::json!("Example arXiv Paper Title")
1045        );
1046    }
1047
1048    #[tokio::test]
1049    async fn arxiv_fetch_atom_failure_falls_back_to_pdf_only() {
1050        // PDF endpoint mocked; Atom endpoint deliberately unmocked
1051        // (will 404). The fetch must still succeed with
1052        // `metadata_json = None` — the best-effort contract.
1053        let server = MockServer::start().await;
1054        Mock::given(method("GET"))
1055            .and(path("/pdf/2401.12345.pdf"))
1056            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\nx".to_vec()))
1057            .mount(&server)
1058            .await;
1059        let host = server
1060            .uri()
1061            .parse::<Url>()
1062            .unwrap()
1063            .host_str()
1064            .unwrap()
1065            .to_string();
1066        let (_td, ctx) = build_test_context(&host);
1067        let s = ArxivSource::with_base(server.uri().parse().unwrap());
1068        let id = ArxivId::parse("2401.12345").unwrap();
1069        let r = Ref::Arxiv(id);
1070
1071        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
1072        assert!(res.metadata_json.is_none());
1073        assert!(res.pdf_bytes.is_some());
1074    }
1075}
doiget_core/sources/arxiv.rs

doiget_core/sources/
arxiv.rs