Skip to main content

doiget_core/sources/
arxiv.rs

1//! arXiv source — arXiv id → PDF + Atom-feed metadata.
2//!
3//! Spec: `docs/SOURCES.md` §4 arXiv. No auth; the API has a 3-second-per-request
4//! rate guideline that doiget's 5/sec global + 200ms per-source backoff
5//! comfortably respects (no extra source-specific tuning needed).
6//!
7//! # Fetch flow (full)
8//!
9//! 1. `can_serve` returns `true` only for `Ref::Arxiv(_)`; `Ref::Doi(_)` is
10//!    rejected up front.
11//! 2. `fetch` acquires a permit from the shared `RateLimiter`, then
12//!    best-effort fetches the Atom feed (`<base>/api/query?id_list=<id>`)
13//!    and parses it into a JSON metadata object via the private
14//!    `parse_atom_feed` helper. Atom failures degrade gracefully
15//!    (`metadata_json = None` + `tracing::warn!`) — the existing 1.0
16//!    PDF-leg semantics are preserved.
17//! 3. The PDF URL `<base>/pdf/<id>.pdf` is fetched via
18//!    [`crate::http::HttpClient::fetch_pdf`] which enforces the magic-byte
19//!    (`%PDF-`) check per `docs/SECURITY.md` §1.2.
20//! 4. ONE `LogEvent::Fetch` row is appended for the PDF leg. The Atom leg
21//!    does NOT emit its own row — the source-level audit unit is
22//!    "one fetch attempt = one row" and the Atom call is a supporting
23//!    leg of the same attempt.
24//!
25//! # Metadata-only path
26//!
27//! [`ArxivSource::fetch_metadata_only`] performs ONLY the Atom feed fetch
28//! and is the entry point for the `metadata_only` orchestrator
29//! (`crate::orchestrator::metadata_only`). It MUST NOT call
30//! [`crate::http::HttpClient::fetch_pdf`] — doing so would violate the
31//! `doiget_metadata_only` contract (`docs/MCP_TOOLS.md` §11). It emits
32//! one `LogEvent::Fetch` row under `Capability::Metadata` so the audit
33//! trail distinguishes metadata-only fetches from full fetches without
34//! breaking the schema (the `capability` field is the structured channel
35//! for this distinction; spec §3 documents it as one of `oa` / `metadata`
36//! / `tdm-*`).
37
38use async_trait::async_trait;
39use bytes::Bytes;
40use quick_xml::events::Event;
41use quick_xml::Reader;
42use serde_json::{json, Value};
43use url::Url;
44
45use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
46use crate::source::{FetchContext, FetchError, FetchResult, Source};
47use crate::{ArxivId, CapabilityProfile, Ref};
48
49/// Default base for the PDF endpoint. arXiv serves PDFs at
50/// `https://arxiv.org/pdf/<id>` (the trailing `.pdf` is optional but
51/// most reliable to include). PDFs may redirect to `cdn.arxiv.org` —
52/// the per-source allowlist in `crate::http::tier_1_allowlist()` covers
53/// this via the `*.arxiv.org` glob.
54const PDF_BASE: &str = "https://arxiv.org";
55
56/// arXiv [`Source`] impl. Phase 1 returns the PDF bytes and skips metadata
57/// (the export.arxiv.org Atom feed is documented but XML parsing is
58/// deferred to a follow-up PR — TODO Phase 1+).
59#[derive(Clone, Debug)]
60pub struct ArxivSource {
61    base: Url,
62}
63
64impl ArxivSource {
65    /// Production constructor. Uses the public arxiv.org PDF endpoint.
66    pub fn new() -> Self {
67        // The hard-coded `PDF_BASE` is a `'static` string literal known
68        // at compile time to be a valid absolute URL. The `expect` here
69        // can only fire if the constant itself regresses, which is
70        // exercised at every test run via `ArxivSource::new()`.
71        #[allow(clippy::expect_used)]
72        let base = Url::parse(PDF_BASE).expect("hard-coded base URL is valid");
73        Self { base }
74    }
75
76    /// Construct with an arbitrary base URL.
77    ///
78    /// The orchestrator (`doiget-cli::commands::fetch`) uses this to honor
79    /// the `DOIGET_ARXIV_BASE` env var, which lets integration tests point
80    /// the source at a wiremock origin without resorting to compile-time
81    /// gates. Production callers use [`ArxivSource::new`].
82    pub fn with_base(base: Url) -> Self {
83        Self { base }
84    }
85
86    /// Build the PDF URL for a given arXiv id. arXiv accepts both
87    /// `/pdf/<id>` and `/pdf/<id>.pdf`; we use the trailing-`.pdf` form to
88    /// make the URL self-describing.
89    ///
90    /// Old-style ids (`cond-mat/9501001`) contain a `/` in the id itself;
91    /// the resulting path `/pdf/cond-mat/9501001.pdf` is the form arXiv
92    /// expects. Because the base URL has no path beyond `/`, `Url::join`
93    /// resolves the absolute reference `/pdf/<id>.pdf` to exactly that
94    /// path for both new-style (`2401.12345`) and old-style
95    /// (`cond-mat/9501001`) ids. The `arxiv_fetch_old_style_id_*` test
96    /// pins this behavior.
97    fn pdf_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
98        let path = format!("/pdf/{}.pdf", id.as_str());
99        self.base.join(&path).map_err(|e| FetchError::SourceSchema {
100            hint: format!("arxiv URL construction failed: {e}"),
101        })
102    }
103
104    /// Build the Atom-feed metadata URL for a given arXiv id.
105    ///
106    /// Production: `https://export.arxiv.org/api/query?id_list=<id>`. In
107    /// tests the base is the wiremock origin; the path is the same
108    /// (`/api/query?id_list=<id>`). The `export.arxiv.org` host is on the
109    /// `arxiv` redirect allowlist (per
110    /// `crate::http::tier_1_allowlist`) so the redirect closure does not
111    /// reject this leg.
112    ///
113    /// Old-style ids (`cond-mat/9501001`) contain a `/` which we
114    /// URL-encode via `query_pairs_mut().append_pair` so the wire form is
115    /// `id_list=cond-mat%2F9501001`.
116    fn metadata_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
117        let mut url = self
118            .base
119            .join("/api/query")
120            .map_err(|e| FetchError::SourceSchema {
121                hint: format!("arxiv metadata URL construction failed: {e}"),
122            })?;
123        url.query_pairs_mut().append_pair("id_list", id.as_str());
124        Ok(url)
125    }
126
127    /// Fetch ONLY the Atom-feed metadata for the given arXiv id. Does NOT
128    /// touch the PDF endpoint — this is the entry point for the
129    /// `metadata_only` orchestrator (`docs/MCP_TOOLS.md` §11).
130    ///
131    /// Emits a single `LogEvent::Fetch` row under `Capability::Metadata`
132    /// so the audit trail distinguishes metadata-only attempts from full
133    /// (PDF) fetches.
134    ///
135    /// # Errors
136    ///
137    /// - [`FetchError::Http`] on transport / status / size-cap failures.
138    /// - [`FetchError::SourceSchema`] if the response body is not
139    ///   well-formed Atom XML.
140    /// - [`FetchError::Log`] if the provenance row write fails
141    ///   (fail-closed per `docs/PROVENANCE_LOG.md` §5).
142    pub async fn fetch_metadata_only(
143        &self,
144        id: &ArxivId,
145        ctx: &FetchContext,
146    ) -> Result<Value, FetchError> {
147        // Same politeness gate as the full fetch path.
148        let _permit = ctx.rate_limiter.acquire(self.name()).await;
149
150        let url = self.metadata_url(id)?;
151        let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
152        let metadata = parse_atom_feed(&body)?;
153
154        // ADR-0021 §1 canonical-digest under the "arxiv" resolver
155        // profile. version=None until a follow-up slice threads the
156        // Atom-feed-discovered version (`v2`, etc.) into this row.
157        let canonical =
158            crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), self.name(), None)
159                .digest_hex();
160        ctx.log.append(RowInput {
161            event: LogEvent::Fetch,
162            result: LogResult::Ok,
163            // Distinguish metadata-only from full (PDF) fetches via the
164            // structured `capability` channel rather than mangling the
165            // `source` string — `docs/PROVENANCE_LOG.md` §3 lists
166            // `metadata` as a first-class capability value.
167            capability: Capability::Metadata,
168            ref_: Some(id.as_str()),
169            source: Some(self.name()),
170            error_code: None,
171            size_bytes: Some(body.len() as u64),
172            license: Some("arxiv-default"),
173            store_path: None,
174            canonical_digest: Some(&canonical),
175        })?;
176
177        Ok(metadata)
178    }
179}
180
181impl Default for ArxivSource {
182    fn default() -> Self {
183        Self::new()
184    }
185}
186
187#[async_trait]
188impl Source for ArxivSource {
189    fn name(&self) -> &str {
190        "arxiv"
191    }
192
193    fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
194        matches!(ref_, Ref::Arxiv(_))
195    }
196
197    async fn fetch(
198        &self,
199        ref_: &Ref,
200        _profile: &CapabilityProfile,
201        ctx: &FetchContext,
202    ) -> Result<FetchResult, FetchError> {
203        // Eligibility gate. The orchestrator is expected to call
204        // `can_serve` first, but a runtime check here gives a clean error
205        // path if it does not.
206        let id = match ref_ {
207            Ref::Arxiv(a) => a,
208            Ref::Doi(_) => {
209                return Err(FetchError::NotEligible {
210                    source_key: "arxiv".into(),
211                });
212            }
213        };
214
215        // Hold the rate-limiter permit for the duration of the HTTP
216        // fetch. Drop happens at end of scope after the log append below.
217        let _permit = ctx.rate_limiter.acquire(self.name()).await;
218
219        // ----- Atom-feed metadata leg (best-effort) -------------------
220        //
221        // Fetched BEFORE the PDF so that `FetchResult::metadata_json` is
222        // populated for a single-pass fetch (the orchestrator does not
223        // need to re-issue a metadata-only call). Failures here degrade
224        // gracefully: we set `metadata_json = None`, emit a tracing
225        // warning, and proceed with the PDF leg unchanged. NO log row
226        // is emitted from this leg — the source-level audit unit is
227        // "one fetch attempt = one row" and the row comes from the PDF
228        // leg below. This is what preserves the 4-row sequence asserted
229        // by `crates/doiget-cli/tests/fetch_arxiv_e2e.rs`.
230        let metadata_json = match self.metadata_url(id) {
231            Ok(meta_url) => match ctx.http.fetch_bytes(self.name(), meta_url).await {
232                Ok((bytes, _final)) => match parse_atom_feed(&bytes) {
233                    Ok(v) => Some(v),
234                    Err(e) => {
235                        tracing::warn!(
236                            arxiv_id = %id.as_str(),
237                            error = %e,
238                            "arxiv Atom feed parse failed; continuing with PDF-only fetch"
239                        );
240                        None
241                    }
242                },
243                Err(e) => {
244                    tracing::warn!(
245                        arxiv_id = %id.as_str(),
246                        error = %e,
247                        "arxiv Atom feed fetch failed; continuing with PDF-only fetch"
248                    );
249                    None
250                }
251            },
252            Err(e) => {
253                tracing::warn!(
254                    arxiv_id = %id.as_str(),
255                    error = %e,
256                    "arxiv metadata URL construction failed; continuing with PDF-only fetch"
257                );
258                None
259            }
260        };
261
262        // ----- PDF leg -------------------------------------------------
263        let url = self.pdf_url(id)?;
264
265        // `fetch_pdf` enforces the magic-byte check (`%PDF-`) per
266        // `docs/SECURITY.md` §1.2 — non-PDF response surfaces as
267        // `HttpError::NotAPdf`, which `From` converts to `FetchError::Http`.
268        let (body, final_url): (Bytes, Url) = ctx.http.fetch_pdf(self.name(), url).await?;
269
270        // One `event=fetch` row per attempt, per `docs/ARCHITECTURE.md` §6
271        // and `docs/PROVENANCE_LOG.md` §3. Per `docs/SECURITY.md` §1.8 a
272        // log write failure is fail-closed — the `?` aborts the fetch.
273        // ADR-0021 §1 canonical-digest: build under the "arxiv" resolver
274        // profile. version=None in Slice 4 — a follow-up may surface
275        // the `vN` discriminator from the Atom-feed `id` element.
276        let canonical = ref_.promote(self.name(), None).digest_hex();
277        ctx.log.append(RowInput {
278            event: LogEvent::Fetch,
279            result: LogResult::Ok,
280            capability: Capability::Oa,
281            ref_: Some(id.as_str()),
282            source: Some(self.name()),
283            error_code: None,
284            size_bytes: Some(body.len() as u64),
285            // arXiv does not expose a per-item license string; the
286            // platform-wide license declaration lives at
287            // <https://info.arxiv.org/help/license/>. Phase 1 records
288            // `"arxiv-default"` so the value is informative without
289            // claiming a specific Creative Commons license.
290            license: Some("arxiv-default"),
291            store_path: None,
292            canonical_digest: Some(&canonical),
293        })?;
294
295        Ok(FetchResult {
296            source: self.name().to_string(),
297            license: "arxiv-default".into(),
298            pdf_bytes: Some(body),
299            final_url: Some(final_url),
300            metadata_json,
301        })
302    }
303}
304
305// ---------------------------------------------------------------------------
306// Atom-feed parser (B.1)
307// ---------------------------------------------------------------------------
308
309/// Parse the arXiv Atom-feed response body into a structured JSON
310/// metadata object.
311///
312/// Endpoint: `https://export.arxiv.org/api/query?id_list=<id>` (see
313/// arXiv API user manual §3.1). The response is an `<feed>` document
314/// containing one `<entry>` per requested id. We extract the fields
315/// listed in `docs/SOURCES.md` §4 arXiv (title, summary/abstract,
316/// authors, published, updated, categories) into the synthetic JSON
317/// shape:
318///
319/// ```jsonc
320/// {
321///   "title": "...",
322///   "abstract": "...",
323///   "authors": ["Family, Given", ...],
324///   "published": "YYYY-MM-DDTHH:MM:SSZ",  // RFC3339 UTC, passed through verbatim
325///   "updated":   "YYYY-MM-DDTHH:MM:SSZ",
326///   "categories": ["cs.LG", "stat.ML"]
327/// }
328/// ```
329///
330/// All fields are best-effort: any missing element is omitted from the
331/// JSON output (NOT serialized as `null`). The parser is a small
332/// `quick-xml` event walker — no DOM allocation. Only the FIRST `<entry>`
333/// element is consumed (we always query a single id).
334///
335/// # Errors
336///
337/// Returns [`FetchError::SourceSchema`] if the XML is malformed (parser
338/// reports a syntax error) or if no `<entry>` element is present (arXiv
339/// returns an empty `<feed>` on an unknown id).
340pub(crate) fn parse_atom_feed(xml: &[u8]) -> Result<Value, FetchError> {
341    let mut reader = Reader::from_reader(xml);
342    let config = reader.config_mut();
343    config.trim_text(true);
344
345    // Top-level state. `in_entry` tracks whether we are inside the first
346    // (and only) `<entry>` element; once we exit, we stop collecting.
347    let mut in_entry = false;
348    let mut saw_entry = false;
349    let mut depth = 0_i32; // depth WITHIN the entry; 0 = at <entry> root
350
351    // Accumulators. Per-author state is kept on a stack so a nested
352    // `<author><name>...</name></author>` populates the right slot.
353    let mut title: Option<String> = None;
354    let mut abstract_: Option<String> = None;
355    let mut published: Option<String> = None;
356    let mut updated: Option<String> = None;
357    let mut authors: Vec<String> = Vec::new();
358    let mut categories: Vec<String> = Vec::new();
359
360    // Current text-collection target — None when we are not inside a
361    // leaf element whose text we want.
362    #[derive(Clone, Copy)]
363    enum Target {
364        Title,
365        Summary,
366        Published,
367        Updated,
368        AuthorName,
369    }
370    let mut target: Option<Target> = None;
371    let mut in_author = false;
372    let mut buf: Vec<u8> = Vec::new();
373
374    loop {
375        match reader.read_event_into(&mut buf) {
376            Ok(Event::Start(e)) => {
377                let name_bytes = e.name();
378                let local = local_name(name_bytes.as_ref());
379                if !in_entry {
380                    if local == b"entry" {
381                        in_entry = true;
382                        saw_entry = true;
383                        depth = 0;
384                    }
385                    buf.clear();
386                    continue;
387                }
388                depth += 1;
389                // Depth==1 means a direct child of `<entry>`.
390                if depth == 1 {
391                    match local {
392                        b"title" => target = Some(Target::Title),
393                        b"summary" => target = Some(Target::Summary),
394                        b"published" => target = Some(Target::Published),
395                        b"updated" => target = Some(Target::Updated),
396                        b"author" => {
397                            in_author = true;
398                            authors.push(String::new());
399                        }
400                        _ => {}
401                    }
402                } else if depth == 2 && in_author && local == b"name" {
403                    target = Some(Target::AuthorName);
404                }
405                buf.clear();
406            }
407            Ok(Event::Empty(e)) => {
408                let name_bytes = e.name();
409                let local = local_name(name_bytes.as_ref());
410                if in_entry && depth == 0 && local == b"category" {
411                    // <category term="cs.LG" scheme="..."/> — extract `term`.
412                    for attr in e.attributes().flatten() {
413                        if attr.key.as_ref() == b"term" {
414                            // quick-xml 0.40: `unescape_value()` is
415                            // deprecated in favour of `normalized_value()`
416                            // (attribute-value normalization resolves the
417                            // same character/entity references). arXiv's
418                            // Atom feed is XML 1.0.
419                            if let Ok(v) = attr.normalized_value(quick_xml::XmlVersion::Explicit1_0)
420                            {
421                                categories.push(v.into_owned());
422                            }
423                        }
424                    }
425                }
426                buf.clear();
427            }
428            Ok(Event::Text(t)) => {
429                if let Some(tg) = target {
430                    // quick-xml 0.40 removed `BytesText::unescape`.
431                    // Reproduce the old behaviour: decode the bytes, then
432                    // unescape XML entities via `quick_xml::escape::unescape`.
433                    // Best-effort — skip the text on decode/unescape error.
434                    if let Some(s) = t.decode().ok().and_then(|raw| {
435                        quick_xml::escape::unescape(&raw)
436                            .ok()
437                            .map(|c| c.into_owned())
438                    }) {
439                        match tg {
440                            Target::Title => title.get_or_insert_with(String::new).push_str(&s),
441                            Target::Summary => {
442                                abstract_.get_or_insert_with(String::new).push_str(&s)
443                            }
444                            Target::Published => {
445                                published.get_or_insert_with(String::new).push_str(&s)
446                            }
447                            Target::Updated => updated.get_or_insert_with(String::new).push_str(&s),
448                            Target::AuthorName => {
449                                if let Some(last) = authors.last_mut() {
450                                    last.push_str(&s);
451                                }
452                            }
453                        }
454                    }
455                }
456                buf.clear();
457            }
458            Ok(Event::End(e)) => {
459                if !in_entry {
460                    buf.clear();
461                    continue;
462                }
463                let name_bytes = e.name();
464                let local = local_name(name_bytes.as_ref());
465                if depth == 0 && local == b"entry" {
466                    // Done with the first entry — stop. We deliberately
467                    // ignore any subsequent entries since the orchestrator
468                    // always queries a single id.
469                    break;
470                }
471                depth -= 1;
472                if depth == 0 {
473                    if local == b"author" {
474                        in_author = false;
475                        // Drop empty author names (defensive).
476                        if let Some(last) = authors.last() {
477                            if last.is_empty() {
478                                authors.pop();
479                            }
480                        }
481                    }
482                    target = None;
483                } else if depth == 1 && in_author && local == b"name" {
484                    target = None;
485                }
486                buf.clear();
487            }
488            Ok(Event::Eof) => break,
489            Err(e) => {
490                return Err(FetchError::SourceSchema {
491                    hint: format!("arxiv Atom XML parse error: {e}"),
492                });
493            }
494            // CDATA / Comment / Decl / PI / DocType — ignored.
495            _ => {
496                buf.clear();
497            }
498        }
499    }
500
501    if !saw_entry {
502        return Err(FetchError::SourceSchema {
503            hint: "arxiv Atom feed had no <entry> element (unknown id?)".into(),
504        });
505    }
506
507    // Build the JSON object, omitting empty optionals. `serde_json::Map`
508    // preserves insertion order so the output is stable.
509    let mut obj = serde_json::Map::new();
510    if let Some(t) = title {
511        let trimmed = t.trim().to_string();
512        if !trimmed.is_empty() {
513            obj.insert("title".into(), Value::String(trimmed));
514        }
515    }
516    if let Some(a) = abstract_ {
517        let trimmed = a.trim().to_string();
518        if !trimmed.is_empty() {
519            obj.insert("abstract".into(), Value::String(trimmed));
520        }
521    }
522    if !authors.is_empty() {
523        obj.insert(
524            "authors".into(),
525            Value::Array(authors.into_iter().map(Value::String).collect()),
526        );
527    }
528    if let Some(p) = published {
529        let trimmed = p.trim().to_string();
530        if !trimmed.is_empty() {
531            obj.insert("published".into(), Value::String(trimmed));
532        }
533    }
534    if let Some(u) = updated {
535        let trimmed = u.trim().to_string();
536        if !trimmed.is_empty() {
537            obj.insert("updated".into(), Value::String(trimmed));
538        }
539    }
540    if !categories.is_empty() {
541        obj.insert(
542            "categories".into(),
543            Value::Array(categories.into_iter().map(Value::String).collect()),
544        );
545    }
546    Ok(json!(obj))
547}
548
/// Return the local part of a possibly-prefixed XML qualified name.
///
/// Everything up to and including the LAST `:` is discarded, so
/// `b"atom:entry"` becomes `b"entry"`; a name without a `:` is returned
/// unchanged. Atom uses the default namespace so most names arrive
/// unprefixed; stripping the prefix here keeps the parser robust to
/// either form without depending on quick-xml's namespace resolver (which
/// would require threading an `NsReader` and explicit prefix bindings
/// through every event).
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece (the whole slice when no
    // `:` is present); its first piece is the bytes after the last `:`.
    qname
        .rsplit(|&byte| byte == b':')
        .next()
        .unwrap_or(qname)
}
561
562// ---------------------------------------------------------------------------
563// Tests
564// ---------------------------------------------------------------------------
565
566#[cfg(test)]
567#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
568mod tests {
569    use super::*;
570
571    use std::sync::Arc;
572
573    use camino::Utf8PathBuf;
574    use tempfile::TempDir;
575    use wiremock::matchers::{method, path};
576    use wiremock::{Mock, MockServer, ResponseTemplate};
577
578    use crate::http::{HttpClient, HttpError};
579    use crate::provenance::{LogRow, ProvenanceLog};
580    use crate::rate_limiter::RateLimiter;
581    use crate::source::FetchContext;
582    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};
583
584    const TEST_SESSION_ID: &str = "01J0000000000000000000TEST";
585
586    /// Build a complete `FetchContext` against a wiremock host for use in
587    /// the source-level tests below.
588    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
589        let td = TempDir::new().expect("tempdir");
590        let log_dir =
591            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
592        let log_path = log_dir.join("test.jsonl");
593
594        let http = Arc::new(HttpClient::new_for_tests_allow_http("arxiv", wiremock_host));
595        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
596        let session_id = TEST_SESSION_ID.to_string();
597        let log = Arc::new(
598            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
599        );
600
601        (
602            td,
603            FetchContext {
604                http,
605                rate_limiter,
606                log,
607                session_id,
608            },
609        )
610    }
611
612    fn read_rows(path: &camino::Utf8Path) -> Vec<LogRow> {
613        let raw = std::fs::read_to_string(path).expect("read log");
614        raw.lines()
615            .filter(|l| !l.is_empty())
616            .map(|l| serde_json::from_str::<LogRow>(l).expect("valid LogRow"))
617            .collect()
618    }
619
620    fn profile() -> CapabilityProfile {
621        CapabilityProfile::from_env().expect("Phase 0 stub profile")
622    }
623
624    // -----------------------------------------------------------------
625    // can_serve
626    // -----------------------------------------------------------------
627
628    #[test]
629    fn arxiv_can_serve_returns_true_for_arxiv() {
630        let s = ArxivSource::new();
631        let id = ArxivId::parse("2401.12345").expect("valid id");
632        let r = Ref::Arxiv(id);
633        assert!(s.can_serve(&profile(), &r));
634    }
635
636    #[test]
637    fn arxiv_can_serve_returns_false_for_doi() {
638        let s = ArxivSource::new();
639        let r = Ref::Doi(Doi("10.1234/example".to_string()));
640        assert!(!s.can_serve(&profile(), &r));
641    }
642
643    // -----------------------------------------------------------------
644    // fetch — happy paths
645    // -----------------------------------------------------------------
646
647    #[tokio::test]
648    async fn arxiv_fetch_new_style_id_returns_pdf_bytes() {
649        let server = MockServer::start().await;
650        let body = b"%PDF-1.7\n%fixture\n".to_vec();
651        Mock::given(method("GET"))
652            .and(path("/pdf/2401.12345.pdf"))
653            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
654            .mount(&server)
655            .await;
656
657        let host = server
658            .uri()
659            .parse::<Url>()
660            .unwrap()
661            .host_str()
662            .unwrap()
663            .to_string();
664        let (_td, ctx) = build_test_context(&host);
665        let s = ArxivSource::with_base(server.uri().parse().unwrap());
666
667        let id = ArxivId::parse("2401.12345").unwrap();
668        let r = Ref::Arxiv(id);
669        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
670
671        assert_eq!(res.source, "arxiv");
672        assert_eq!(res.license, "arxiv-default");
673        let bytes = res.pdf_bytes.expect("pdf bytes set");
674        assert!(
675            bytes.starts_with(b"%PDF-"),
676            "expected PDF magic prefix, got {:?}",
677            &bytes[..bytes.len().min(8)]
678        );
679        assert_eq!(&bytes[..], &body[..]);
680    }
681
682    #[tokio::test]
683    async fn arxiv_fetch_old_style_id_returns_pdf_bytes() {
684        // Old-style id contains `/` (`cond-mat/9501001`); the URL must
685        // become `/pdf/cond-mat/9501001.pdf`. This pins the URL-builder
686        // behavior across both id shapes.
687        let server = MockServer::start().await;
688        let body = b"%PDF-1.4\n%old-style fixture\n".to_vec();
689        Mock::given(method("GET"))
690            .and(path("/pdf/cond-mat/9501001.pdf"))
691            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
692            .mount(&server)
693            .await;
694
695        let host = server
696            .uri()
697            .parse::<Url>()
698            .unwrap()
699            .host_str()
700            .unwrap()
701            .to_string();
702        let (_td, ctx) = build_test_context(&host);
703        let s = ArxivSource::with_base(server.uri().parse().unwrap());
704
705        let id = ArxivId::parse("cond-mat/9501001").expect("old-style id");
706        let r = Ref::Arxiv(id);
707        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
708
709        let bytes = res.pdf_bytes.expect("pdf bytes set");
710        assert!(bytes.starts_with(b"%PDF-"));
711        assert_eq!(&bytes[..], &body[..]);
712    }
713
714    // -----------------------------------------------------------------
715    // fetch — error paths
716    // -----------------------------------------------------------------
717
718    #[tokio::test]
719    async fn arxiv_fetch_with_doi_ref_errors_not_eligible() {
720        let server = MockServer::start().await;
721        let host = server
722            .uri()
723            .parse::<Url>()
724            .unwrap()
725            .host_str()
726            .unwrap()
727            .to_string();
728        let (_td, ctx) = build_test_context(&host);
729        let s = ArxivSource::with_base(server.uri().parse().unwrap());
730
731        let r = Ref::Doi(Doi("10.1234/example".to_string()));
732        let err = s
733            .fetch(&r, &profile(), &ctx)
734            .await
735            .expect_err("doi ref must not be eligible");
736        match err {
737            FetchError::NotEligible { source_key } => {
738                assert_eq!(source_key, "arxiv");
739            }
740            other => panic!("expected NotEligible, got {:?}", other),
741        }
742    }
743
744    #[tokio::test]
745    async fn arxiv_fetch_writes_log_row_with_arxiv_default_license() {
746        let server = MockServer::start().await;
747        let body = b"%PDF-1.7\n%log-row fixture\n".to_vec();
748        Mock::given(method("GET"))
749            .and(path("/pdf/2401.12345.pdf"))
750            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
751            .mount(&server)
752            .await;
753        let host = server
754            .uri()
755            .parse::<Url>()
756            .unwrap()
757            .host_str()
758            .unwrap()
759            .to_string();
760        let (_td, ctx) = build_test_context(&host);
761        // Capture the log path before the fetch call for later read-back.
762        let log_path = ctx.log.path().to_path_buf();
763        let s = ArxivSource::with_base(server.uri().parse().unwrap());
764
765        let id = ArxivId::parse("2401.12345").unwrap();
766        let r = Ref::Arxiv(id);
767        let _ = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
768
769        let rows = read_rows(&log_path);
770        assert_eq!(rows.len(), 1, "exactly one fetch row expected");
771        let row = &rows[0];
772        assert_eq!(row.source.as_deref(), Some("arxiv"));
773        assert_eq!(row.ref_.as_deref(), Some("2401.12345"));
774        assert_eq!(row.license.as_deref(), Some("arxiv-default"));
775        assert_eq!(row.size_bytes, Some(body.len() as u64));
776        assert!(row.error_code.is_none());
777    }
778
779    #[tokio::test]
780    async fn arxiv_non_pdf_body_rejected() {
781        // Wiremock returns 200 with a non-PDF body. The magic-byte check
782        // inside `HttpClient::fetch_pdf` rejects it as `HttpError::NotAPdf`,
783        // surfacing as `FetchError::Http`.
784        let server = MockServer::start().await;
785        Mock::given(method("GET"))
786            .and(path("/pdf/2401.12345.pdf"))
787            .respond_with(
788                ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
789            )
790            .mount(&server)
791            .await;
792        let host = server
793            .uri()
794            .parse::<Url>()
795            .unwrap()
796            .host_str()
797            .unwrap()
798            .to_string();
799        let (_td, ctx) = build_test_context(&host);
800        let s = ArxivSource::with_base(server.uri().parse().unwrap());
801
802        let id = ArxivId::parse("2401.12345").unwrap();
803        let r = Ref::Arxiv(id);
804        let err = s
805            .fetch(&r, &profile(), &ctx)
806            .await
807            .expect_err("non-pdf body must be rejected");
808        match err {
809            FetchError::Http(HttpError::NotAPdf { got }) => {
810                assert_eq!(&got, b"<html");
811            }
812            other => panic!("expected FetchError::Http(NotAPdf), got {:?}", other),
813        }
814    }
815
816    #[tokio::test]
817    async fn arxiv_404_maps_to_http_error() {
818        let server = MockServer::start().await;
819        Mock::given(method("GET"))
820            .and(path("/pdf/2401.99999.pdf"))
821            .respond_with(ResponseTemplate::new(404))
822            .mount(&server)
823            .await;
824        let host = server
825            .uri()
826            .parse::<Url>()
827            .unwrap()
828            .host_str()
829            .unwrap()
830            .to_string();
831        let (_td, ctx) = build_test_context(&host);
832        let s = ArxivSource::with_base(server.uri().parse().unwrap());
833
834        let id = ArxivId::parse("2401.99999").unwrap();
835        let r = Ref::Arxiv(id);
836        let err = s
837            .fetch(&r, &profile(), &ctx)
838            .await
839            .expect_err("404 must surface");
840        match err {
841            FetchError::Http(HttpError::HttpStatus { status, .. }) => {
842                assert_eq!(status, 404);
843            }
844            other => panic!("expected FetchError::Http(HttpStatus), got {:?}", other),
845        }
846    }
847
848    // -----------------------------------------------------------------
849    // parse_atom_feed (B.1) — unit tests
850    // -----------------------------------------------------------------
851
852    /// Synthetic Atom payload from the Slice 1 spec (deliverable B.3). Do
853    /// not hit real arXiv from tests.
854    const SAMPLE_ATOM_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
855<feed xmlns="http://www.w3.org/2005/Atom">
856  <entry>
857    <id>http://arxiv.org/abs/2401.12345v1</id>
858    <updated>2024-02-01T00:00:00Z</updated>
859    <published>2024-01-15T00:00:00Z</published>
860    <title>Example arXiv Paper Title</title>
861    <summary>This is an example abstract.</summary>
862    <author>
863      <name>Jane Doe</name>
864    </author>
865    <author>
866      <name>John Roe</name>
867    </author>
868    <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
869    <category term="stat.ML" scheme="http://arxiv.org/schemas/atom"/>
870  </entry>
871</feed>"#;
872
873    #[test]
874    fn parse_atom_feed_extracts_all_fields() {
875        let v = parse_atom_feed(SAMPLE_ATOM_FEED.as_bytes()).expect("Atom parses");
876        assert_eq!(v["title"], serde_json::json!("Example arXiv Paper Title"));
877        assert_eq!(
878            v["abstract"],
879            serde_json::json!("This is an example abstract.")
880        );
881        assert_eq!(v["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
882        assert_eq!(v["published"], serde_json::json!("2024-01-15T00:00:00Z"));
883        assert_eq!(v["updated"], serde_json::json!("2024-02-01T00:00:00Z"));
884        assert_eq!(v["categories"], serde_json::json!(["cs.LG", "stat.ML"]));
885    }
886
887    #[test]
888    fn parse_atom_feed_empty_feed_errors_source_schema() {
889        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
890<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
891        let err = parse_atom_feed(xml.as_bytes()).expect_err("empty feed must error");
892        match err {
893            FetchError::SourceSchema { hint } => {
894                assert!(
895                    hint.contains("entry"),
896                    "expected mention of <entry>; got {hint}"
897                );
898            }
899            other => panic!("expected SourceSchema, got {other:?}"),
900        }
901    }
902
903    #[test]
904    fn parse_atom_feed_omits_missing_optional_fields() {
905        // An entry with only an id and title — abstract/authors/categories
906        // absent. The output must omit those keys entirely (not emit
907        // `null`).
908        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
909<feed xmlns="http://www.w3.org/2005/Atom">
910  <entry>
911    <id>http://arxiv.org/abs/2401.00001v1</id>
912    <title>Minimal Entry</title>
913  </entry>
914</feed>"#;
915        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
916        let obj = v.as_object().expect("object");
917        assert_eq!(
918            obj.get("title").and_then(Value::as_str),
919            Some("Minimal Entry")
920        );
921        assert!(
922            !obj.contains_key("abstract"),
923            "abstract should be omitted: {obj:?}"
924        );
925        assert!(
926            !obj.contains_key("authors"),
927            "authors should be omitted: {obj:?}"
928        );
929        assert!(
930            !obj.contains_key("categories"),
931            "categories should be omitted: {obj:?}"
932        );
933    }
934
935    // -----------------------------------------------------------------
936    // fetch_metadata_only — orchestrator entry point
937    // -----------------------------------------------------------------
938
939    #[tokio::test]
940    async fn arxiv_fetch_metadata_only_returns_atom_metadata() {
941        let server = MockServer::start().await;
942        Mock::given(method("GET"))
943            .and(path("/api/query"))
944            .respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED))
945            .mount(&server)
946            .await;
947        let host = server
948            .uri()
949            .parse::<Url>()
950            .unwrap()
951            .host_str()
952            .unwrap()
953            .to_string();
954        let (_td, ctx) = build_test_context(&host);
955        let s = ArxivSource::with_base(server.uri().parse().unwrap());
956        let id = ArxivId::parse("2401.12345").unwrap();
957
958        let meta = s
959            .fetch_metadata_only(&id, &ctx)
960            .await
961            .expect("metadata_only ok");
962        assert_eq!(
963            meta["title"],
964            serde_json::json!("Example arXiv Paper Title")
965        );
966        assert_eq!(meta["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
967    }
968
969    #[tokio::test]
970    async fn arxiv_fetch_populates_metadata_json_when_atom_endpoint_mocked() {
971        // Full Source::fetch with BOTH Atom and PDF endpoints mocked must
972        // populate `metadata_json` from the Atom response.
973        let server = MockServer::start().await;
974        Mock::given(method("GET"))
975            .and(path("/api/query"))
976            .respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED))
977            .mount(&server)
978            .await;
979        Mock::given(method("GET"))
980            .and(path("/pdf/2401.12345.pdf"))
981            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\n%fix\n".to_vec()))
982            .mount(&server)
983            .await;
984        let host = server
985            .uri()
986            .parse::<Url>()
987            .unwrap()
988            .host_str()
989            .unwrap()
990            .to_string();
991        let (_td, ctx) = build_test_context(&host);
992        let s = ArxivSource::with_base(server.uri().parse().unwrap());
993        let id = ArxivId::parse("2401.12345").unwrap();
994        let r = Ref::Arxiv(id);
995
996        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
997        let meta = res.metadata_json.expect("metadata_json populated");
998        assert_eq!(
999            meta["title"],
1000            serde_json::json!("Example arXiv Paper Title")
1001        );
1002    }
1003
1004    #[tokio::test]
1005    async fn arxiv_fetch_atom_failure_falls_back_to_pdf_only() {
1006        // PDF endpoint mocked; Atom endpoint deliberately unmocked
1007        // (will 404). The fetch must still succeed with
1008        // `metadata_json = None` — the best-effort contract.
1009        let server = MockServer::start().await;
1010        Mock::given(method("GET"))
1011            .and(path("/pdf/2401.12345.pdf"))
1012            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\nx".to_vec()))
1013            .mount(&server)
1014            .await;
1015        let host = server
1016            .uri()
1017            .parse::<Url>()
1018            .unwrap()
1019            .host_str()
1020            .unwrap()
1021            .to_string();
1022        let (_td, ctx) = build_test_context(&host);
1023        let s = ArxivSource::with_base(server.uri().parse().unwrap());
1024        let id = ArxivId::parse("2401.12345").unwrap();
1025        let r = Ref::Arxiv(id);
1026
1027        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
1028        assert!(res.metadata_json.is_none());
1029        assert!(res.pdf_bytes.is_some());
1030    }
1031}