Skip to main content

doiget_core/sources/
arxiv.rs

1//! arXiv source — arXiv id → PDF + Atom-feed metadata.
2//!
3//! Spec: `docs/SOURCES.md` §4 arXiv. No auth; the API has a 3-second-per-request
4//! rate guideline that doiget's 5/sec global + 200ms per-source backoff
5//! comfortably respects (no extra source-specific tuning needed).
6//!
7//! # Fetch flow (full)
8//!
9//! 1. `can_serve` returns `true` only for `Ref::Arxiv(_)`; `Ref::Doi(_)` is
10//!    rejected up front.
11//! 2. `fetch` acquires a permit from the shared `RateLimiter`, then
12//!    best-effort fetches the Atom feed (`<base>/api/query?id_list=<id>`)
13//!    and parses it into a JSON metadata object via the private
14//!    `parse_atom_feed` helper. Atom failures degrade gracefully
15//!    (`metadata_json = None` + `tracing::warn!`) — the existing 1.0
16//!    PDF-leg semantics are preserved.
17//! 3. The PDF URL `<base>/pdf/<id>.pdf` is fetched via
18//!    [`crate::http::HttpClient::fetch_pdf`] which enforces the magic-byte
19//!    (`%PDF-`) check per `docs/SECURITY.md` §1.2.
20//! 4. ONE `LogEvent::Fetch` row is appended for the PDF leg. The Atom leg
21//!    does NOT emit its own row — the source-level audit unit is
22//!    "one fetch attempt = one row" and the Atom call is a supporting
23//!    leg of the same attempt.
24//!
25//! # Metadata-only path
26//!
27//! [`ArxivSource::fetch_metadata_only`] performs ONLY the Atom feed fetch
28//! and is the entry point for the `metadata_only` orchestrator
29//! (`crate::orchestrator::metadata_only`). It MUST NOT call
30//! [`crate::http::HttpClient::fetch_pdf`] — doing so would violate the
31//! `doiget_metadata_only` contract (`docs/MCP_TOOLS.md` §11). It emits
32//! one `LogEvent::Fetch` row under `Capability::Metadata` so the audit
33//! trail distinguishes metadata-only fetches from full fetches without
34//! breaking the schema (the `capability` field is the structured channel
35//! for this distinction; spec §3 documents it as one of `oa` / `metadata`
36//! / `tdm-*`).
37
38use async_trait::async_trait;
39use bytes::Bytes;
40use quick_xml::events::Event;
41use quick_xml::Reader;
42use serde_json::{json, Value};
43use url::Url;
44
45use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
46use crate::source::{FetchContext, FetchError, FetchResult, Source};
47use crate::{ArxivId, CapabilityProfile, Ref};
48
/// Default base for the PDF endpoint. arXiv serves PDFs at
/// `https://arxiv.org/pdf/<id>` (the trailing `.pdf` is optional but
/// most reliable to include). PDFs may redirect to `cdn.arxiv.org` —
/// the per-source allowlist in `crate::http::tier_1_allowlist()` covers
/// this via the `*.arxiv.org` glob.
///
/// Only the scheme and host live here; `ArxivSource::pdf_url` appends the
/// `/pdf/<id>.pdf` path. Parsed once by `ArxivSource::new`.
const PDF_BASE: &str = "https://arxiv.org";

/// Default base for the Atom metadata endpoint. arXiv serves the API at
/// `https://export.arxiv.org/api/query` — a DIFFERENT host from the PDF
/// endpoint. Hitting `arxiv.org/api/query` instead redirects and fails
/// the metadata leg, so the two endpoints must use separate bases.
/// `export.arxiv.org` is covered by the `*.arxiv.org` allowlist glob.
///
/// Only the scheme and host live here; `ArxivSource::metadata_url` appends
/// `/api/query` and the `id_list` query pair. Parsed once by
/// `ArxivSource::new`.
const META_BASE: &str = "https://export.arxiv.org";
62
/// arXiv [`Source`] impl. PDFs are served from `arxiv.org`; Atom metadata
/// from `export.arxiv.org` (the `metadata_url` builder).
///
/// Both bases are stored as already-parsed [`Url`]s. [`ArxivSource::new`]
/// fills them from the production constants; [`ArxivSource::with_base`]
/// points both at a single caller-supplied origin.
#[derive(Clone, Debug)]
pub struct ArxivSource {
    /// PDF endpoint base (`arxiv.org` in production). `pdf_url` joins
    /// `/pdf/<id>.pdf` onto it.
    base: Url,
    /// Atom metadata endpoint base (`export.arxiv.org` in production).
    /// `metadata_url` joins `/api/query` onto it.
    meta_base: Url,
}
72
73impl ArxivSource {
74    /// Production constructor. PDFs from `arxiv.org`, Atom metadata from
75    /// `export.arxiv.org`.
76    pub fn new() -> Self {
77        // Both hard-coded constants are `'static` string literals known at
78        // compile time to be valid absolute URLs; the `expect`s can only
79        // fire if a constant regresses, which every `ArxivSource::new()`
80        // test exercises.
81        #[allow(clippy::expect_used)]
82        let base = Url::parse(PDF_BASE).expect("hard-coded PDF base URL is valid");
83        #[allow(clippy::expect_used)]
84        let meta_base = Url::parse(META_BASE).expect("hard-coded meta base URL is valid");
85        Self { base, meta_base }
86    }
87
88    /// Construct with an arbitrary base URL.
89    ///
90    /// The orchestrator (`doiget-cli::commands::fetch`) uses this to honor
91    /// the `DOIGET_ARXIV_BASE` env var, which lets integration tests point
92    /// the source at a wiremock origin without resorting to compile-time
93    /// gates. Both the PDF and metadata legs share the one override base
94    /// (a single wiremock origin serves both paths). Production callers
95    /// use [`ArxivSource::new`].
96    pub fn with_base(base: Url) -> Self {
97        Self {
98            meta_base: base.clone(),
99            base,
100        }
101    }
102
103    /// Build the PDF URL for a given arXiv id. arXiv accepts both
104    /// `/pdf/<id>` and `/pdf/<id>.pdf`; we use the trailing-`.pdf` form to
105    /// make the URL self-describing.
106    ///
107    /// Old-style ids (`cond-mat/9501001`) contain a `/` in the id itself;
108    /// the resulting path `/pdf/cond-mat/9501001.pdf` is the form arXiv
109    /// expects. Because the base URL has no path beyond `/`, `Url::join`
110    /// resolves the absolute reference `/pdf/<id>.pdf` to exactly that
111    /// path for both new-style (`2401.12345`) and old-style
112    /// (`cond-mat/9501001`) ids. The `arxiv_fetch_old_style_id_*` test
113    /// pins this behavior.
114    fn pdf_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
115        let path = format!("/pdf/{}.pdf", id.as_str());
116        self.base.join(&path).map_err(|e| FetchError::SourceSchema {
117            hint: format!("arxiv URL construction failed: {e}"),
118        })
119    }
120
121    /// Build the Atom-feed metadata URL for a given arXiv id.
122    ///
123    /// Production: `https://export.arxiv.org/api/query?id_list=<id>`. In
124    /// tests the base is the wiremock origin; the path is the same
125    /// (`/api/query?id_list=<id>`). The `export.arxiv.org` host is on the
126    /// `arxiv` redirect allowlist (per
127    /// `crate::http::tier_1_allowlist`) so the redirect closure does not
128    /// reject this leg.
129    ///
130    /// Old-style ids (`cond-mat/9501001`) contain a `/` which we
131    /// URL-encode via `query_pairs_mut().append_pair` so the wire form is
132    /// `id_list=cond-mat%2F9501001`.
133    fn metadata_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
134        let mut url = self
135            .meta_base
136            .join("/api/query")
137            .map_err(|e| FetchError::SourceSchema {
138                hint: format!("arxiv metadata URL construction failed: {e}"),
139            })?;
140        url.query_pairs_mut().append_pair("id_list", id.as_str());
141        Ok(url)
142    }
143
144    /// Fetch ONLY the Atom-feed metadata for the given arXiv id. Does NOT
145    /// touch the PDF endpoint — this is the entry point for the
146    /// `metadata_only` orchestrator (`docs/MCP_TOOLS.md` §11).
147    ///
148    /// Emits a single `LogEvent::Fetch` row under `Capability::Metadata`
149    /// so the audit trail distinguishes metadata-only attempts from full
150    /// (PDF) fetches.
151    ///
152    /// # Errors
153    ///
154    /// - [`FetchError::Http`] on transport / status / size-cap failures.
155    /// - [`FetchError::SourceSchema`] if the response body is not
156    ///   well-formed Atom XML.
157    /// - [`FetchError::Log`] if the provenance row write fails
158    ///   (fail-closed per `docs/PROVENANCE_LOG.md` §5).
159    pub async fn fetch_metadata_only(
160        &self,
161        id: &ArxivId,
162        ctx: &FetchContext,
163    ) -> Result<Value, FetchError> {
164        // Same politeness gate as the full fetch path.
165        let _permit = ctx.rate_limiter.acquire(self.name()).await;
166
167        let url = self.metadata_url(id)?;
168        let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
169        let metadata = parse_atom_feed(&body)?;
170
171        // ADR-0021 §1 canonical-digest under the "arxiv" resolver
172        // profile. version=None until a follow-up slice threads the
173        // Atom-feed-discovered version (`v2`, etc.) into this row.
174        let canonical =
175            crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), self.name(), None)
176                .digest_hex();
177        ctx.log.append(RowInput {
178            event: LogEvent::Fetch,
179            result: LogResult::Ok,
180            // Distinguish metadata-only from full (PDF) fetches via the
181            // structured `capability` channel rather than mangling the
182            // `source` string — `docs/PROVENANCE_LOG.md` §3 lists
183            // `metadata` as a first-class capability value.
184            capability: Capability::Metadata,
185            ref_: Some(id.as_str()),
186            source: Some(self.name()),
187            error_code: None,
188            size_bytes: Some(body.len() as u64),
189            license: Some("arxiv-default"),
190            store_path: None,
191            canonical_digest: Some(&canonical),
192        })?;
193
194        Ok(metadata)
195    }
196}
197
198impl Default for ArxivSource {
199    fn default() -> Self {
200        Self::new()
201    }
202}
203
204#[async_trait]
205impl Source for ArxivSource {
206    fn name(&self) -> &str {
207        "arxiv"
208    }
209
210    fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
211        matches!(ref_, Ref::Arxiv(_))
212    }
213
214    async fn fetch(
215        &self,
216        ref_: &Ref,
217        _profile: &CapabilityProfile,
218        ctx: &FetchContext,
219    ) -> Result<FetchResult, FetchError> {
220        // Eligibility gate. The orchestrator is expected to call
221        // `can_serve` first, but a runtime check here gives a clean error
222        // path if it does not.
223        let id = match ref_ {
224            Ref::Arxiv(a) => a,
225            Ref::Doi(_) => {
226                return Err(FetchError::NotEligible {
227                    source_key: "arxiv".into(),
228                });
229            }
230        };
231
232        // Hold the rate-limiter permit for the duration of the HTTP
233        // fetch. Drop happens at end of scope after the log append below.
234        let _permit = ctx.rate_limiter.acquire(self.name()).await;
235
236        // ----- Atom-feed metadata leg (best-effort) -------------------
237        //
238        // Fetched BEFORE the PDF so that `FetchResult::metadata_json` is
239        // populated for a single-pass fetch (the orchestrator does not
240        // need to re-issue a metadata-only call). Failures here degrade
241        // gracefully: we set `metadata_json = None`, emit a tracing
242        // warning, and proceed with the PDF leg unchanged. NO log row
243        // is emitted from this leg — the source-level audit unit is
244        // "one fetch attempt = one row" and the row comes from the PDF
245        // leg below. This is what preserves the 4-row sequence asserted
246        // by `crates/doiget-cli/tests/fetch_arxiv_e2e.rs`.
247        let metadata_json = match self.metadata_url(id) {
248            Ok(meta_url) => match ctx.http.fetch_bytes(self.name(), meta_url).await {
249                Ok((bytes, _final)) => match parse_atom_feed(&bytes) {
250                    Ok(v) => Some(v),
251                    Err(e) => {
252                        tracing::warn!(
253                            arxiv_id = %id.as_str(),
254                            error = %e,
255                            "arxiv Atom feed parse failed; continuing with PDF-only fetch"
256                        );
257                        None
258                    }
259                },
260                Err(e) => {
261                    tracing::warn!(
262                        arxiv_id = %id.as_str(),
263                        error = %e,
264                        "arxiv Atom feed fetch failed; continuing with PDF-only fetch"
265                    );
266                    None
267                }
268            },
269            Err(e) => {
270                tracing::warn!(
271                    arxiv_id = %id.as_str(),
272                    error = %e,
273                    "arxiv metadata URL construction failed; continuing with PDF-only fetch"
274                );
275                None
276            }
277        };
278
279        // ----- PDF leg -------------------------------------------------
280        let url = self.pdf_url(id)?;
281
282        // `fetch_pdf` enforces the magic-byte check (`%PDF-`) per
283        // `docs/SECURITY.md` §1.2 — non-PDF response surfaces as
284        // `HttpError::NotAPdf`, which `From` converts to `FetchError::Http`.
285        let (body, final_url): (Bytes, Url) = ctx.http.fetch_pdf(self.name(), url).await?;
286
287        // One `event=fetch` row per attempt, per `docs/ARCHITECTURE.md` §6
288        // and `docs/PROVENANCE_LOG.md` §3. Per `docs/SECURITY.md` §1.8 a
289        // log write failure is fail-closed — the `?` aborts the fetch.
290        // ADR-0021 §1 canonical-digest: build under the "arxiv" resolver
291        // profile. version=None in Slice 4 — a follow-up may surface
292        // the `vN` discriminator from the Atom-feed `id` element.
293        let canonical = ref_.promote(self.name(), None).digest_hex();
294        ctx.log.append(RowInput {
295            event: LogEvent::Fetch,
296            result: LogResult::Ok,
297            capability: Capability::Oa,
298            ref_: Some(id.as_str()),
299            source: Some(self.name()),
300            error_code: None,
301            size_bytes: Some(body.len() as u64),
302            // arXiv does not expose a per-item license string; the
303            // platform-wide license declaration lives at
304            // <https://info.arxiv.org/help/license/>. Phase 1 records
305            // `"arxiv-default"` so the value is informative without
306            // claiming a specific Creative Commons license.
307            license: Some("arxiv-default"),
308            store_path: None,
309            canonical_digest: Some(&canonical),
310        })?;
311
312        Ok(FetchResult {
313            source: self.name().to_string(),
314            license: "arxiv-default".into(),
315            pdf_bytes: Some(body),
316            final_url: Some(final_url),
317            metadata_json,
318        })
319    }
320}
321
322// ---------------------------------------------------------------------------
323// Atom-feed parser (B.1)
324// ---------------------------------------------------------------------------
325
326/// Parse the arXiv Atom-feed response body into a structured JSON
327/// metadata object.
328///
329/// Endpoint: `https://export.arxiv.org/api/query?id_list=<id>` (see
330/// arXiv API user manual §3.1). The response is an `<feed>` document
331/// containing one `<entry>` per requested id. We extract the fields
332/// listed in `docs/SOURCES.md` §4 arXiv (title, summary/abstract,
333/// authors, published, updated, categories) into the synthetic JSON
334/// shape:
335///
336/// ```jsonc
337/// {
338///   "title": "...",
339///   "abstract": "...",
340///   "authors": ["Family, Given", ...],
341///   "published": "YYYY-MM-DDTHH:MM:SSZ",  // RFC3339 UTC, passed through verbatim
342///   "updated":   "YYYY-MM-DDTHH:MM:SSZ",
343///   "categories": ["cs.LG", "stat.ML"],
344///   "doi": "10.1103/...",          // PUBLISHED (journal) DOI cross-ref, NOT this entry's id; omit-when-absent (#281 item 5)
345///   "journal_ref": "Phys. Rev. ..."  // omit-when-absent
346/// }
347/// ```
348///
349/// All fields are best-effort: any missing element is omitted from the
350/// JSON output (NOT serialized as `null`). The parser is a small
351/// `quick-xml` event walker — no DOM allocation. Only the FIRST `<entry>`
352/// element is consumed (we always query a single id).
353///
354/// # Errors
355///
356/// Returns [`FetchError::SourceSchema`] if the XML is malformed (parser
357/// reports a syntax error), or [`FetchError::NotFound`] if no `<entry>`
358/// element is present (arXiv returns HTTP 200 with an empty `<feed>` on an
359/// unknown id — an authoritative absence, not a parse error).
360pub(crate) fn parse_atom_feed(xml: &[u8]) -> Result<Value, FetchError> {
361    let mut reader = Reader::from_reader(xml);
362    let config = reader.config_mut();
363    config.trim_text(true);
364
365    // Top-level state. `in_entry` tracks whether we are inside the first
366    // (and only) `<entry>` element; once we exit, we stop collecting.
367    let mut in_entry = false;
368    let mut saw_entry = false;
369    let mut depth = 0_i32; // depth WITHIN the entry; 0 = at <entry> root
370
371    // Accumulators. Per-author state is kept on a stack so a nested
372    // `<author><name>...</name></author>` populates the right slot.
373    let mut title: Option<String> = None;
374    let mut abstract_: Option<String> = None;
375    let mut published: Option<String> = None;
376    let mut updated: Option<String> = None;
377    let mut authors: Vec<String> = Vec::new();
378    let mut categories: Vec<String> = Vec::new();
379    // arXiv-namespaced elements (`<arxiv:doi>`, `<arxiv:journal_ref>`):
380    // present only when the submitter supplied a published DOI / journal
381    // reference. They are the canonical arXiv → published-DOI link source
382    // (#281 item 5), surfaced here so the metadata path carries them.
383    let mut doi: Option<String> = None;
384    let mut journal_ref: Option<String> = None;
385
386    // Current text-collection target — None when we are not inside a
387    // leaf element whose text we want.
388    #[derive(Clone, Copy)]
389    enum Target {
390        Title,
391        Summary,
392        Published,
393        Updated,
394        AuthorName,
395        Doi,
396        JournalRef,
397    }
398    let mut target: Option<Target> = None;
399    let mut in_author = false;
400    let mut buf: Vec<u8> = Vec::new();
401
402    loop {
403        match reader.read_event_into(&mut buf) {
404            Ok(Event::Start(e)) => {
405                let name_bytes = e.name();
406                let local = local_name(name_bytes.as_ref());
407                if !in_entry {
408                    if local == b"entry" {
409                        in_entry = true;
410                        saw_entry = true;
411                        depth = 0;
412                    }
413                    buf.clear();
414                    continue;
415                }
416                depth += 1;
417                // Depth==1 means a direct child of `<entry>`.
418                if depth == 1 {
419                    match local {
420                        b"title" => target = Some(Target::Title),
421                        b"summary" => target = Some(Target::Summary),
422                        b"published" => target = Some(Target::Published),
423                        b"updated" => target = Some(Target::Updated),
424                        // arXiv namespace; `local_name` strips the `arxiv:`
425                        // prefix, so these match `<arxiv:doi>` /
426                        // `<arxiv:journal_ref>`.
427                        b"doi" => target = Some(Target::Doi),
428                        b"journal_ref" => target = Some(Target::JournalRef),
429                        b"author" => {
430                            in_author = true;
431                            authors.push(String::new());
432                        }
433                        _ => {}
434                    }
435                } else if depth == 2 && in_author && local == b"name" {
436                    target = Some(Target::AuthorName);
437                }
438                buf.clear();
439            }
440            Ok(Event::Empty(e)) => {
441                let name_bytes = e.name();
442                let local = local_name(name_bytes.as_ref());
443                if in_entry && depth == 0 && local == b"category" {
444                    // <category term="cs.LG" scheme="..."/> — extract `term`.
445                    for attr in e.attributes().flatten() {
446                        if attr.key.as_ref() == b"term" {
447                            // quick-xml 0.40: `unescape_value()` is
448                            // deprecated in favour of `normalized_value()`
449                            // (attribute-value normalization resolves the
450                            // same character/entity references). arXiv's
451                            // Atom feed is XML 1.0.
452                            if let Ok(v) = attr.normalized_value(quick_xml::XmlVersion::Explicit1_0)
453                            {
454                                categories.push(v.into_owned());
455                            }
456                        }
457                    }
458                }
459                buf.clear();
460            }
461            Ok(Event::Text(t)) => {
462                if let Some(tg) = target {
463                    // quick-xml 0.40 removed `BytesText::unescape`.
464                    // Reproduce the old behaviour: decode the bytes, then
465                    // unescape XML entities via `quick_xml::escape::unescape`.
466                    // Best-effort — skip the text on decode/unescape error.
467                    if let Some(s) = t.decode().ok().and_then(|raw| {
468                        quick_xml::escape::unescape(&raw)
469                            .ok()
470                            .map(|c| c.into_owned())
471                    }) {
472                        match tg {
473                            Target::Title => title.get_or_insert_with(String::new).push_str(&s),
474                            Target::Summary => {
475                                abstract_.get_or_insert_with(String::new).push_str(&s)
476                            }
477                            Target::Published => {
478                                published.get_or_insert_with(String::new).push_str(&s)
479                            }
480                            Target::Updated => updated.get_or_insert_with(String::new).push_str(&s),
481                            Target::Doi => doi.get_or_insert_with(String::new).push_str(&s),
482                            Target::JournalRef => {
483                                journal_ref.get_or_insert_with(String::new).push_str(&s)
484                            }
485                            Target::AuthorName => {
486                                if let Some(last) = authors.last_mut() {
487                                    last.push_str(&s);
488                                }
489                            }
490                        }
491                    }
492                }
493                buf.clear();
494            }
495            Ok(Event::End(e)) => {
496                if !in_entry {
497                    buf.clear();
498                    continue;
499                }
500                let name_bytes = e.name();
501                let local = local_name(name_bytes.as_ref());
502                if depth == 0 && local == b"entry" {
503                    // Done with the first entry — stop. We deliberately
504                    // ignore any subsequent entries since the orchestrator
505                    // always queries a single id.
506                    break;
507                }
508                depth -= 1;
509                if depth == 0 {
510                    if local == b"author" {
511                        in_author = false;
512                        // Drop empty author names (defensive).
513                        if let Some(last) = authors.last() {
514                            if last.is_empty() {
515                                authors.pop();
516                            }
517                        }
518                    }
519                    target = None;
520                } else if depth == 1 && in_author && local == b"name" {
521                    target = None;
522                }
523                buf.clear();
524            }
525            Ok(Event::Eof) => break,
526            Err(e) => {
527                return Err(FetchError::SourceSchema {
528                    hint: format!("arxiv Atom XML parse error: {e}"),
529                });
530            }
531            // CDATA / Comment / Decl / PI / DocType — ignored.
532            _ => {
533                buf.clear();
534            }
535        }
536    }
537
538    if !saw_entry {
539        // arXiv signals an unknown id with HTTP 200 + an empty `<feed>`
540        // (no `<entry>`), NOT a 404. Surface it as an authoritative
541        // absence so `doiget verify` classifies it `absent` (a dead
542        // reference) rather than a tolerable transport blip.
543        return Err(FetchError::NotFound {
544            hint: "arxiv Atom feed had no <entry> element (unknown id?)".into(),
545        });
546    }
547
548    // Build the JSON object, omitting empty optionals. `serde_json::Map`
549    // preserves insertion order so the output is stable.
550    let mut obj = serde_json::Map::new();
551    if let Some(t) = title {
552        let trimmed = t.trim().to_string();
553        if !trimmed.is_empty() {
554            obj.insert("title".into(), Value::String(trimmed));
555        }
556    }
557    if let Some(a) = abstract_ {
558        let trimmed = a.trim().to_string();
559        if !trimmed.is_empty() {
560            obj.insert("abstract".into(), Value::String(trimmed));
561        }
562    }
563    if !authors.is_empty() {
564        obj.insert(
565            "authors".into(),
566            Value::Array(authors.into_iter().map(Value::String).collect()),
567        );
568    }
569    if let Some(p) = published {
570        let trimmed = p.trim().to_string();
571        if !trimmed.is_empty() {
572            obj.insert("published".into(), Value::String(trimmed));
573        }
574    }
575    if let Some(u) = updated {
576        let trimmed = u.trim().to_string();
577        if !trimmed.is_empty() {
578            obj.insert("updated".into(), Value::String(trimmed));
579        }
580    }
581    // arXiv → published-DOI link (#281 item 5): omitted when the submitter
582    // did not supply a DOI / journal reference.
583    //
584    // HAZARD: this `doi` is the PUBLISHED (journal) DOI, NOT this arXiv
585    // record's own identifier. It must NOT be promoted to the reserved
586    // top-level `doi` of the store `Metadata` (STORE.md) — that field is the
587    // entry's own identity. `orchestrator::build_metadata_only_metadata`
588    // correctly forces an arXiv entry's `doi` to `None`; any future consumer
589    // mapping `metadata_json["doi"]` into `Metadata.doi` would write the
590    // wrong identity. Treat this strictly as a cross-reference.
591    if let Some(d) = doi {
592        let trimmed = d.trim().to_string();
593        if !trimmed.is_empty() {
594            obj.insert("doi".into(), Value::String(trimmed));
595        }
596    }
597    if let Some(j) = journal_ref {
598        let trimmed = j.trim().to_string();
599        if !trimmed.is_empty() {
600            obj.insert("journal_ref".into(), Value::String(trimmed));
601        }
602    }
603    if !categories.is_empty() {
604        obj.insert(
605            "categories".into(),
606            Value::Array(categories.into_iter().map(Value::String).collect()),
607        );
608    }
609    Ok(json!(obj))
610}
611
/// Return the local part of a possibly prefixed XML name: everything
/// after the last `:`, or the whole name when there is no `:`.
/// `b"atom:entry"` -> `b"entry"`, `b"entry"` -> `b"entry"`.
///
/// Atom uses the default namespace, so most names arrive unprefixed; the
/// arXiv extension elements arrive as `arxiv:...`. Comparing local parts
/// lets the parser accept either form without quick-xml's namespace
/// resolver (which would mean threading an `NsReader` and explicit prefix
/// bindings through every event).
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece; its first piece is the
    // segment after the final `:` (or the entire input if none exists).
    qname
        .rsplit(|&byte| byte == b':')
        .next()
        .unwrap_or(qname)
}
624
625// ---------------------------------------------------------------------------
626// Tests
627// ---------------------------------------------------------------------------
628
629#[cfg(test)]
630#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
631mod tests {
632    use super::*;
633
634    use std::sync::Arc;
635
636    use camino::Utf8PathBuf;
637    use tempfile::TempDir;
638    use wiremock::matchers::{method, path};
639    use wiremock::{Mock, MockServer, ResponseTemplate};
640
641    use crate::http::{HttpClient, HttpError};
642    use crate::provenance::{LogRow, ProvenanceLog};
643    use crate::rate_limiter::RateLimiter;
644    use crate::source::FetchContext;
645    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};
646
647    const TEST_SESSION_ID: &str = "01J0000000000000000000TEST";
648
649    /// Build a complete `FetchContext` against a wiremock host for use in
650    /// the source-level tests below.
651    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
652        let td = TempDir::new().expect("tempdir");
653        let log_dir =
654            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
655        let log_path = log_dir.join("test.jsonl");
656
657        let http = Arc::new(HttpClient::new_for_tests_allow_http("arxiv", wiremock_host));
658        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
659        let session_id = TEST_SESSION_ID.to_string();
660        let log = Arc::new(
661            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
662        );
663
664        (
665            td,
666            FetchContext {
667                http,
668                rate_limiter,
669                log,
670                session_id,
671                cache_root: None,
672            },
673        )
674    }
675
676    fn read_rows(path: &camino::Utf8Path) -> Vec<LogRow> {
677        let raw = std::fs::read_to_string(path).expect("read log");
678        raw.lines()
679            .filter(|l| !l.is_empty())
680            .map(|l| serde_json::from_str::<LogRow>(l).expect("valid LogRow"))
681            .collect()
682    }
683
684    fn profile() -> CapabilityProfile {
685        CapabilityProfile::from_env().expect("Phase 0 stub profile")
686    }
687
688    // -----------------------------------------------------------------
689    // can_serve
690    // -----------------------------------------------------------------
691
692    #[test]
693    fn arxiv_can_serve_returns_true_for_arxiv() {
694        let s = ArxivSource::new();
695        let id = ArxivId::parse("2401.12345").expect("valid id");
696        let r = Ref::Arxiv(id);
697        assert!(s.can_serve(&profile(), &r));
698    }
699
700    #[test]
701    fn production_metadata_url_uses_export_host_pdf_uses_arxiv() {
702        // Regression guard: the Atom metadata leg MUST hit
703        // export.arxiv.org, while PDFs hit arxiv.org. Sending metadata to
704        // arxiv.org/api/query redirects and fails the resolve.
705        let s = ArxivSource::new();
706        let id = ArxivId::parse("1706.03762").expect("valid id");
707        let meta = s.metadata_url(&id).expect("meta url");
708        assert_eq!(meta.host_str(), Some("export.arxiv.org"));
709        assert_eq!(meta.path(), "/api/query");
710        let pdf = s.pdf_url(&id).expect("pdf url");
711        assert_eq!(pdf.host_str(), Some("arxiv.org"));
712    }
713
714    #[test]
715    fn with_base_shares_one_origin_for_both_legs() {
716        // The DOIGET_ARXIV_BASE override (wiremock) serves both paths from
717        // a single origin, so meta and PDF must resolve to the same host.
718        let s = ArxivSource::with_base("http://127.0.0.1:9999".parse().expect("url"));
719        let id = ArxivId::parse("2401.12345").expect("valid id");
720        assert_eq!(
721            s.metadata_url(&id).expect("meta").host_str(),
722            s.pdf_url(&id).expect("pdf").host_str()
723        );
724    }
725
726    #[test]
727    fn arxiv_can_serve_returns_false_for_doi() {
728        let s = ArxivSource::new();
729        let r = Ref::Doi(Doi("10.1234/example".to_string()));
730        assert!(!s.can_serve(&profile(), &r));
731    }
732
733    // -----------------------------------------------------------------
734    // fetch — happy paths
735    // -----------------------------------------------------------------
736
737    #[tokio::test]
738    async fn arxiv_fetch_new_style_id_returns_pdf_bytes() {
739        let server = MockServer::start().await;
740        let body = b"%PDF-1.7\n%fixture\n".to_vec();
741        Mock::given(method("GET"))
742            .and(path("/pdf/2401.12345.pdf"))
743            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
744            .mount(&server)
745            .await;
746
747        let host = server
748            .uri()
749            .parse::<Url>()
750            .unwrap()
751            .host_str()
752            .unwrap()
753            .to_string();
754        let (_td, ctx) = build_test_context(&host);
755        let s = ArxivSource::with_base(server.uri().parse().unwrap());
756
757        let id = ArxivId::parse("2401.12345").unwrap();
758        let r = Ref::Arxiv(id);
759        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
760
761        assert_eq!(res.source, "arxiv");
762        assert_eq!(res.license, "arxiv-default");
763        let bytes = res.pdf_bytes.expect("pdf bytes set");
764        assert!(
765            bytes.starts_with(b"%PDF-"),
766            "expected PDF magic prefix, got {:?}",
767            &bytes[..bytes.len().min(8)]
768        );
769        assert_eq!(&bytes[..], &body[..]);
770    }
771
772    #[tokio::test]
773    async fn arxiv_fetch_old_style_id_returns_pdf_bytes() {
774        // Old-style id contains `/` (`cond-mat/9501001`); the URL must
775        // become `/pdf/cond-mat/9501001.pdf`. This pins the URL-builder
776        // behavior across both id shapes.
777        let server = MockServer::start().await;
778        let body = b"%PDF-1.4\n%old-style fixture\n".to_vec();
779        Mock::given(method("GET"))
780            .and(path("/pdf/cond-mat/9501001.pdf"))
781            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
782            .mount(&server)
783            .await;
784
785        let host = server
786            .uri()
787            .parse::<Url>()
788            .unwrap()
789            .host_str()
790            .unwrap()
791            .to_string();
792        let (_td, ctx) = build_test_context(&host);
793        let s = ArxivSource::with_base(server.uri().parse().unwrap());
794
795        let id = ArxivId::parse("cond-mat/9501001").expect("old-style id");
796        let r = Ref::Arxiv(id);
797        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
798
799        let bytes = res.pdf_bytes.expect("pdf bytes set");
800        assert!(bytes.starts_with(b"%PDF-"));
801        assert_eq!(&bytes[..], &body[..]);
802    }
803
804    // -----------------------------------------------------------------
805    // fetch — error paths
806    // -----------------------------------------------------------------
807
808    #[tokio::test]
809    async fn arxiv_fetch_with_doi_ref_errors_not_eligible() {
810        let server = MockServer::start().await;
811        let host = server
812            .uri()
813            .parse::<Url>()
814            .unwrap()
815            .host_str()
816            .unwrap()
817            .to_string();
818        let (_td, ctx) = build_test_context(&host);
819        let s = ArxivSource::with_base(server.uri().parse().unwrap());
820
821        let r = Ref::Doi(Doi("10.1234/example".to_string()));
822        let err = s
823            .fetch(&r, &profile(), &ctx)
824            .await
825            .expect_err("doi ref must not be eligible");
826        match err {
827            FetchError::NotEligible { source_key } => {
828                assert_eq!(source_key, "arxiv");
829            }
830            other => panic!("expected NotEligible, got {:?}", other),
831        }
832    }
833
834    #[tokio::test]
835    async fn arxiv_fetch_writes_log_row_with_arxiv_default_license() {
836        let server = MockServer::start().await;
837        let body = b"%PDF-1.7\n%log-row fixture\n".to_vec();
838        Mock::given(method("GET"))
839            .and(path("/pdf/2401.12345.pdf"))
840            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
841            .mount(&server)
842            .await;
843        let host = server
844            .uri()
845            .parse::<Url>()
846            .unwrap()
847            .host_str()
848            .unwrap()
849            .to_string();
850        let (_td, ctx) = build_test_context(&host);
851        // Capture the log path before the fetch call for later read-back.
852        let log_path = ctx.log.path().to_path_buf();
853        let s = ArxivSource::with_base(server.uri().parse().unwrap());
854
855        let id = ArxivId::parse("2401.12345").unwrap();
856        let r = Ref::Arxiv(id);
857        let _ = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
858
859        let rows = read_rows(&log_path);
860        assert_eq!(rows.len(), 1, "exactly one fetch row expected");
861        let row = &rows[0];
862        assert_eq!(row.source.as_deref(), Some("arxiv"));
863        assert_eq!(row.ref_.as_deref(), Some("2401.12345"));
864        assert_eq!(row.license.as_deref(), Some("arxiv-default"));
865        assert_eq!(row.size_bytes, Some(body.len() as u64));
866        assert!(row.error_code.is_none());
867    }
868
869    #[tokio::test]
870    async fn arxiv_non_pdf_body_rejected() {
871        // Wiremock returns 200 with a non-PDF body. The magic-byte check
872        // inside `HttpClient::fetch_pdf` rejects it as `HttpError::NotAPdf`,
873        // surfacing as `FetchError::Http`.
874        let server = MockServer::start().await;
875        Mock::given(method("GET"))
876            .and(path("/pdf/2401.12345.pdf"))
877            .respond_with(
878                ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
879            )
880            .mount(&server)
881            .await;
882        let host = server
883            .uri()
884            .parse::<Url>()
885            .unwrap()
886            .host_str()
887            .unwrap()
888            .to_string();
889        let (_td, ctx) = build_test_context(&host);
890        let s = ArxivSource::with_base(server.uri().parse().unwrap());
891
892        let id = ArxivId::parse("2401.12345").unwrap();
893        let r = Ref::Arxiv(id);
894        let err = s
895            .fetch(&r, &profile(), &ctx)
896            .await
897            .expect_err("non-pdf body must be rejected");
898        match err {
899            FetchError::Http(HttpError::NotAPdf { got }) => {
900                assert_eq!(&got, b"<html");
901            }
902            other => panic!("expected FetchError::Http(NotAPdf), got {:?}", other),
903        }
904    }
905
906    #[tokio::test]
907    async fn arxiv_404_maps_to_http_error() {
908        let server = MockServer::start().await;
909        Mock::given(method("GET"))
910            .and(path("/pdf/2401.99999.pdf"))
911            .respond_with(ResponseTemplate::new(404))
912            .mount(&server)
913            .await;
914        let host = server
915            .uri()
916            .parse::<Url>()
917            .unwrap()
918            .host_str()
919            .unwrap()
920            .to_string();
921        let (_td, ctx) = build_test_context(&host);
922        let s = ArxivSource::with_base(server.uri().parse().unwrap());
923
924        let id = ArxivId::parse("2401.99999").unwrap();
925        let r = Ref::Arxiv(id);
926        let err = s
927            .fetch(&r, &profile(), &ctx)
928            .await
929            .expect_err("404 must surface");
930        match err {
931            FetchError::Http(HttpError::HttpStatus { status, .. }) => {
932                assert_eq!(status, 404);
933            }
934            other => panic!("expected FetchError::Http(HttpStatus), got {:?}", other),
935        }
936    }
937
938    // -----------------------------------------------------------------
939    // parse_atom_feed (B.1) — unit tests
940    // -----------------------------------------------------------------
941
942    /// Synthetic single-entry Atom payload from the Slice 1 spec (deliverable
943    /// B.3): two authors, two categories. Tests must not hit the real arXiv.
944    const SAMPLE_ATOM_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
945<feed xmlns="http://www.w3.org/2005/Atom">
946  <entry>
947    <id>http://arxiv.org/abs/2401.12345v1</id>
948    <updated>2024-02-01T00:00:00Z</updated>
949    <published>2024-01-15T00:00:00Z</published>
950    <title>Example arXiv Paper Title</title>
951    <summary>This is an example abstract.</summary>
952    <author>
953      <name>Jane Doe</name>
954    </author>
955    <author>
956      <name>John Roe</name>
957    </author>
958    <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
959    <category term="stat.ML" scheme="http://arxiv.org/schemas/atom"/>
960  </entry>
961</feed>"#;
962
963    #[test]
964    fn parse_atom_feed_extracts_all_fields() {
965        let v = parse_atom_feed(SAMPLE_ATOM_FEED.as_bytes()).expect("Atom parses");
966        assert_eq!(v["title"], serde_json::json!("Example arXiv Paper Title"));
967        assert_eq!(
968            v["abstract"],
969            serde_json::json!("This is an example abstract.")
970        );
971        assert_eq!(v["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
972        assert_eq!(v["published"], serde_json::json!("2024-01-15T00:00:00Z"));
973        assert_eq!(v["updated"], serde_json::json!("2024-02-01T00:00:00Z"));
974        assert_eq!(v["categories"], serde_json::json!(["cs.LG", "stat.ML"]));
975    }
976
977    #[test]
978    fn parse_atom_feed_empty_feed_is_not_found() {
979        // An unknown arXiv id yields HTTP 200 + an empty `<feed>`. That is
980        // an authoritative absence (→ `FetchError::NotFound` →
981        // `ErrorCode::NotFound` → verify `absent`), NOT a schema error.
982        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
983<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
984        let err = parse_atom_feed(xml.as_bytes()).expect_err("empty feed must error");
985        match err {
986            FetchError::NotFound { hint } => {
987                assert!(
988                    hint.contains("entry"),
989                    "expected mention of <entry>; got {hint}"
990                );
991            }
992            other => panic!("expected NotFound, got {other:?}"),
993        }
994    }
995
996    #[test]
997    fn parse_atom_feed_captures_published_doi_and_journal_ref() {
998        // When the submitter supplied a published DOI / journal reference,
999        // arXiv emits `<arxiv:doi>` / `<arxiv:journal_ref>` (the arXiv
1000        // namespace). They are the arXiv → published-DOI link (#281 item 5)
1001        // and must surface in the metadata JSON. Absent on most entries.
1002        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1003<feed xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
1004  <entry>
1005    <id>http://arxiv.org/abs/2101.54321v2</id>
1006    <title>Published Later</title>
1007    <arxiv:doi>10.1103/PhysRevLett.130.200601</arxiv:doi>
1008    <arxiv:journal_ref>Phys. Rev. Lett. 130, 200601 (2023)</arxiv:journal_ref>
1009  </entry>
1010</feed>"#;
1011        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
1012        assert_eq!(
1013            v["doi"],
1014            serde_json::json!("10.1103/PhysRevLett.130.200601")
1015        );
1016        assert_eq!(
1017            v["journal_ref"],
1018            serde_json::json!("Phys. Rev. Lett. 130, 200601 (2023)")
1019        );
1020    }
1021
1022    #[test]
1023    fn parse_atom_feed_omits_doi_when_absent() {
1024        // The common case: no published DOI yet → no `doi` / `journal_ref`
1025        // key (omitted, not null).
1026        let v = parse_atom_feed(SAMPLE_ATOM_FEED.as_bytes()).expect("parses");
1027        let obj = v.as_object().expect("object");
1028        assert!(!obj.contains_key("doi"), "doi must be omitted: {obj:?}");
1029        assert!(
1030            !obj.contains_key("journal_ref"),
1031            "journal_ref must be omitted: {obj:?}"
1032        );
1033    }
1034
1035    #[test]
1036    fn parse_atom_feed_journal_ref_only_without_doi() {
1037        // A real, common state: a journal_ref but no DOI. The `doi` key must
1038        // be absent while `journal_ref` is present (independent extraction).
1039        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1040<feed xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
1041  <entry>
1042    <id>http://arxiv.org/abs/2101.00001v1</id>
1043    <title>Journal Ref Only</title>
1044    <arxiv:journal_ref>J. Stat. Mech. (2021) 013203</arxiv:journal_ref>
1045  </entry>
1046</feed>"#;
1047        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
1048        let obj = v.as_object().expect("object");
1049        assert!(!obj.contains_key("doi"), "doi must be omitted: {obj:?}");
1050        assert_eq!(
1051            obj.get("journal_ref").and_then(Value::as_str),
1052            Some("J. Stat. Mech. (2021) 013203")
1053        );
1054    }
1055
1056    #[test]
1057    fn parse_atom_feed_whitespace_doi_is_omitted() {
1058        // A whitespace-only `<arxiv:doi>` trims to empty and must be omitted,
1059        // not emitted as `""` (exercises the trim→empty omit branch).
1060        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1061<feed xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
1062  <entry>
1063    <id>http://arxiv.org/abs/2101.00002v1</id>
1064    <title>Blank DOI</title>
1065    <arxiv:doi>   </arxiv:doi>
1066  </entry>
1067</feed>"#;
1068        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
1069        assert!(
1070            !v.as_object().expect("object").contains_key("doi"),
1071            "whitespace-only doi must be omitted: {v:?}"
1072        );
1073    }
1074
1075    #[test]
1076    fn parse_atom_feed_omits_missing_optional_fields() {
1077        // An entry with only an id and title — abstract/authors/categories
1078        // absent. The output must omit those keys entirely (not emit
1079        // `null`).
1080        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1081<feed xmlns="http://www.w3.org/2005/Atom">
1082  <entry>
1083    <id>http://arxiv.org/abs/2401.00001v1</id>
1084    <title>Minimal Entry</title>
1085  </entry>
1086</feed>"#;
1087        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
1088        let obj = v.as_object().expect("object");
1089        assert_eq!(
1090            obj.get("title").and_then(Value::as_str),
1091            Some("Minimal Entry")
1092        );
1093        assert!(
1094            !obj.contains_key("abstract"),
1095            "abstract should be omitted: {obj:?}"
1096        );
1097        assert!(
1098            !obj.contains_key("authors"),
1099            "authors should be omitted: {obj:?}"
1100        );
1101        assert!(
1102            !obj.contains_key("categories"),
1103            "categories should be omitted: {obj:?}"
1104        );
1105    }
1106
1107    // -----------------------------------------------------------------
1108    // fetch_metadata_only — orchestrator entry point
1109    // -----------------------------------------------------------------
1110
1111    #[tokio::test]
1112    async fn arxiv_fetch_metadata_only_returns_atom_metadata() {
1113        let server = MockServer::start().await;
1114        Mock::given(method("GET"))
1115            .and(path("/api/query"))
1116            .respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED))
1117            .mount(&server)
1118            .await;
1119        let host = server
1120            .uri()
1121            .parse::<Url>()
1122            .unwrap()
1123            .host_str()
1124            .unwrap()
1125            .to_string();
1126        let (_td, ctx) = build_test_context(&host);
1127        let s = ArxivSource::with_base(server.uri().parse().unwrap());
1128        let id = ArxivId::parse("2401.12345").unwrap();
1129
1130        let meta = s
1131            .fetch_metadata_only(&id, &ctx)
1132            .await
1133            .expect("metadata_only ok");
1134        assert_eq!(
1135            meta["title"],
1136            serde_json::json!("Example arXiv Paper Title")
1137        );
1138        assert_eq!(meta["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
1139    }
1140
1141    #[tokio::test]
1142    async fn arxiv_fetch_populates_metadata_json_when_atom_endpoint_mocked() {
1143        // Full Source::fetch with BOTH Atom and PDF endpoints mocked must
1144        // populate `metadata_json` from the Atom response.
1145        let server = MockServer::start().await;
1146        Mock::given(method("GET"))
1147            .and(path("/api/query"))
1148            .respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED))
1149            .mount(&server)
1150            .await;
1151        Mock::given(method("GET"))
1152            .and(path("/pdf/2401.12345.pdf"))
1153            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\n%fix\n".to_vec()))
1154            .mount(&server)
1155            .await;
1156        let host = server
1157            .uri()
1158            .parse::<Url>()
1159            .unwrap()
1160            .host_str()
1161            .unwrap()
1162            .to_string();
1163        let (_td, ctx) = build_test_context(&host);
1164        let s = ArxivSource::with_base(server.uri().parse().unwrap());
1165        let id = ArxivId::parse("2401.12345").unwrap();
1166        let r = Ref::Arxiv(id);
1167
1168        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
1169        let meta = res.metadata_json.expect("metadata_json populated");
1170        assert_eq!(
1171            meta["title"],
1172            serde_json::json!("Example arXiv Paper Title")
1173        );
1174    }
1175
1176    #[tokio::test]
1177    async fn arxiv_fetch_atom_failure_falls_back_to_pdf_only() {
1178        // PDF endpoint mocked; Atom endpoint deliberately unmocked
1179        // (will 404). The fetch must still succeed with
1180        // `metadata_json = None` — the best-effort contract.
1181        let server = MockServer::start().await;
1182        Mock::given(method("GET"))
1183            .and(path("/pdf/2401.12345.pdf"))
1184            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\nx".to_vec()))
1185            .mount(&server)
1186            .await;
1187        let host = server
1188            .uri()
1189            .parse::<Url>()
1190            .unwrap()
1191            .host_str()
1192            .unwrap()
1193            .to_string();
1194        let (_td, ctx) = build_test_context(&host);
1195        let s = ArxivSource::with_base(server.uri().parse().unwrap());
1196        let id = ArxivId::parse("2401.12345").unwrap();
1197        let r = Ref::Arxiv(id);
1198
1199        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
1200        assert!(res.metadata_json.is_none());
1201        assert!(res.pdf_bytes.is_some());
1202    }
1203}