doiget-cli 0.4.0

doiget CLI binary
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
//! End-to-end wiremock-driven tests for `doiget fetch <DOI>` with the OA
//! PDF leg enabled (Phase 1 success criterion — see
//! [`docs/PHASES.md`](../../../docs/PHASES.md) §4 and
//! [`docs/REDIRECT_ALLOWLIST.md`](../../../docs/REDIRECT_ALLOWLIST.md) §3.4).
//!
//! ## What is exercised
//!
//! - `doiget_cli::commands::fetch::run_with_options` end-to-end on a DOI input.
//! - Crossref + Unpaywall fan-out to the wiremock origin.
//! - The synthetic `oa-publisher` source key with its OA URL host check
//!   pulled from `HttpClient::new_for_tests_allow_http_multi(...)` over
//!   the same wiremock host (`127.0.0.1`).
//! - `HttpClient::fetch_pdf` magic-byte enforcement (the OA endpoint
//!   serves a body starting with `%PDF-`).
//! - `FsStore::write` atomic-rename code path for PDF + metadata.
//! - `ProvenanceLog::append` writing the expected row sequence
//!   (`SessionStart` -> 3 x `Fetch ok` -> `StoreWrite ok` -> `SessionEnd`).
//!
//! ## Network purity
//!
//! Per the network-purity guard, the tests in this file make NO outbound
//! calls. All HTTP traffic terminates at a `wiremock::MockServer` on
//! `127.0.0.1:N`, reached via `DOIGET_*_BASE` env-var overrides.

#![allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]

use camino::{Utf8Path, Utf8PathBuf};
use doiget_cli::commands::fetch;
use doiget_cli::commands::output::OutputMode;
use doiget_core::provenance::{LogEvent, LogResult, LogRow};
use doiget_core::store::Metadata;
use serial_test::serial;
use tempfile::TempDir;
use wiremock::matchers::{method, path};
use wiremock::{Mock, MockServer, ResponseTemplate};

mod common;
use common::env_guard::EnvGuard;

/// Env-var keys handed to `EnvGuard::new` by every test in this file so
/// each test's environment setup is hermetic.
///
/// Only a subset is explicitly `set` by the tests here (`DOIGET_STORE_ROOT`,
/// `DOIGET_LOG_PATH`, `DOIGET_CROSSREF_BASE`, `DOIGET_UNPAYWALL_BASE`,
/// `DOIGET_OA_PUBLISHER_BASE`). The remaining keys are never set in this
/// file; presumably they are listed so the guard isolates them from the
/// ambient environment — NOTE(review): confirm against `EnvGuard`.
const ENV_KEYS: &[&str] = &[
    "DOIGET_STORE_ROOT",
    "DOIGET_LOG_PATH",
    "DOIGET_ARXIV_BASE",
    "DOIGET_CROSSREF_BASE",
    "DOIGET_UNPAYWALL_BASE",
    "DOIGET_OA_PUBLISHER_BASE",
    "DOIGET_CONTACT_EMAIL",
    "DOIGET_UNPAYWALL_EMAIL",
];

/// DOI used by every test in this file. The `/` in the suffix is deliberate:
/// it is what makes the raw and percent-encoded URL forms differ.
const TEST_DOI: &str = "10.1234/test";
/// Percent-encoded form of `TEST_DOI` as it appears on the wire after
/// `path_segments_mut().push(...)`. Wiremock matches the encoded path, so
/// the Unpaywall mocks are mounted on `/v2/10.1234%2Ftest`.
const TEST_DOI_ENCODED: &str = "10.1234%2Ftest";

/// Loads the provenance log at `path` and decodes it as JSON Lines: one
/// [`LogRow`] per non-empty line, returned in file order.
///
/// Test helper: panics if the file cannot be read or if any non-empty line
/// is not a valid `LogRow`.
fn read_log_rows(path: &Utf8PathBuf) -> Vec<LogRow> {
    let contents = std::fs::read_to_string(path.as_std_path()).expect("read log");

    let mut rows = Vec::new();
    for line in contents.lines() {
        // Skip blank lines rather than feeding them to the JSON parser.
        if line.is_empty() {
            continue;
        }
        let row: LogRow = serde_json::from_str(line).expect("valid LogRow");
        rows.push(row);
    }
    rows
}

/// Builds the JSON body served by the Crossref `/works/<doi>` mock.
///
/// This is the minimal Phase 1 shape: an `"ok"` status wrapping a `message`
/// object with title, one author, issued date, container title and type.
/// The orchestrator extracts these via `extract_crossref_fields`.
fn crossref_body() -> serde_json::Value {
    // The work record itself; wrapped in the response envelope below.
    let message = serde_json::json!({
        "title": ["E2E OA test paper"],
        "author": [{ "family": "Doe", "given": "Jane" }],
        "issued": { "date-parts": [[2026, 1, 1]] },
        "container-title": ["Synthetic Journal"],
        "type": "journal-article"
    });

    serde_json::json!({
        "status": "ok",
        "message": message
    })
}

/// Builds the JSON body served by the Unpaywall `/v2/<doi>` mock.
///
/// The record is marked open-access for `TEST_DOI`, and its
/// `best_oa_location` carries `oa_url_for_pdf` as both `url` and
/// `url_for_pdf`, with a `cc-by` license. Callers choose where that URL
/// points (e.g. the wiremock origin's `/oa/file.pdf`).
fn unpaywall_body(oa_url_for_pdf: &str) -> serde_json::Value {
    // Single OA location; the same URL is used for the landing page and the PDF.
    let best_oa_location = serde_json::json!({
        "url": oa_url_for_pdf,
        "url_for_pdf": oa_url_for_pdf,
        "license": "cc-by"
    });

    serde_json::json!({
        "doi": TEST_DOI,
        "is_oa": true,
        "title": "E2E OA test paper",
        "best_oa_location": best_oa_location
    })
}

/// Happy path: Crossref, Unpaywall and the OA PDF endpoint are all served by
/// a single wiremock server. Asserts that the PDF lands in the store
/// byte-for-byte, that the metadata TOML records `source = "oa-publisher"`
/// with the expected size/license/DOI, that the provenance log contains a
/// `Fetch ok` row for each of the three sources, and that the log's hash
/// chain is unbroken.
#[tokio::test]
#[serial]
async fn fetch_doi_oa_pdf_happy_path() {
    // Step 1: spin up ONE wiremock server and mount three paths on it
    // (Crossref `/works/<doi>`, Unpaywall `/v2/<doi>`, OA PDF
    // `/oa/file.pdf`). Per the design note: "Spin up TWO wiremock servers
    // (or one with multiple paths — simpler)" — we go with the one-server
    // option so a single host is on the redirect allowlist.
    let server = MockServer::start().await;
    let base_uri = server.uri();
    let oa_url = format!("{}/oa/file.pdf", base_uri);

    // Crossref uses `Url::join("/works/<doi>")` which does NOT URL-encode
    // the embedded `/` in the DOI suffix; so wiremock matches on the raw
    // form (`/works/10.1234/test`). Unpaywall, in contrast, uses
    // `path_segments_mut().push()` which DOES percent-encode (`%2F`).
    Mock::given(method("GET"))
        .and(path(format!("/works/{}", TEST_DOI)))
        .respond_with(ResponseTemplate::new(200).set_body_json(crossref_body()))
        .mount(&server)
        .await;

    Mock::given(method("GET"))
        .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
        .respond_with(ResponseTemplate::new(200).set_body_json(unpaywall_body(&oa_url)))
        .mount(&server)
        .await;

    // The body only needs the `%PDF-` magic prefix; it is not a real PDF.
    let pdf_body = b"%PDF-fake-bytes\n".to_vec();
    Mock::given(method("GET"))
        .and(path("/oa/file.pdf"))
        .respond_with(ResponseTemplate::new(200).set_body_bytes(pdf_body.clone()))
        .mount(&server)
        .await;

    // Step 2: stage a temp dir for store + log artifacts.
    let td = TempDir::new().expect("tempdir");
    let temp_root: Utf8PathBuf = Utf8Path::from_path(td.path())
        .expect("temp dir is utf-8")
        .to_path_buf();
    let store_root = temp_root.join("papers");
    let log_path = temp_root.join("log.jsonl");

    let env = EnvGuard::new(ENV_KEYS);
    env.set("DOIGET_STORE_ROOT", store_root.as_str());
    env.set("DOIGET_LOG_PATH", log_path.as_str());
    // The CrossrefSource hits `<base>/works/<DOI>`; pass the bare server
    // URI as base so the orchestrator's URL builder lands at `/works/...`.
    env.set("DOIGET_CROSSREF_BASE", &base_uri);
    // The UnpaywallSource hits `<base>/<DOI>`; we want `/v2/<DOI>`, so
    // include the `/v2` prefix in the base.
    env.set("DOIGET_UNPAYWALL_BASE", &format!("{}/v2", base_uri));
    // Register the OA publisher allowlist host for the test client (same
    // wiremock host as the others).
    env.set("DOIGET_OA_PUBLISHER_BASE", &base_uri);

    // Step 3: run the orchestrator end-to-end. No real network traffic.
    fetch::run_with_options(format!("doi:{}", TEST_DOI), false, OutputMode::Human)
        .await
        .expect("fetch::run_with_options succeeds");

    // Step 4: assert the on-disk PDF exists and starts with `%PDF-`.
    // On failure the message lists the entries directly under `temp_root`
    // to help diagnose where (if anywhere) artifacts were written.
    let pdf_path = store_root.join("doi_10.1234_test.pdf");
    assert!(
        pdf_path.exists(),
        "expected PDF at {pdf_path}; tree: {:?}",
        std::fs::read_dir(temp_root.as_std_path())
            .map(|d| d.flatten().map(|e| e.path()).collect::<Vec<_>>())
    );
    let pdf_bytes = std::fs::read(pdf_path.as_std_path()).expect("read pdf");
    assert_eq!(pdf_bytes, pdf_body, "stored PDF must match wiremock body");
    assert!(
        pdf_bytes.starts_with(b"%PDF-"),
        "PDF must start with magic bytes"
    );

    // Step 5: metadata TOML round-trips and has [doiget].source = "oa-publisher".
    let meta_path = store_root.join(".metadata").join("doi_10.1234_test.toml");
    let meta_raw = std::fs::read_to_string(meta_path.as_std_path()).expect("read metadata toml");
    let metadata: Metadata = toml::from_str(&meta_raw).expect("metadata round-trips");
    assert_eq!(metadata.schema_version, "1.0");
    let doiget = metadata.doiget.expect("[doiget] table present");
    assert_eq!(doiget.source, "oa-publisher");
    assert_eq!(doiget.size_bytes, pdf_body.len() as u64);
    assert_eq!(doiget.license, "cc-by");
    assert_eq!(
        metadata.doi.map(|d| d.as_str().to_string()),
        Some(TEST_DOI.to_string())
    );

    // Step 6: provenance log has at least three `Fetch ok` rows
    // (Crossref, Unpaywall, oa-publisher). The bookend rows are not
    // asserted individually here; they are only covered by the hash-chain
    // walk at the end of the test.
    let rows = read_log_rows(&log_path);
    let fetch_ok_rows: Vec<&LogRow> = rows
        .iter()
        .filter(|r| r.event == LogEvent::Fetch && r.result == LogResult::Ok)
        .collect();
    assert!(
        fetch_ok_rows.len() >= 3,
        "expected >=3 Fetch ok rows (crossref, unpaywall, oa-publisher); got {}: {:?}",
        fetch_ok_rows.len(),
        fetch_ok_rows
            .iter()
            .map(|r| r.source.as_deref().unwrap_or("?"))
            .collect::<Vec<_>>()
    );
    // Rows without a `source` are dropped here; each expected source must
    // appear at least once among the successful fetch rows.
    let sources: Vec<&str> = fetch_ok_rows
        .iter()
        .filter_map(|r| r.source.as_deref())
        .collect();
    assert!(
        sources.contains(&"crossref"),
        "expected a crossref Fetch ok row; got {:?}",
        sources
    );
    assert!(
        sources.contains(&"unpaywall"),
        "expected an unpaywall Fetch ok row; got {:?}",
        sources
    );
    assert!(
        sources.contains(&"oa-publisher"),
        "expected an oa-publisher Fetch ok row; got {:?}",
        sources
    );

    // Sanity: hash chain links rows in file order. The first row must
    // point at the "GENESIS" sentinel; every later row's `prev_hash` must
    // equal its predecessor's `this_hash`.
    assert_eq!(rows[0].prev_hash, "GENESIS");
    for i in 1..rows.len() {
        assert_eq!(
            rows[i].prev_hash,
            rows[i - 1].this_hash,
            "hash chain break at row {i}"
        );
    }

    // Explicit teardown order: env guard first, then the temp dir.
    drop(env);
    drop(td);
}

/// Off-allowlist OA URL: Unpaywall names a PDF on a host that is not on the
/// `oa-publisher` allowlist. Asserts that `run_with_options` returns an `Err`
/// carrying `CliExit(3)`, that no PDF is written, that the metadata TOML is
/// still written (with a non-`oa-publisher` source, `size_bytes == 0` and no
/// `pdf_path`), and that the provenance log holds exactly one
/// `Fetch err` row for `oa-publisher` with `error_code = NETWORK_ERROR`.
#[tokio::test]
#[serial]
async fn fetch_doi_oa_pdf_falls_back_to_metadata_when_host_off_allowlist() {
    // Failure-fallback path: Unpaywall hands back an OA URL whose host is
    // NOT registered in the test client's `oa-publisher` allowlist. The
    // orchestrator MUST log a `Fetch err / source=oa-publisher` row and
    // SKIP writing a PDF while still writing the metadata TOML (the
    // `informed-best-effort` posture in `docs/REDIRECT_ALLOWLIST.md` §3
    // keeps the metadata).
    //
    // Issue #145 / `docs/ERRORS.md` §3 + §6: the CLI persona must NOT
    // treat this blocked PDF leg as a clean `Ok(())`. The metadata is
    // still written (and pointed at), but `run_with_options` returns an
    // `Err` carrying a `CliExit` so the process exits non-zero — a
    // blocked PDF is no longer a silent success.
    let server = MockServer::start().await;
    let base_uri = server.uri();

    // The OA URL points at an `https://` host that is NOT one of our
    // registered allowlist entries. As of issue #145 / PR #163 the core
    // runs a PRE-FETCH host allowlist check on the metadata-discovered OA
    // URL in `doiget_core::orchestrator::try_fetch_oa_pdf` BEFORE the PDF
    // fetch is issued (`docs/REDIRECT_ALLOWLIST.md` §1 — NORMATIVE), not
    // only inside the redirect closure. This mock mounts NO redirect and
    // `attacker.test` is off the `oa-publisher` allowlist, so the pre-fetch
    // check rejects the OA URL with the SAME `HttpError::RedirectDenied`
    // the redirect closure produces — carrying a
    // `DenialContext(RedirectNotInAllowlist)`. The connect to
    // `attacker.test` never happens. The doiget-cli classification then
    // promotes this deliberate policy block to `CAPABILITY_DENIED` /
    // exit 3 (see the assertion + `docs/ERRORS.md` §6.1 below).
    let off_allowlist_oa_url = "https://attacker.test/file.pdf".to_string();

    // Crossref uses `Url::join("/works/<doi>")` which does NOT URL-encode
    // the embedded `/` in the DOI suffix; so wiremock matches on the raw
    // form (`/works/10.1234/test`). Unpaywall, in contrast, uses
    // `path_segments_mut().push()` which DOES percent-encode (`%2F`).
    Mock::given(method("GET"))
        .and(path(format!("/works/{}", TEST_DOI)))
        .respond_with(ResponseTemplate::new(200).set_body_json(crossref_body()))
        .mount(&server)
        .await;

    Mock::given(method("GET"))
        .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
        .respond_with(
            ResponseTemplate::new(200).set_body_json(unpaywall_body(&off_allowlist_oa_url)),
        )
        .mount(&server)
        .await;

    // Note: no PDF endpoint is mounted on the wiremock server in this test;
    // the only OA URL on offer is the off-allowlist one above.
    let td = TempDir::new().expect("tempdir");
    let temp_root: Utf8PathBuf = Utf8Path::from_path(td.path())
        .expect("temp dir is utf-8")
        .to_path_buf();
    let store_root = temp_root.join("papers");
    let log_path = temp_root.join("log.jsonl");

    let env = EnvGuard::new(ENV_KEYS);
    env.set("DOIGET_STORE_ROOT", store_root.as_str());
    env.set("DOIGET_LOG_PATH", log_path.as_str());
    env.set("DOIGET_CROSSREF_BASE", &base_uri);
    env.set("DOIGET_UNPAYWALL_BASE", &format!("{}/v2", base_uri));
    // Register the oa-publisher source with the wiremock host only;
    // `attacker.test` will not match, so the OA leg is denied by the
    // pre-fetch host allowlist check described above, before any request
    // to the OA URL is issued.
    env.set("DOIGET_OA_PUBLISHER_BASE", &base_uri);

    // Issue #145: the blocked PDF leg must surface as a non-zero exit,
    // NOT a silent `Ok(())`. The metadata is still written (asserted
    // below) but the CLI persona gets an `error[CODE]:` line + a
    // `CliExit` carrying the `docs/ERRORS.md` §4 process code.
    //
    // Issue #145 (Option B, approved): an off-allowlist / redirect-denied
    // / insecure-scheme OA-PDF block is a DELIBERATE policy denial and
    // MUST surface as `CAPABILITY_DENIED` / exit 3, not `NETWORK_ERROR` /
    // exit 1 (`docs/ERRORS.md` §2 + §6.1). The doiget-cli reclassification
    // lives in `effective_blocked_code`
    // (`crates/doiget-cli/src/commands/fetch.rs`): whenever the
    // orchestrator surfaces a `DenialContext` whose `reason` is
    // `redirect_not_in_allowlist` / `insecure_scheme` /
    // `host_in_block_list`, the CLI promotes the code to
    // `CapabilityDenied` and returns `CliExit(3)`.
    //
    // This case IS now COVERED end-to-end. PR #163 added the core
    // PRE-FETCH host allowlist check in
    // `doiget_core::orchestrator::try_fetch_oa_pdf`
    // (`docs/REDIRECT_ALLOWLIST.md` §1 — NORMATIVE): the
    // metadata-discovered OA URL is run through the `oa-publisher`
    // allowlist BEFORE the PDF fetch is issued, not only on redirect
    // hops. With `attacker.test` off the allowlist and NO redirect, the
    // pre-fetch check rejects it with the SAME `HttpError::RedirectDenied`
    // the redirect closure produces → `DenialContext(RedirectNotIn-
    // Allowlist)` (not `None`). The #162 CLI classification then promotes
    // it to `CAPABILITY_DENIED` / exit 3. This closes the former "initial
    // OA URL host off-allowlist with no redirect" gap (#145 + #163; see
    // `docs/ERRORS.md` §6.1). The metadata-still-written /
    // PDF-not-written and `error_code == NETWORK_ERROR` provenance
    // assertions below remain (the pre-fetch denial emits the same
    // closed-set `NETWORK_ERROR` provenance row as a redirect-time
    // denial; the process EXIT code is the reclassified value).
    let err = fetch::run_with_options(format!("doi:{}", TEST_DOI), false, OutputMode::Human)
        .await
        .expect_err("a blocked OA PDF leg must NOT be a silent success (issue #145)");
    // The error must be (or wrap) a `CliExit`; its `.0` field is the
    // process exit code asserted below.
    let cli_exit = err
        .downcast_ref::<doiget_cli::commands::fetch::CliExit>()
        .expect("blocked PDF leg must carry a CliExit so main maps it to a §4 exit code");
    assert_eq!(
        cli_exit.0, 3,
        "off-allowlist OA URL with NO redirect is now caught by the #163 \
         core PRE-FETCH allowlist check, which yields the SAME \
         HttpError::RedirectDenied → DenialContext(RedirectNotInAllowlist) \
         as a redirect-time denial. The #162 CLI classification promotes \
         this deliberate supply-chain policy block to CAPABILITY_DENIED → \
         exit 3 (#145 + #163; docs/ERRORS.md §6.1). It is NO LONGER a \
         generic NETWORK_ERROR / exit 1."
    );

    // PDF MUST NOT be written.
    let pdf_path = store_root.join("doi_10.1234_test.pdf");
    assert!(
        !pdf_path.exists(),
        "PDF must NOT be written on off-allowlist host; found: {pdf_path}"
    );

    // Metadata TOML MUST be written; source falls back to the metadata
    // source label (here `unpaywall` because the license came back).
    // Only "not `oa-publisher`" is asserted below, not the exact label.
    let meta_path = store_root.join(".metadata").join("doi_10.1234_test.toml");
    assert!(
        meta_path.exists(),
        "metadata TOML must be written even when the PDF leg is denied; meta_path: {meta_path}"
    );
    let meta_raw = std::fs::read_to_string(meta_path.as_std_path()).expect("read metadata toml");
    let metadata: Metadata = toml::from_str(&meta_raw).expect("metadata round-trips");
    let doiget = metadata.doiget.expect("[doiget] table present");
    assert_ne!(
        doiget.source, "oa-publisher",
        "source must NOT be oa-publisher when the OA leg failed; got {:?}",
        doiget.source
    );
    assert_eq!(
        doiget.size_bytes, 0,
        "metadata-only fallback must report size_bytes = 0"
    );
    assert!(metadata.pdf_path.is_none(), "pdf_path must be unset");

    // Provenance log MUST have a `Fetch err` row whose source is
    // `oa-publisher`.
    let rows = read_log_rows(&log_path);
    let oa_err_rows: Vec<&LogRow> = rows
        .iter()
        .filter(|r| {
            r.event == LogEvent::Fetch
                && r.result == LogResult::Err
                && r.source.as_deref() == Some("oa-publisher")
        })
        .collect();
    // Exactly one: there is a single OA candidate, so a single denied attempt.
    assert_eq!(
        oa_err_rows.len(),
        1,
        "expected exactly one Fetch err row for oa-publisher; got {:?}",
        rows.iter()
            .map(|r| (r.event, r.result, r.source.clone()))
            .collect::<Vec<_>>()
    );
    // The provenance row keeps `NETWORK_ERROR` even though the process exit
    // code asserted above is the reclassified value (3).
    assert_eq!(
        oa_err_rows[0].error_code.as_deref(),
        Some("NETWORK_ERROR"),
        "fallback row must set error_code = NETWORK_ERROR"
    );

    // Explicit teardown order: env guard first, then the temp dir.
    drop(env);
    drop(td);
}

/// Issue #120: a Crossref failure must NOT abort the DOI fetch when
/// Unpaywall alone can still deliver the OA PDF. Mount Unpaywall +
/// OA-publisher normally but DO NOT mount `/works/<doi>` (wiremock
/// 404 → `CrossrefSource` returns `Err`). The PDF must still land on
/// disk; metadata title falls back to the DOI (Crossref gave nothing).
///
/// The provenance log is written (its path is configured) but is not
/// inspected by this test.
#[tokio::test]
#[serial]
async fn fetch_doi_crossref_down_unpaywall_oa_still_yields_pdf() {
    let server = MockServer::start().await;
    let base_uri = server.uri();
    let oa_url = format!("{}/oa/file.pdf", base_uri);

    // NO `/works/<doi>` mock — Crossref gets 404 and fails.
    // Unpaywall is matched on the percent-encoded DOI path.
    Mock::given(method("GET"))
        .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
        .respond_with(ResponseTemplate::new(200).set_body_json(unpaywall_body(&oa_url)))
        .mount(&server)
        .await;
    // The body only needs the `%PDF-` magic prefix; it is not a real PDF.
    let pdf_body = b"%PDF-fake-bytes\n".to_vec();
    Mock::given(method("GET"))
        .and(path("/oa/file.pdf"))
        .respond_with(ResponseTemplate::new(200).set_body_bytes(pdf_body.clone()))
        .mount(&server)
        .await;

    // Stage a temp store root + log path for this run.
    let td = TempDir::new().expect("tempdir");
    let temp_root: Utf8PathBuf = Utf8Path::from_path(td.path())
        .expect("temp dir is utf-8")
        .to_path_buf();
    let store_root = temp_root.join("papers");
    let log_path = temp_root.join("log.jsonl");

    let env = EnvGuard::new(ENV_KEYS);
    env.set("DOIGET_STORE_ROOT", store_root.as_str());
    env.set("DOIGET_LOG_PATH", log_path.as_str());
    // Crossref is still pointed at the wiremock server, so its request
    // reaches the mock (and is answered 404) rather than a real host.
    env.set("DOIGET_CROSSREF_BASE", &base_uri);
    env.set("DOIGET_UNPAYWALL_BASE", &format!("{}/v2", base_uri));
    env.set("DOIGET_OA_PUBLISHER_BASE", &base_uri);

    fetch::run_with_options(format!("doi:{}", TEST_DOI), false, OutputMode::Human)
        .await
        .expect("fetch must succeed via Unpaywall even though Crossref failed");

    // On failure the message lists the entries directly under `temp_root`.
    let pdf_path = store_root.join("doi_10.1234_test.pdf");
    assert!(
        pdf_path.exists(),
        "PDF must be written even though Crossref failed; tree: {:?}",
        std::fs::read_dir(temp_root.as_std_path())
            .map(|d| d.flatten().map(|e| e.path()).collect::<Vec<_>>())
    );
    let pdf_bytes = std::fs::read(pdf_path.as_std_path()).expect("read pdf");
    assert_eq!(pdf_bytes, pdf_body);

    let meta_path = store_root.join(".metadata").join("doi_10.1234_test.toml");
    let meta_raw = std::fs::read_to_string(meta_path.as_std_path()).expect("read metadata toml");
    let metadata: Metadata = toml::from_str(&meta_raw).expect("metadata round-trips");
    let doiget = metadata.doiget.expect("[doiget] table present");
    assert_eq!(doiget.source, "oa-publisher");
    // Crossref produced nothing, so the title falls back to the DOI.
    assert_eq!(metadata.title, TEST_DOI);

    // Explicit teardown order: env guard first, then the temp dir.
    drop(env);
    drop(td);
}

/// ADR-0029 fetch chain: when `best_oa_location` returns a non-PDF
/// failure (here: HTTP 403 simulating a publisher WAF block) and
/// `oa_locations[]` contains an alternate URL that DOES yield a
/// PDF, the orchestrator must advance through the chain and write
/// the fallback PDF.
///
/// The dogfood case this exists for: a DOI hits `link.aps.org` and
/// is WAF-blocked (403), but the same OpenAlex / Unpaywall record
/// already names an arXiv preprint that resolves under
/// `oa-publisher` rate-limit posture. Pre-ADR-0029, the user had to
/// discover that URL manually; post-ADR-0029, the chain walker
/// recovers automatically.
///
/// Asserted here: the stored PDF equals the fallback body, the
/// provenance log has exactly two `oa-publisher` `Fetch` rows
/// (`Err` then `Ok`), and the log's hash chain is unbroken.
#[tokio::test]
#[serial]
async fn fetch_doi_oa_chain_falls_back_to_secondary_when_best_returns_403() {
    let server = MockServer::start().await;
    let base_uri = server.uri();
    // Both candidates live on the same wiremock host, so both pass the
    // host registered via `DOIGET_OA_PUBLISHER_BASE` below.
    let blocked_url = format!("{}/oa/blocked.pdf", base_uri);
    let fallback_url = format!("{}/oa/fallback.pdf", base_uri);

    // Crossref happy path.
    Mock::given(method("GET"))
        .and(path(format!("/works/{}", TEST_DOI)))
        .respond_with(ResponseTemplate::new(200).set_body_json(crossref_body()))
        .mount(&server)
        .await;

    // Unpaywall envelope: best_oa_location names the (about-to-fail)
    // publisher URL; oa_locations[] carries an alternate that the
    // chain walker advances to after the first attempt fails.
    // Note that `blocked_url` appears both as `best_oa_location` and as the
    // first `oa_locations[]` entry; the two-row assertion further down
    // therefore requires that it is attempted only once.
    let upw_body = serde_json::json!({
        "doi": TEST_DOI,
        "is_oa": true,
        "title": "E2E chain test paper",
        "best_oa_location": {
            "url":         blocked_url,
            "url_for_pdf": blocked_url,
            "license":     "cc-by"
        },
        "oa_locations": [
            { "url_for_pdf": blocked_url },
            { "url_for_pdf": fallback_url }
        ]
    });
    Mock::given(method("GET"))
        .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
        .respond_with(ResponseTemplate::new(200).set_body_json(upw_body))
        .mount(&server)
        .await;

    // First candidate: 403 (publisher WAF stand-in). Empty body.
    Mock::given(method("GET"))
        .and(path("/oa/blocked.pdf"))
        .respond_with(ResponseTemplate::new(403))
        .mount(&server)
        .await;

    // Second candidate: valid PDF. The chain walker MUST land here.
    let pdf_body = b"%PDF-fallback-bytes\n".to_vec();
    Mock::given(method("GET"))
        .and(path("/oa/fallback.pdf"))
        .respond_with(ResponseTemplate::new(200).set_body_bytes(pdf_body.clone()))
        .mount(&server)
        .await;

    // Stage a temp store root + log path for this run.
    let td = TempDir::new().expect("tempdir");
    let temp_root: Utf8PathBuf = Utf8Path::from_path(td.path())
        .expect("temp dir is utf-8")
        .to_path_buf();
    let store_root = temp_root.join("papers");
    let log_path = temp_root.join("log.jsonl");

    let env = EnvGuard::new(ENV_KEYS);
    env.set("DOIGET_STORE_ROOT", store_root.as_str());
    env.set("DOIGET_LOG_PATH", log_path.as_str());
    env.set("DOIGET_CROSSREF_BASE", &base_uri);
    env.set("DOIGET_UNPAYWALL_BASE", &format!("{}/v2", base_uri));
    env.set("DOIGET_OA_PUBLISHER_BASE", &base_uri);

    fetch::run_with_options(format!("doi:{}", TEST_DOI), false, OutputMode::Human)
        .await
        .expect("chain walker must succeed on the fallback candidate");

    // The PDF on disk MUST be the fallback body, not the blocked URL's
    // empty 403 body — confirms the chain advanced past the first hit.
    let pdf_path = store_root.join("doi_10.1234_test.pdf");
    let pdf_bytes = std::fs::read(pdf_path.as_std_path()).expect("read pdf");
    assert_eq!(
        pdf_bytes, pdf_body,
        "stored PDF MUST be the fallback candidate's body"
    );
    assert!(pdf_bytes.starts_with(b"%PDF-"));

    // Provenance log MUST show two oa-publisher Fetch rows — the first
    // an Err (403), the second an Ok (fallback success). Pre-ADR-0029
    // there was only one oa-publisher row (the err) and a blocked
    // metadata-only fallback.
    let rows = read_log_rows(&log_path);
    // Rows are kept in file order, so index 0 is the earlier attempt.
    let oa_rows: Vec<&LogRow> = rows
        .iter()
        .filter(|r| r.event == LogEvent::Fetch && r.source.as_deref() == Some("oa-publisher"))
        .collect();
    assert_eq!(
        oa_rows.len(),
        2,
        "expected 2 oa-publisher Fetch rows (one per chain candidate), got {}: {:?}",
        oa_rows.len(),
        oa_rows.iter().map(|r| &r.result).collect::<Vec<_>>()
    );
    assert_eq!(
        oa_rows[0].result,
        LogResult::Err,
        "first chain attempt is the 403"
    );
    assert_eq!(
        oa_rows[1].result,
        LogResult::Ok,
        "second chain attempt is the fallback PDF success"
    );

    // Hash chain stays intact across the multi-attempt OA leg: the first
    // row points at the "GENESIS" sentinel and every later row's
    // `prev_hash` equals its predecessor's `this_hash`.
    assert_eq!(rows[0].prev_hash, "GENESIS");
    for i in 1..rows.len() {
        assert_eq!(
            rows[i].prev_hash,
            rows[i - 1].this_hash,
            "hash chain break at row {i}"
        );
    }

    // Explicit teardown order: env guard first, then the temp dir.
    drop(env);
    drop(td);
}