doiget_cli/commands/
capabilities.rs

1//! `doiget capabilities` — single-shot inventory JSON for LLM cold-boot
2//! (#214).
3//!
4//! Emits a single JSON value describing the **full surface** of this
5//! `doiget` binary: subcommands (walked from the live `clap::Command`
6//! tree so the inventory cannot drift from the parser), positional args
7//! and named flags per subcommand, global flags, the four
8//! [`super::output::OutputMode`] values, hand-maintained env-var + example tables, the
9//! `doiget_*` MCP tool list, compile-time features, and a `docs` map
10//! pointing at the canonical spec files.
11//!
12//! Design rationale: the existing `--help` output lists subcommand
13//! names but the rest of doiget's surface (env vars, MCP tools, JSON
14//! schemas, ADR refs) is scattered across `docs/`. An LLM cold-booted
15//! into doiget — no repo access, no follow-up doc reads — cannot
16//! discover those via `--help` alone. This subcommand closes that gap
17//! with one round-trip.
18//!
19//! # Output mode
20//!
//! `doiget capabilities` is a **product-output** command per the
//! ADR-0017 convention (`--mode` is informational; the JSON inventory
//! is the artefact). Only an **explicit** quiet request (`--quiet` /
//! `-q` / `--mode quiet` / `DOIGET_MODE=quiet`) suppresses stdout
//! (#203 / CONFIG.md §5); every other mode — including the implicit
//! non-TTY quiet fallback (#219) — emits the same JSON.
25//!
26//! # Wire-format stability (whole module)
27//!
28//! Every `pub` struct / enum below carries `#[non_exhaustive]`. Adding
29//! a field is non-breaking; renaming or removing one is a
30//! compile-time break for downstream Rust consumers and a
31//! `[BREAKING]`-class change for JSON consumers (CHANGELOG must call
32//! it out). The per-item `#[non_exhaustive]` attributes intentionally
33//! carry no inline comment; this module-doc says it once.
34
35use anyhow::{Context, Result};
36use serde::Serialize;
37
/// Top-level capability inventory. Serialised to stdout as one JSON
/// value. Field names are part of the public wire format: renaming
/// any field is a semver minor with a CHANGELOG `\[BREAKING\]` callout
/// (same discipline as `EntryInfo` / `MigrationReport` in #213).
///
/// Assembled by [`build_capabilities`]: `global_flags` and
/// `subcommands` are derived from the `clap::Command` passed in,
/// while `modes`, `env_vars`, `mcp_tools` and `docs` are the
/// hand-maintained static tables defined in this module.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct Capabilities {
    /// `CARGO_PKG_VERSION` for this build.
    pub version: &'static str,
    /// Cargo features compiled into this binary. Contains `"oa-only"`
    /// in stock release builds (the default feature). Empty only when
    /// the crate was built with `--no-default-features` and **no
    /// other features enabled**; a build like
    /// `cargo build --no-default-features --features citation`
    /// yields `["citation"]`, not `[]`.
    pub features: Vec<&'static str>,
    /// All four [`super::output::OutputMode`] values; the parser accepts these for
    /// `--mode`. Mirrors `CONFIG.md` §5 (CLI flags). Populated from the
    /// hand-maintained `MODES` table, not from the enum itself.
    pub modes: &'static [&'static str],
    /// Global flags that apply to every subcommand.
    pub global_flags: Vec<FlagSpec>,
    /// One entry per CLI subcommand (clap-walked). Subcommands with no
    /// hand-maintained metadata entry (e.g. clap's auto-added `help`)
    /// are omitted.
    pub subcommands: Vec<SubcommandSpec>,
    /// `DOIGET_*` env vars from CONFIG.md §4.
    pub env_vars: &'static [EnvVar],
    /// MCP tools exposed by `doiget serve` (hand-coded; the source of
    /// truth is `docs/MCP_TOOLS.md` §1).
    pub mcp_tools: &'static [McpTool],
    /// Canonical doc paths an LLM can pull for deeper context.
    pub docs: Docs,
    /// Number of user-extension allowlist hosts loaded from
    /// `<config_dir>/doiget/config.toml` per ADR-0028 D2. `0` if the
    /// config file is missing, contains no `[[network.additional_hosts]]`,
    /// or fails to parse — run `doiget config doctor` to diagnose parse
    /// failures. Exposed so an LLM can confirm at cold-boot whether the
    /// curated allowlist has been extended on this host.
    pub user_extension_count: usize,
}
77
/// What kind of value (if any) a [`FlagSpec`] carries.
///
/// Typed (not `&'static str`) so a typo can't slip into the wire
/// format and the `Enum`-implies-`values`-present invariant is
/// expressible at the type layer (see #215 for the design pass). Serialises
/// as the lowercased variant name: `"bool"`, `"enum"`, `"string"`.
///
/// Classification happens in `arg_to_flag_spec`, in this order:
/// `SetTrue` / `SetFalse` actions become `Bool`; otherwise a value
/// parser that reports possible values becomes `Enum`; everything
/// else becomes `String`.
#[non_exhaustive]
#[derive(Debug, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum FlagKind {
    /// Boolean switch (no value).
    Bool,
    /// Value-bounded flag — `values` carries the accepted set.
    Enum,
    /// Any non-`Bool`, non-`Enum` flag. Today every such flag emits
    /// `"string"`; richer typing (`Path` / `Int` etc.) is intentionally
    /// out of scope until a real consumer needs it — `#[non_exhaustive]`
    /// reserves space without commitment.
    String,
}
98
/// Wire description of one named (non-positional) flag, either a
/// global flag or a per-subcommand flag. Built from a `clap::Arg` by
/// `arg_to_flag_spec`.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct FlagSpec {
    /// e.g. `--mode`, `--json`, `-q`. The long form is preferred; the
    /// short form is used only when the flag has no long name.
    pub name: String,
    /// Boolean / enum / free-string discriminator. See [`FlagKind`].
    pub kind: FlagKind,
    /// `clap` `help` text.
    pub help: Option<String>,
    /// For `kind == FlagKind::Enum`: the accepted values, harvested
    /// from clap's `PossibleValuesParser`. Owned (not `&'static`) so
    /// the helper works for any future enum flag, not just `--mode`
    /// (see #215). Omitted from the JSON entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub values: Option<Vec<String>>,
}
116
/// Wire description of one CLI subcommand: the clap-derived parts
/// (`name`, `summary`, `args`, `flags`) merged with the
/// hand-maintained metadata (`examples`, `json_mode`,
/// `feature_gated`) returned by `metadata_for`.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct SubcommandSpec {
    /// The clap command name, e.g. `list-recent`.
    pub name: String,
    /// The clap `about` text, if the subcommand sets one.
    pub summary: Option<String>,
    /// Positional arguments, in clap's iteration order.
    pub args: Vec<ArgSpec>,
    /// Named flags specific to this subcommand. Root-level global
    /// flags and clap's built-in help/version flags are excluded.
    pub flags: Vec<FlagSpec>,
    /// Hand-maintained canonical invocations.
    pub examples: &'static [&'static str],
    /// How this command interacts with `--mode json`. See [`JsonMode`].
    pub json_mode: JsonMode,
    /// Cargo feature this subcommand is gated behind, if any.
    /// Omitted from the JSON entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub feature_gated: Option<&'static str>,
}
133
/// What kind of positional argument an [`ArgSpec`] describes.
///
/// Currently every entry is `Positional`; the typed enum reserves
/// space for future variants (e.g. `Stdin` markers) without breaking
/// existing JSON consumers. Serialises as `"positional"`.
///
/// The only construction site in this module is
/// `split_args_and_flags`, for clap args where `is_positional()` is
/// true.
#[non_exhaustive]
#[derive(Debug, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum ArgKind {
    /// A required-or-optional positional argument on the subcommand.
    Positional,
}
146
/// Wire description of one positional argument of a subcommand.
/// Built in `split_args_and_flags` from the corresponding `clap::Arg`.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct ArgSpec {
    /// The clap argument id, rendered as a string.
    pub name: String,
    /// Always [`ArgKind::Positional`] today. Kept as a discriminator
    /// so the JSON shape can grow new arg kinds later without
    /// renaming fields (see #215 for the design pass).
    pub kind: ArgKind,
    /// `clap` `help` text, if any.
    pub help: Option<String>,
    /// `true` when the arg has no default and no `Option<T>` wrapper.
    /// Taken directly from clap's `is_required_set()`.
    pub required: bool,
}
160
/// How a subcommand interacts with `--mode json`.
///
/// Wire shape: every variant serialises to an object with a `status`
/// discriminant, so a consumer sees uniform `{"status":"…", …}`
/// records (`#[serde(tag = "status")]`). Before #215 the previous
/// mixed string/object representation forced consumers to handle two
/// JSON shapes for sibling variants.
///
/// With the current field-less variants the two possible values are
/// `{"status":"artifact"}` and `{"status":"supported"}`.
///
/// **Tuple variants not permitted.** `#[serde(tag = "status")]`
/// requires the tag to live in the same flat object as variant
/// fields; tuple variants are incompatible with internally-tagged
/// representation. Future variants MUST use named fields.
#[non_exhaustive] // Adding a future variant is non-breaking for JSON consumers.
#[derive(Debug, Serialize)]
#[serde(tag = "status", rename_all = "lowercase")]
pub enum JsonMode {
    /// The command's primary output IS the requested artifact, not
    /// informational chatter. `--mode` is informational here; the
    /// exact stdout shape (e.g. JSON for `csl` / `graph` /
    /// `capabilities` and the JSON-RPC stream from `serve`; BibTeX
    /// for `bib`; PDF-on-disk + stderr summary for `fetch`; a
    /// `--dry-run` JSON plan in the dry-run variants) is fixed by
    /// the subcommand and may vary across flags. **Consult
    /// `examples` for the per-flag stdout form** rather than
    /// assuming JSON.
    Artifact,
    /// Under `--mode json` the command emits a structured JSON body
    /// on stdout; otherwise the human form (e.g. `info`,
    /// `list-recent`, `audit-log`, `provenance migrate`, `batch`).
    Supported,
    // NOTE: a `Deferred { tracking: &'static str }` variant was
    // sketched during #214's design phase but never instantiated by
    // any subcommand. Removed in the #215 self-review pass to avoid
    // shipping an unused wire shape; `#[non_exhaustive]` keeps the
    // door open to add it back non-breakingly when a real consumer
    // emerges.
}
198
/// One row of the hand-maintained environment-variable table
/// (`ENV_VARS`). All fields are static strings so the table can live
/// in a `const`.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct EnvVar {
    /// Variable name, e.g. `DOIGET_STORE_ROOT`.
    pub name: &'static str,
    /// `(none)` when no built-in default. This is a human-readable
    /// description rather than a machine-parseable value: other rows
    /// use parenthesised notes such as `(off)` or a fallback hint.
    pub default: &'static str,
    /// One-line description of what the variable controls.
    pub help: &'static str,
}
208
/// One row of the hand-maintained MCP tool table (`MCP_TOOLS`).
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct McpTool {
    /// MCP tool identifier, e.g. `doiget_fetch_paper`.
    pub name: &'static str,
    /// Anchor-style reference into `docs/MCP_TOOLS.md`.
    pub schema_ref: &'static str,
}
217
/// Map of canonical documentation files, serialised as the `docs`
/// object of the inventory. Each field holds a repo-relative path;
/// the concrete values live in the `DOCS` constant.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct Docs {
    /// Path of `CONFIG.md` (env vars in §4, CLI flags in §5).
    pub config: &'static str,
    /// Path of `ERRORS.md`.
    pub errors: &'static str,
    /// Path of `SCOPE.md`.
    pub scope: &'static str,
    /// Path of `MCP_TOOLS.md` (the MCP tool list).
    pub mcp: &'static str,
    /// Path of `SOURCES.md`.
    pub sources: &'static str,
    /// Path of `REDIRECT_ALLOWLIST.md`.
    pub redirect_allowlist: &'static str,
    /// Path of `PROVENANCE_LOG.md`.
    pub provenance_log: &'static str,
}
230
231// ---------------------------------------------------------------------------
232// Static tables
233// ---------------------------------------------------------------------------
234
/// Accepted `--mode` values, surfaced as `Capabilities::modes`.
/// Hand-maintained: nothing in this module derives the list from
/// `OutputMode`, so it must be updated by hand when a mode is added.
const MODES: &[&str] = &["human", "json", "quiet", "mcp"];
236
/// Hand-maintained table of `DOIGET_*` environment variables,
/// surfaced verbatim as `Capabilities::env_vars`. The `default`
/// column is descriptive text (a literal value where one exists, a
/// parenthesised note otherwise); it is not read back by any code in
/// this module.
const ENV_VARS: &[EnvVar] = &[
    EnvVar {
        name: "DOIGET_STORE_ROOT",
        default: "$HOME/papers",
        help: "Root of the on-disk paper store. CONFIG.md §4.",
    },
    EnvVar {
        name: "DOIGET_CACHE_ROOT",
        default: "$HOME/.cache/doiget",
        help: "Root of the on-disk HTTP / metadata cache. CONFIG.md §4.",
    },
    EnvVar {
        name: "DOIGET_LOG_PATH",
        default: "<config_dir>/doiget/access.jsonl",
        help: "JSON-Lines provenance log file path (PROVENANCE_LOG.md §3).",
    },
    EnvVar {
        name: "DOIGET_LOG_RETENTION_DAYS",
        default: "90",
        help: "Rotated-segment retention window (0 disables pruning). #140 / PROVENANCE_LOG.md §6.",
    },
    EnvVar {
        name: "DOIGET_MODE",
        default: "(none)",
        help: "Output mode (`human`/`json`/`quiet`/`mcp`). ADR-0017 ladder rung 3.",
    },
    EnvVar {
        name: "DOIGET_CONTACT_EMAIL",
        default: "(none)",
        help: "Contact email for polite User-Agent header (CONFIG.md §4).",
    },
    EnvVar {
        name: "DOIGET_UNPAYWALL_EMAIL",
        default: "(falls back to DOIGET_CONTACT_EMAIL)",
        help: "Unpaywall-specific contact email.",
    },
    EnvVar {
        name: "DOIGET_USER_AGENT",
        default: "(default polite UA)",
        help: "Override the User-Agent header for all outbound requests.",
    },
    EnvVar {
        name: "DOIGET_ENABLE_OPENALEX",
        default: "(off)",
        help: "Enable the OpenAlex citation graph source (graph subcommand prerequisite).",
    },
    EnvVar {
        name: "DOIGET_ARXIV_BASE",
        default: "https://export.arxiv.org/",
        help: "arXiv API base URL — primarily for testing/wiremock override.",
    },
    EnvVar {
        name: "DOIGET_CROSSREF_BASE",
        default: "https://api.crossref.org/",
        help: "Crossref API base URL.",
    },
    EnvVar {
        name: "DOIGET_UNPAYWALL_BASE",
        default: "https://api.unpaywall.org/",
        help: "Unpaywall API base URL.",
    },
];
299
/// Hand-maintained list of MCP tools, surfaced verbatim as
/// `Capabilities::mcp_tools`. Every entry points at the shared
/// tool-list anchor in `docs/MCP_TOOLS.md` except
/// `doiget_metadata_only`, which points at its own section anchor.
const MCP_TOOLS: &[McpTool] = &[
    McpTool {
        name: "doiget_resolve_paper",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_fetch_paper",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_metadata_only",
        schema_ref: "docs/MCP_TOOLS.md#11-doiget_metadata_only-normative",
    },
    McpTool {
        name: "doiget_batch_fetch",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_info",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_search_local",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_paper_search",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_paper_text",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_link",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_list_recent",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_paper_pdf_path",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_capability_profile",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_health",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_expand_citation_graph",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_bibtex_export",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_csl_export",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        // ADR-0030 D6: parse a CSL-JSON / (future) BibTeX file and
        // fetch each resolvable entry; each result row carries the
        // source bibliography's `entry_key` so a Zotero / Mendeley
        // plugin can bridge the fetched PDF back to the originating
        // reference.
        name: "doiget_batch_from_bibliography",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
];
375
/// Repo-relative paths of the canonical spec files, surfaced verbatim
/// as `Capabilities::docs`.
const DOCS: Docs = Docs {
    config: "docs/CONFIG.md",
    errors: "docs/ERRORS.md",
    scope: "docs/SCOPE.md",
    mcp: "docs/MCP_TOOLS.md",
    sources: "docs/SOURCES.md",
    redirect_allowlist: "docs/REDIRECT_ALLOWLIST.md",
    provenance_log: "docs/PROVENANCE_LOG.md",
};
385
/// Per-subcommand hand-maintained metadata. The clap walk provides
/// name + summary + args + flags; this table adds examples,
/// `json_mode` semantics, and feature-gating that clap doesn't
/// expose. A regression unit test asserts every clap-visible
/// subcommand has an entry here (otherwise the test fails loudly).
///
/// **Maintenance:** `feature_gated` MUST be kept in sync with the
/// corresponding `#[cfg(feature = …)]` annotation in `main.rs`. There
/// is no compile-time check; the `every_test_cli_subcommand_has_metadata`
/// regression test does not cover feature-gating directly — it only
/// asserts metadata exists. Add a CI matrix entry (`--features
/// citation`) when introducing new gated subcommands so the e2e
/// assertion list catches drift (see #215). Alternatively, add a
/// unit test that asserts `metadata_for("graph").unwrap().feature_gated
/// == Some("citation")` to lock the gate at the lib-test layer.
struct SubcommandMeta {
    // Canonical invocations; copied into `SubcommandSpec::examples`.
    examples: &'static [&'static str],
    // `--mode json` behaviour; copied into `SubcommandSpec::json_mode`.
    json_mode: JsonMode,
    // Gating Cargo feature, if any; copied into
    // `SubcommandSpec::feature_gated`.
    feature_gated: Option<&'static str>,
}
406
407fn metadata_for(subcommand: &str) -> Option<SubcommandMeta> {
408    let m = match subcommand {
409        "fetch" => SubcommandMeta {
410            examples: &[
411                "doiget fetch 10.1234/foo",
412                "doiget fetch arxiv:2401.12345",
413                "doiget fetch 10.1234/foo --dry-run",
414            ],
415            // The success summary is on stderr (ADR-0001); the
416            // dry-run plan is JSON product output (ADR-0022).
417            json_mode: JsonMode::Artifact,
418            feature_gated: None,
419        },
420        "batch" => SubcommandMeta {
421            examples: &[
422                "doiget batch refs.txt",
423                "doiget batch refs.txt --dry-run",
424                "doiget batch refs.txt --json",
425            ],
426            // `--json` emits the ERRORS.md §3 JSONL per-ref shape (#205).
427            json_mode: JsonMode::Supported,
428            feature_gated: None,
429        },
430        "verify" => SubcommandMeta {
431            examples: &[
432                "doiget verify refs.bib",
433                "doiget verify library.bib --strict",
434                "doiget verify refs.txt --format refs",
435            ],
436            // Emits one JSON-Lines record per entry regardless of mode;
437            // the JSONL stream is the product output.
438            json_mode: JsonMode::Artifact,
439            feature_gated: None,
440        },
441        "lint" => SubcommandMeta {
442            examples: &["doiget lint refs.bib", "doiget lint library.bib --strict"],
443            // Emits one JSON-Lines finding per issue; the JSONL stream is
444            // the product output (mirrors verify).
445            json_mode: JsonMode::Artifact,
446            feature_gated: None,
447        },
448        "info" => SubcommandMeta {
449            examples: &[
450                "doiget info 10.1234/foo",
451                "doiget info arxiv:2401.12345 --json",
452            ],
453            json_mode: JsonMode::Supported,
454            feature_gated: None,
455        },
456        "list-recent" => SubcommandMeta {
457            examples: &[
458                "doiget list-recent",
459                "doiget list-recent 20",
460                "doiget list-recent --json",
461            ],
462            json_mode: JsonMode::Supported,
463            feature_gated: None,
464        },
465        "search" => SubcommandMeta {
466            examples: &[
467                "doiget search 'quantum entanglement'",
468                "doiget search renormalization --json",
469            ],
470            json_mode: JsonMode::Supported,
471            feature_gated: None,
472        },
473        "bib" => SubcommandMeta {
474            examples: &["doiget bib 10.1234/foo", "doiget bib arxiv:2401.12345"],
475            // BibTeX output is the product; `--mode` is informational.
476            json_mode: JsonMode::Artifact,
477            feature_gated: None,
478        },
479        "cite" => SubcommandMeta {
480            examples: &["doiget cite 10.1234/foo", "doiget cite arxiv:2401.12345"],
481            // BibTeX output is the product (resolved live); `--mode` is
482            // informational, mirroring `bib`.
483            json_mode: JsonMode::Artifact,
484            feature_gated: None,
485        },
486        "csl" => SubcommandMeta {
487            examples: &["doiget csl 10.1234/foo"],
488            json_mode: JsonMode::Artifact,
489            feature_gated: None,
490        },
491        "text" => SubcommandMeta {
492            examples: &[
493                "doiget text arxiv:2401.12345",
494                "doiget text arxiv:2401.12345 --max-chars 8000",
495                "doiget text arxiv:2401.12345 --json",
496            ],
497            // The extracted text (human layout or `PaperText` JSON) is the
498            // product output; `--mode json` switches its shape.
499            json_mode: JsonMode::Supported,
500            feature_gated: None,
501        },
502        "link" => SubcommandMeta {
503            examples: &[
504                "doiget link 10.1103/PhysRevB.1",
505                "doiget link 10.1103/PhysRevB.1 --json",
506            ],
507            // The identity cluster (human lines or `PaperLinks` JSON) is the
508            // product output; `--mode json` switches its shape.
509            json_mode: JsonMode::Supported,
510            feature_gated: None,
511        },
512        "audit-log" => SubcommandMeta {
513            examples: &[
514                "doiget audit-log --verify",
515                "doiget audit-log --verify --json",
516                "doiget --quiet audit-log --verify   # exit code only",
517            ],
518            json_mode: JsonMode::Supported,
519            feature_gated: None,
520        },
521        "provenance" => SubcommandMeta {
522            examples: &[
523                "doiget provenance migrate --dry-run",
524                "doiget provenance migrate",
525                "doiget provenance migrate --dry-run --json",
526            ],
527            json_mode: JsonMode::Supported,
528            feature_gated: None,
529        },
530        "config" => SubcommandMeta {
531            examples: &[
532                "doiget config show",
533                "doiget config show --json",
534                "doiget config path",
535                "doiget config doctor",
536            ],
537            json_mode: JsonMode::Supported,
538            feature_gated: None,
539        },
540        "serve" => SubcommandMeta {
541            examples: &["doiget serve   # stdio MCP server (ADR-0001)"],
542            // serve always runs in mcp mode; the protocol output is
543            // JSON-RPC, which is product.
544            json_mode: JsonMode::Artifact,
545            feature_gated: None,
546        },
547        "graph" => SubcommandMeta {
548            examples: &[
549                "DOIGET_ENABLE_OPENALEX=1 doiget graph 10.1234/foo",
550                "DOIGET_ENABLE_OPENALEX=1 doiget graph 10.1234/foo --depth 2 --total 50",
551            ],
552            json_mode: JsonMode::Artifact,
553            feature_gated: Some("citation"),
554        },
555        "version" => SubcommandMeta {
556            examples: &[
557                "doiget version",
558                "doiget version --check",
559                "doiget version --check --mode json",
560            ],
561            json_mode: JsonMode::Supported,
562            feature_gated: None,
563        },
564        "capabilities" => SubcommandMeta {
565            examples: &["doiget capabilities | jq ."],
566            // The whole point of capabilities IS JSON output.
567            json_mode: JsonMode::Artifact,
568            feature_gated: None,
569        },
570        // clap auto-adds `help`; we silently ignore it (it's not a
571        // domain subcommand).
572        "help" => return None,
573        _ => return None,
574    };
575    Some(m)
576}
577
578// ---------------------------------------------------------------------------
579// Build
580// ---------------------------------------------------------------------------
581
582/// Build the [`Capabilities`] inventory from `cli` (the clap parser
583/// for this binary, supplied by the caller because the `Cli` struct
584/// lives in `main.rs` and is not exposed in the library crate). The
585/// caller is `commands::main::run_dispatch` via `Cli::command()`.
586pub fn build_capabilities(cli: &clap::Command) -> Capabilities {
587    let global_flags = collect_global_flags(cli);
588    let subcommands = cli
589        .get_subcommands()
590        .filter_map(|sub| build_subcommand(sub, cli))
591        .collect::<Vec<_>>();
592    Capabilities {
593        version: env!("CARGO_PKG_VERSION"),
594        features: compile_time_features(),
595        modes: MODES,
596        global_flags,
597        subcommands,
598        env_vars: ENV_VARS,
599        mcp_tools: MCP_TOOLS,
600        docs: DOCS,
601        user_extension_count: user_extension_count(),
602    }
603}
604
605/// Count valid `[[network.additional_hosts]]` entries in
606/// `<config_dir>/doiget/config.toml` (ADR-0028 D2). Returns `0` on any
607/// failure — missing config, parse error, unresolvable config dir.
608/// Diagnose failures via `doiget config doctor`; here we only need a
609/// best-effort cold-boot signal for the inventory.
610fn user_extension_count() -> usize {
611    let cfg_dir = match super::fetch::config_dir_utf8() {
612        Ok(p) => p,
613        Err(_) => return 0,
614    };
615    let path = cfg_dir.join("doiget").join("config.toml");
616    match doiget_core::user_extension::load(&path) {
617        Ok(hosts) => hosts.len(),
618        Err(_) => 0,
619    }
620}
621
/// List the Cargo features this binary was compiled with, in a fixed
/// order: `oa-only`, `metadata`, `citation`, then the three `tdm-*`
/// features. Features that are not enabled are simply absent.
fn compile_time_features() -> Vec<&'static str> {
    // Each row pairs a compile-time `cfg!` probe with the wire name
    // to report when the probe is true. Row order is output order.
    let probes: [(bool, &'static str); 6] = [
        (cfg!(feature = "oa-only"), "oa-only"),
        (cfg!(feature = "metadata"), "metadata"),
        (cfg!(feature = "citation"), "citation"),
        (cfg!(feature = "tdm-elsevier"), "tdm-elsevier"),
        (cfg!(feature = "tdm-aps"), "tdm-aps"),
        (cfg!(feature = "tdm-springer"), "tdm-springer"),
    ];
    probes
        .iter()
        .filter(|probe| probe.0)
        .map(|probe| probe.1)
        .collect()
}
644
645fn collect_global_flags(cmd: &clap::Command) -> Vec<FlagSpec> {
646    cmd.get_arguments()
647        .filter(|a| a.is_global_set())
648        .map(arg_to_flag_spec)
649        .collect()
650}
651
652fn build_subcommand(sub: &clap::Command, root: &clap::Command) -> Option<SubcommandSpec> {
653    let name = sub.get_name();
654    let meta = metadata_for(name)?;
655    let (args, flags) = split_args_and_flags(sub, root);
656    Some(SubcommandSpec {
657        name: name.to_string(),
658        summary: sub.get_about().map(|s| s.to_string()),
659        args,
660        flags,
661        examples: meta.examples,
662        json_mode: meta.json_mode,
663        feature_gated: meta.feature_gated,
664    })
665}
666
667fn split_args_and_flags(
668    sub: &clap::Command,
669    root: &clap::Command,
670) -> (Vec<ArgSpec>, Vec<FlagSpec>) {
671    // The root's global args appear in every subcommand's iterator;
672    // suppress them from per-subcommand `flags` (they're already in
673    // `global_flags`).
674    let global_names: std::collections::HashSet<&str> = root
675        .get_arguments()
676        .filter(|a| a.is_global_set())
677        .map(|a| a.get_id().as_str())
678        .collect();
679    let mut args = Vec::new();
680    let mut flags = Vec::new();
681    for a in sub.get_arguments() {
682        if global_names.contains(a.get_id().as_str()) {
683            continue;
684        }
685        // Clap auto-adds `--help` (and `--version` on the root) to
686        // every subcommand. They're not positional and not
687        // `is_global_set()`, so they would otherwise leak into every
688        // subcommand's `flags[]` as `kind: "string"`. Filter on the
689        // action against the known built-in variants.
690        //
691        // **Maintenance:** `clap::ArgAction` is itself
692        // `#[non_exhaustive]` upstream. A future clap release that
693        // adds a new built-in action (e.g. a hypothetical
694        // `HelpMarkdown`) would fall through this `matches!` and
695        // reappear in `flags[]`. Re-audit this filter on every clap
696        // minor-version bump.
697        if matches!(
698            a.get_action(),
699            clap::ArgAction::Help
700                | clap::ArgAction::HelpShort
701                | clap::ArgAction::HelpLong
702                | clap::ArgAction::Version
703        ) {
704            continue;
705        }
706        if a.is_positional() {
707            args.push(ArgSpec {
708                name: a.get_id().to_string(),
709                kind: ArgKind::Positional,
710                help: a.get_help().map(|s| s.to_string()),
711                required: a.is_required_set(),
712            });
713        } else {
714            flags.push(arg_to_flag_spec(a));
715        }
716    }
717    (args, flags)
718}
719
720fn arg_to_flag_spec(a: &clap::Arg) -> FlagSpec {
721    let name = a
722        .get_long()
723        .map(|s| format!("--{s}"))
724        .or_else(|| a.get_short().map(|c| format!("-{c}")))
725        .unwrap_or_else(|| a.get_id().to_string());
726    // Boolean switches → `Bool`; value-enum flags → `Enum` with the
727    // accepted values harvested from clap directly; everything else
728    // → `String`. The `possible_values()` harvest covers any future
729    // enum flag without code change (see #215).
730    let possible: Option<Vec<String>> = a
731        .get_value_parser()
732        .possible_values()
733        .map(|it| it.map(|pv| pv.get_name().to_owned()).collect());
734    let (kind, values) = if matches!(
735        a.get_action(),
736        clap::ArgAction::SetTrue | clap::ArgAction::SetFalse
737    ) {
738        (FlagKind::Bool, None)
739    } else if let Some(vs) = possible {
740        (FlagKind::Enum, Some(vs))
741    } else {
742        (FlagKind::String, None)
743    };
744    FlagSpec {
745        name,
746        kind,
747        help: a.get_help().map(|s| s.to_string()),
748        values,
749    }
750}
751
752// ---------------------------------------------------------------------------
753// Entry point
754// ---------------------------------------------------------------------------
755
756/// Run the `doiget capabilities` subcommand.
757///
758/// `capabilities` is an **artifact** command per ADR-0017 Amendment 1:
759/// its stdout output IS the deliverable (the inventory JSON an LLM
760/// reads on cold-boot). It honors only **explicit** Quiet —
761/// `--quiet` / `-q` / `--mode quiet` / `DOIGET_MODE=quiet` — and emits
762/// the inventory on every other path. The `quiet_was_explicit`
763/// discriminator is what distinguishes the two cases:
764///
765/// | mode               | quiet_was_explicit | behaviour          |
766/// |--------------------|--------------------|--------------------|
767/// | non-`Quiet`        | -                  | emit               |
768/// | `Quiet` (explicit) | `true`             | suppress           |
769/// | `Quiet` (non-TTY)  | `false`            | **emit** (#219)    |
770///
771/// The non-TTY case is the one #219 / #220 report: an LLM tool
772/// executor captures stdout, so `stdout_is_tty()` is `false`, the
773/// resolver falls through to `Quiet`, but the caller wants the JSON
774/// inventory exactly because it's about to be machine-parsed. The
775/// table's bottom row is the fix.
776///
777/// The caller passes the live `clap::Command` so the clap walk
778/// operates on the binary's actual `Cli` tree (which the lib half of
779/// this crate can't reach directly — the `Cli` struct lives in
780/// `main.rs`).
781pub fn run(
782    cli: &clap::Command,
783    mode: super::output::OutputMode,
784    quiet_was_explicit: bool,
785) -> Result<()> {
786    // ADR-0017 Amendment 1: artifact command — suppress ONLY on
787    // explicit Quiet, never on the non-TTY implicit fallback.
788    if mode == super::output::OutputMode::Quiet && quiet_was_explicit {
789        return Ok(());
790    }
791    let caps = build_capabilities(cli);
792    let s = serde_json::to_string_pretty(&caps).context("serialise capabilities inventory")?;
793    // `print_stdout` workspace-deny; localised allow at the
794    // sanctioned product-output sink. See `commands/csl.rs`'s pattern.
795    #[allow(clippy::print_stdout)]
796    {
797        println!("{s}");
798    }
799    Ok(())
800}
801
802// ---------------------------------------------------------------------------
803// Tests
804// ---------------------------------------------------------------------------
805
806#[cfg(test)]
807#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
808mod tests {
809    use super::*;
810
811    /// Mirrors the `Cli` struct in `main.rs` for lib-test reach.
812    /// `commands::capabilities` is library-level; the binary-only
813    /// `Cli` struct can't be reached from here, so we re-derive a
814    /// shadow whose subcommand list is identical. The
815    /// `cli_shadow_matches_main_cli` integration test in
816    /// `tests/capabilities_e2e.rs` runs the real binary and asserts
817    /// the wire output matches.
818    fn test_cli() -> clap::Command {
819        use clap::{Arg, ArgAction, Command};
820        let mode_values = ["human", "json", "quiet", "mcp"];
821        let cmd = Command::new("doiget")
822            .arg(
823                Arg::new("mode")
824                    .long("mode")
825                    .global(true)
826                    .value_parser(clap::builder::PossibleValuesParser::new(mode_values))
827                    .help("Output mode (human|json|quiet|mcp)."),
828            )
829            .arg(
830                Arg::new("json")
831                    .long("json")
832                    .global(true)
833                    .action(ArgAction::SetTrue)
834                    .help("Short for `--mode json`."),
835            )
836            .arg(
837                Arg::new("quiet")
838                    .long("quiet")
839                    .short('q')
840                    .global(true)
841                    .action(ArgAction::SetTrue)
842                    .help("Short for `--mode quiet`."),
843            )
844            .subcommand(
845                Command::new("fetch")
846                    .about("Fetch a single paper PDF")
847                    .arg(Arg::new("ref").required(true))
848                    .arg(
849                        Arg::new("dry-run")
850                            .long("dry-run")
851                            .action(ArgAction::SetTrue),
852                    ),
853            )
854            .subcommand(
855                Command::new("batch")
856                    .about("Fetch many refs")
857                    .arg(Arg::new("path").required(true))
858                    .arg(
859                        Arg::new("dry-run")
860                            .long("dry-run")
861                            .action(ArgAction::SetTrue),
862                    ),
863            )
864            .subcommand(
865                Command::new("info")
866                    .about("Show metadata")
867                    .arg(Arg::new("ref").required(true)),
868            )
869            .subcommand(Command::new("list-recent").about("List recent"))
870            .subcommand(
871                Command::new("search")
872                    .about("Search local")
873                    .arg(Arg::new("query").required(true)),
874            )
875            .subcommand(
876                Command::new("bib")
877                    .about("BibTeX export")
878                    .arg(Arg::new("ref").required(true)),
879            )
880            .subcommand(
881                Command::new("cite")
882                    .about("Live BibTeX")
883                    .arg(Arg::new("ref").required(true)),
884            )
885            .subcommand(
886                Command::new("csl")
887                    .about("CSL export")
888                    .arg(Arg::new("ref").required(true)),
889            )
890            .subcommand(
891                Command::new("audit-log")
892                    .about("Audit log")
893                    .arg(Arg::new("verify").long("verify").action(ArgAction::SetTrue)),
894            )
895            .subcommand(Command::new("provenance").about("Provenance ops"))
896            .subcommand(
897                Command::new("config")
898                    .about("Config")
899                    .arg(Arg::new("action").required(true)),
900            )
901            .subcommand(Command::new("serve").about("MCP server"));
902        // `graph` is `#[cfg(feature = "citation")]` in main.rs; mirror
903        // the gate so the shadow CLI matches the production surface
904        // (see #215).
905        #[cfg(feature = "citation")]
906        let cmd = cmd.subcommand(
907            Command::new("graph")
908                .about("Citation graph")
909                .arg(Arg::new("ref").required(true)),
910        );
911        cmd.subcommand(Command::new("capabilities").about("Capabilities"))
912    }
913
914    fn caps() -> Capabilities {
915        build_capabilities(&test_cli())
916    }
917
918    #[test]
919    fn capabilities_serialises_to_valid_json() {
920        let s = serde_json::to_string_pretty(&caps()).expect("serialise");
921        let v: serde_json::Value = serde_json::from_str(&s).expect("parse round-trip");
922        for key in [
923            "version",
924            "features",
925            "modes",
926            "global_flags",
927            "subcommands",
928            "env_vars",
929            "mcp_tools",
930            "docs",
931            "user_extension_count",
932        ] {
933            assert!(
934                v.get(key).is_some(),
935                "top-level key `{key}` missing from capabilities JSON: {v}"
936            );
937        }
938    }
939
940    #[test]
941    fn modes_field_matches_output_mode_enum() {
942        // Tied to `OutputMode { Human, Json, Quiet, Mcp }`.
943        assert_eq!(caps().modes, &["human", "json", "quiet", "mcp"]);
944    }
945
946    #[test]
947    fn env_vars_all_use_doiget_prefix() {
948        for ev in ENV_VARS {
949            assert!(
950                ev.name.starts_with("DOIGET_"),
951                "env var name MUST use DOIGET_ prefix, got `{}`",
952                ev.name
953            );
954        }
955    }
956
957    #[test]
958    fn mcp_tools_all_use_doiget_prefix() {
959        for t in MCP_TOOLS {
960            assert!(
961                t.name.starts_with("doiget_"),
962                "MCP tool name MUST use doiget_ prefix, got `{}`",
963                t.name
964            );
965        }
966    }
967
968    #[test]
969    fn subcommand_examples_reference_the_subcommand_name() {
970        for sub in &caps().subcommands {
971            for ex in sub.examples {
972                // `graph` examples carry a `DOIGET_ENABLE_OPENALEX=1`
973                // env prefix before `doiget …`. Allow either form.
974                assert!(
975                    ex.starts_with("doiget ") || ex.contains(" doiget "),
976                    "example `{ex}` for `{}` must invoke `doiget` somewhere",
977                    sub.name
978                );
979                assert!(
980                    ex.contains(&sub.name),
981                    "example `{ex}` does not mention subcommand `{}`",
982                    sub.name
983                );
984            }
985        }
986    }
987
988    // Exact-set parity guard against drift between the static
989    // `ENV_VARS` table and the documented surface (#215). The expected set is the SOURCE OF TRUTH at test time;
990    // adding a new DOIGET_* env var requires updating both ENV_VARS
991    // and this list in lockstep. CHANGELOG records cross-PR changes.
992    #[test]
993    fn env_vars_exact_set_matches_expected() {
994        let actual: std::collections::BTreeSet<&str> = ENV_VARS.iter().map(|ev| ev.name).collect();
995        let expected: std::collections::BTreeSet<&str> = [
996            // CONFIG.md §4 documented:
997            "DOIGET_STORE_ROOT",
998            "DOIGET_CACHE_ROOT",
999            "DOIGET_LOG_PATH",
1000            "DOIGET_LOG_RETENTION_DAYS",
1001            "DOIGET_USER_AGENT",
1002            "DOIGET_UNPAYWALL_EMAIL",
1003            "DOIGET_MODE",
1004            // Code-reachable but documented in code-level docs or
1005            // CAPABILITY.md (not CONFIG.md §4):
1006            "DOIGET_CONTACT_EMAIL",
1007            "DOIGET_ENABLE_OPENALEX",
1008            // Test/wiremock-override base URLs:
1009            "DOIGET_ARXIV_BASE",
1010            "DOIGET_CROSSREF_BASE",
1011            "DOIGET_UNPAYWALL_BASE",
1012        ]
1013        .into_iter()
1014        .collect();
1015        assert_eq!(
1016            actual, expected,
1017            "ENV_VARS table drifted from the expected canonical set; \
1018             update both `ENV_VARS` and this test together (and CONFIG.md §4 \
1019             if the new var is user-documented)."
1020        );
1021    }
1022
1023    // Exact-set parity guard against drift between the static
1024    // `MCP_TOOLS` table and `docs/MCP_TOOLS.md` §1 (#215).
1025    #[test]
1026    fn mcp_tools_exact_set_matches_expected() {
1027        let actual: std::collections::BTreeSet<&str> = MCP_TOOLS.iter().map(|t| t.name).collect();
1028        let expected: std::collections::BTreeSet<&str> = [
1029            "doiget_resolve_paper",
1030            "doiget_fetch_paper",
1031            "doiget_metadata_only",
1032            "doiget_batch_fetch",
1033            "doiget_info",
1034            "doiget_search_local",
1035            "doiget_paper_search",
1036            "doiget_paper_text",
1037            "doiget_link",
1038            "doiget_list_recent",
1039            "doiget_paper_pdf_path",
1040            "doiget_capability_profile",
1041            "doiget_health",
1042            "doiget_expand_citation_graph",
1043            "doiget_bibtex_export",
1044            "doiget_csl_export",
1045            "doiget_batch_from_bibliography",
1046        ]
1047        .into_iter()
1048        .collect();
1049        assert_eq!(
1050            actual, expected,
1051            "MCP_TOOLS table drifted from the expected set; update both \
1052             `MCP_TOOLS` and this test together (and docs/MCP_TOOLS.md §1)."
1053        );
1054    }
1055
1056    // Pin the `#[serde(tag = "status")]` wire shape: every variant
1057    // serialises to a `{"status":"…", …}` object. Accidentally
1058    // removing the `tag` attribute (or renaming the discriminant)
1059    // would silently degrade the wire format; this test catches it
1060    // (#215 N1).
1061    #[test]
1062    fn json_mode_serialises_with_status_discriminant() {
1063        let s = serde_json::to_string(&JsonMode::Artifact).expect("serialise");
1064        assert_eq!(
1065            s, r#"{"status":"artifact"}"#,
1066            "Artifact must emit a status-tagged object"
1067        );
1068        let s = serde_json::to_string(&JsonMode::Supported).expect("serialise");
1069        assert_eq!(s, r#"{"status":"supported"}"#);
1070    }
1071
1072    // `arg_to_flag_spec` was generalised in #215 to harvest the
1073    // accepted values from clap's `PossibleValuesParser` instead of
1074    // hard-coding `--mode`. Pin the contract: the `--mode` entry in
1075    // `global_flags` MUST report `kind: Enum` with all four mode
1076    // strings. A future regression that silently degrades `--mode`
1077    // to `kind: String, values: None` would otherwise pass every
1078    // existing test (#215 N3).
1079    #[test]
1080    fn mode_flag_carries_enum_kind_and_all_four_values() {
1081        let global = &caps().global_flags;
1082        let mode = global
1083            .iter()
1084            .find(|f| f.name == "--mode")
1085            .expect("--mode flag is in global_flags");
1086        assert!(
1087            matches!(mode.kind, FlagKind::Enum),
1088            "--mode kind MUST be Enum, got {:?}",
1089            mode.kind
1090        );
1091        let vs = mode.values.as_ref().expect("--mode carries values");
1092        let mut sorted = vs.clone();
1093        sorted.sort();
1094        assert_eq!(sorted, vec!["human", "json", "mcp", "quiet"]);
1095    }
1096
1097    // `compile_time_features()` pushes string literals that must
1098    // exactly match the Cargo feature names in `Cargo.toml`. A
1099    // typo in the literal (`"oa_only"` vs `"oa-only"`) would
1100    // silently invert the inventory's `features` field for every
1101    // consumer. The default build has `oa-only` active; assert
1102    // the literal round-trips (#215 A9).
1103    #[test]
1104    fn compile_time_features_contains_oa_only_under_default() {
1105        // `cfg!(feature = "oa-only")` is true in the default test
1106        // build; if a future maintainer disables the default feature
1107        // for the test target, this test becomes meaningless but
1108        // does not cause a false failure.
1109        if cfg!(feature = "oa-only") {
1110            let f = compile_time_features();
1111            assert!(
1112                f.contains(&"oa-only"),
1113                "oa-only feature was enabled at compile time but \
1114                 `compile_time_features()` did not list it: {f:?}"
1115            );
1116        }
1117    }
1118
1119    #[test]
1120    fn version_is_cargo_pkg_version() {
1121        assert_eq!(caps().version, env!("CARGO_PKG_VERSION"));
1122    }
1123
1124    /// ADR-0028 D2: `user_extension_count` must reflect the number of
1125    /// `[[network.additional_hosts]]` entries actually present in
1126    /// `<config_dir>/doiget/config.toml`. The test points every
1127    /// config-dir env var at a tempdir, writes a 2-host config, and
1128    /// asserts the inventory reports `2`. Drift here would silently
1129    /// hide user-curated allowlist hosts from the cold-boot JSON.
1130    #[test]
1131    #[serial_test::serial]
1132    fn user_extension_count_reflects_config_toml_entries() {
1133        let tmp = tempfile::TempDir::new().expect("tempdir");
1134        let cfg_root = camino::Utf8Path::from_path(tmp.path()).expect("utf8 tempdir");
1135        let doiget_dir = cfg_root.join("doiget");
1136        std::fs::create_dir_all(doiget_dir.as_std_path()).expect("mk dir");
1137        let config_toml = doiget_dir.join("config.toml");
1138        std::fs::write(
1139            config_toml.as_std_path(),
1140            "[[network.additional_hosts]]\n\
1141             host = \"example.org\"\n\
1142             \n\
1143             [[network.additional_hosts]]\n\
1144             host = \"*.example.net\"\n\
1145             note = \"university OA mirror\"\n",
1146        )
1147        .expect("write config.toml");
1148
1149        let _x = EnvGuard::set("XDG_CONFIG_HOME", cfg_root.as_str());
1150        let _a = EnvGuard::unset("APPDATA");
1151        let _h = EnvGuard::unset("HOME");
1152        let _u = EnvGuard::unset("USERPROFILE");
1153
1154        let cli = test_cli();
1155        let caps = build_capabilities(&cli);
1156        assert_eq!(
1157            caps.user_extension_count, 2,
1158            "expected 2 user-extension hosts, got {}",
1159            caps.user_extension_count
1160        );
1161    }
1162
1163    /// Companion: with no config file (and a resolvable config dir),
1164    /// the count is `0` — the curated allowlist is the entire surface.
1165    /// Confirms the `Ok(vec![])` not-found path in `user_extension::load`
1166    /// flows through unchanged.
1167    #[test]
1168    #[serial_test::serial]
1169    fn user_extension_count_is_zero_without_config_toml() {
1170        let tmp = tempfile::TempDir::new().expect("tempdir");
1171        let cfg_root = camino::Utf8Path::from_path(tmp.path()).expect("utf8 tempdir");
1172
1173        let _x = EnvGuard::set("XDG_CONFIG_HOME", cfg_root.as_str());
1174        let _a = EnvGuard::unset("APPDATA");
1175        let _h = EnvGuard::unset("HOME");
1176        let _u = EnvGuard::unset("USERPROFILE");
1177
1178        let caps = build_capabilities(&test_cli());
1179        assert_eq!(caps.user_extension_count, 0);
1180    }
1181
1182    /// Minimal env-guard local to this tests module; mirrors the
1183    /// pattern in `commands::config::tests` (each module keeps its
1184    /// own copy so they stay leaf-level cheap).
1185    struct EnvGuard {
1186        var: &'static str,
1187        prior: Option<std::ffi::OsString>,
1188    }
1189
1190    impl EnvGuard {
1191        fn set(var: &'static str, value: &str) -> Self {
1192            let prior = std::env::var_os(var);
1193            std::env::set_var(var, value);
1194            EnvGuard { var, prior }
1195        }
1196        fn unset(var: &'static str) -> Self {
1197            let prior = std::env::var_os(var);
1198            std::env::remove_var(var);
1199            EnvGuard { var, prior }
1200        }
1201    }
1202
1203    impl Drop for EnvGuard {
1204        fn drop(&mut self) {
1205            match &self.prior {
1206                Some(v) => std::env::set_var(self.var, v),
1207                None => std::env::remove_var(self.var),
1208            }
1209        }
1210    }
1211
1212    #[test]
1213    fn every_test_cli_subcommand_has_metadata() {
1214        // Regression at the lib layer: anything we add to the shadow
1215        // `test_cli` must also be in `metadata_for`. The real
1216        // `Cli::command()` is exercised by the e2e test in
1217        // `tests/capabilities_e2e.rs`.
1218        for sub in test_cli().get_subcommands() {
1219            let name = sub.get_name();
1220            if name == "help" {
1221                continue;
1222            }
1223            assert!(
1224                metadata_for(name).is_some(),
1225                "subcommand `{name}` lacks metadata in `metadata_for`"
1226            );
1227        }
1228    }
1229}
doiget_cli/commands/capabilities.rs

doiget_cli/commands/
capabilities.rs