doiget_cli/commands/capabilities.rs
1//! `doiget capabilities` — single-shot inventory JSON for LLM cold-boot
2//! (#214).
3//!
4//! Emits a single JSON value describing the **full surface** of this
5//! `doiget` binary: subcommands (walked from the live `clap::Command`
6//! tree so the inventory cannot drift from the parser), positional args
7//! and named flags per subcommand, global flags, the four
8//! [`super::output::OutputMode`] values, hand-maintained env-var + example tables, the
9//! `doiget_*` MCP tool list, compile-time features, and a `docs` map
10//! pointing at the canonical spec files.
11//!
12//! Design rationale: the existing `--help` output lists subcommand
13//! names but the rest of doiget's surface (env vars, MCP tools, JSON
14//! schemas, ADR refs) is scattered across `docs/`. An LLM cold-booted
15//! into doiget — no repo access, no follow-up doc reads — cannot
16//! discover those via `--help` alone. This subcommand closes that gap
17//! with one round-trip.
18//!
19//! # Output mode
20//!
21//! `doiget capabilities` is a **product-output** command per the
22//! ADR-0017 convention (`--mode` is informational; the JSON inventory
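//! is the artefact). An explicitly requested quiet mode (`--quiet` /
//! `-q` / `--mode quiet` / `DOIGET_MODE=quiet`) is the one case that
//! suppresses stdout (#203 / CONFIG.md §5); every other path, including
//! the implicit non-TTY fallback to quiet (#219), emits the same JSON.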
25//!
26//! # Wire-format stability (whole module)
27//!
28//! Every `pub` struct / enum below carries `#[non_exhaustive]`. Adding
29//! a field is non-breaking; renaming or removing one is a
30//! compile-time break for downstream Rust consumers and a
31//! `[BREAKING]`-class change for JSON consumers (CHANGELOG must call
32//! it out). The per-item `#[non_exhaustive]` attributes intentionally
33//! carry no inline comment; this module-doc says it once.
34
35use anyhow::{Context, Result};
36use serde::Serialize;
37
/// Top-level capability inventory. Serialised to stdout as one JSON
/// value. Field names are part of the public wire format: renaming
/// any field is a semver minor with a CHANGELOG `\[BREAKING\]` callout
/// (same discipline as `EntryInfo` / `MigrationReport` in #213).
///
/// Instances are assembled by [`build_capabilities`]: the slice-typed
/// fields and `docs` come from this module's hand-maintained tables,
/// while the `Vec` fields and `user_extension_count` are computed at
/// call time.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct Capabilities {
    /// `CARGO_PKG_VERSION` for this build.
    pub version: &'static str,
    /// Cargo features compiled into this binary. Contains `"oa-only"`
    /// in stock release builds (the default feature). Empty only when
    /// the crate was built with `--no-default-features` and **no
    /// other features enabled**; a build like
    /// `cargo build --no-default-features --features citation`
    /// yields `["citation"]`, not `[]`.
    pub features: Vec<&'static str>,
    /// All four [`super::output::OutputMode`] values; the parser accepts these for
    /// `--mode`. Mirrors `CONFIG.md` §5 (CLI flags).
    pub modes: &'static [&'static str],
    /// Global flags that apply to every subcommand: the root command's
    /// arguments that have clap's `global` setting enabled.
    pub global_flags: Vec<FlagSpec>,
    /// One entry per CLI subcommand (clap-walked). A subcommand with no
    /// hand-maintained metadata entry (e.g. clap's auto-added `help`)
    /// is left out.
    pub subcommands: Vec<SubcommandSpec>,
    /// `DOIGET_*` env vars from CONFIG.md §4 (hand-maintained table).
    pub env_vars: &'static [EnvVar],
    /// MCP tools exposed by `doiget serve` (hand-coded; the source of
    /// truth is `docs/MCP_TOOLS.md` §1).
    pub mcp_tools: &'static [McpTool],
    /// Canonical doc paths an LLM can pull for deeper context.
    pub docs: Docs,
    /// Number of user-extension allowlist hosts loaded from
    /// `<config_dir>/doiget/config.toml` per ADR-0028 D2. `0` if the
    /// config file is missing, contains no `[[network.additional_hosts]]`,
    /// or fails to parse — run `doiget config doctor` to diagnose parse
    /// failures. Exposed so an LLM can confirm at cold-boot whether the
    /// curated allowlist has been extended on this host.
    pub user_extension_count: usize,
}
77
/// What kind of value (if any) a [`FlagSpec`] carries.
///
/// Typed (not `&'static str`) so a typo can't slip into the wire
/// format and the `Enum`-implies-`values`-present invariant is
/// expressible at the type layer (see #215 for the design pass). Serialises
/// as the lowercased variant name: `"bool"`, `"enum"`, `"string"`.
///
/// The variant is chosen in `arg_to_flag_spec`, which checks for a
/// boolean switch first, then for a bounded value set, and falls back
/// to `String`.
#[non_exhaustive]
#[derive(Debug, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum FlagKind {
    /// Boolean switch (no value). Assigned to arguments whose clap
    /// action is `SetTrue` or `SetFalse`.
    Bool,
    /// Value-bounded flag — `values` carries the accepted set.
    /// Assigned when clap's value parser reports possible values.
    Enum,
    /// Any non-`Bool`, non-`Enum` flag. Today every such flag emits
    /// `"string"`; richer typing (`Path` / `Int` etc.) is intentionally
    /// out of scope until a real consumer needs it — `#[non_exhaustive]`
    /// reserves space without commitment.
    String,
}
98
/// One named (non-positional) flag, as listed under `global_flags`
/// and under each subcommand's `flags`. Built from a `clap::Arg` by
/// `arg_to_flag_spec`.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct FlagSpec {
    /// e.g. `--mode`, `--json`, `-q`. The long form is preferred; the
    /// short form is used when there is no long name, and the raw clap
    /// argument id when there is neither.
    pub name: String,
    /// Boolean / enum / free-string discriminator. See [`FlagKind`].
    pub kind: FlagKind,
    /// `clap` `help` text; `None` when the argument has none.
    pub help: Option<String>,
    /// For `kind == FlagKind::Enum`: the accepted values, harvested
    /// from clap's `PossibleValuesParser`. Owned (not `&'static`) so
    /// the helper works for any future enum flag, not just `--mode`
    /// (see #215). Omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub values: Option<Vec<String>>,
}
116
/// Inventory entry for one CLI subcommand: the clap-derived surface
/// (name, summary, positionals, flags) merged with the hand-maintained
/// metadata (examples, JSON-mode behaviour, feature gate).
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct SubcommandSpec {
    /// Subcommand name as registered with clap (e.g. `list-recent`).
    pub name: String,
    /// clap `about` text; `None` when the subcommand has none.
    pub summary: Option<String>,
    /// Positional arguments, in the order clap yields them.
    pub args: Vec<ArgSpec>,
    /// Named flags specific to this subcommand. Root-level global
    /// flags and clap's built-in help/version arguments are excluded.
    pub flags: Vec<FlagSpec>,
    /// Hand-maintained canonical invocations.
    pub examples: &'static [&'static str],
    /// How this command interacts with `--mode json`. See [`JsonMode`].
    pub json_mode: JsonMode,
    /// Cargo feature this subcommand is gated behind, if any.
    /// Omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub feature_gated: Option<&'static str>,
}
133
/// What kind of positional argument an [`ArgSpec`] describes.
///
/// Currently every entry is `Positional`; the typed enum reserves
/// space for future variants (e.g. `Stdin` markers) without breaking
/// existing JSON consumers. Serialises as `"positional"`.
#[non_exhaustive]
#[derive(Debug, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum ArgKind {
    /// A required-or-optional positional argument on the subcommand;
    /// whether it is required is carried by `ArgSpec::required`.
    Positional,
}
146
/// One positional argument of a subcommand, as listed under
/// `SubcommandSpec::args`.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct ArgSpec {
    /// The clap argument id (not a `--long` name — positionals have none).
    pub name: String,
    /// Always [`ArgKind::Positional`] today. Kept as a discriminator
    /// so the JSON shape can grow new arg kinds later without
    /// renaming fields (see #215 for the design pass).
    pub kind: ArgKind,
    /// `clap` `help` text; `None` when the argument has none.
    pub help: Option<String>,
    /// `true` when the arg has no default and no `Option<T>` wrapper.
    /// Populated from clap's `is_required_set()`.
    pub required: bool,
}
160
/// How a subcommand interacts with `--mode json`.
///
/// Wire shape: every variant serialises to an object with a `status`
/// discriminant, so a consumer sees uniform `{"status":"…", …}`
/// records (`#[serde(tag = "status")]`). Before #215 the previous
/// mixed string/object representation forced consumers to handle two
/// JSON shapes for sibling variants.
///
/// **Tuple variants not permitted.** `#[serde(tag = "status")]`
/// requires the tag to live in the same flat object as variant
/// fields; tuple variants are incompatible with internally-tagged
/// representation. Future variants MUST use named fields.
///
/// The enum is `#[non_exhaustive]`: adding a future variant is
/// non-breaking for JSON consumers.
#[non_exhaustive]
#[derive(Debug, Serialize)]
#[serde(tag = "status", rename_all = "lowercase")]
pub enum JsonMode {
    /// The command's primary output IS the requested artifact, not
    /// informational chatter. `--mode` is informational here; the
    /// exact stdout shape (e.g. JSON for `csl` / `graph` /
    /// `capabilities` and the JSON-RPC stream from `serve`; BibTeX
    /// for `bib`; PDF-on-disk + stderr summary for `fetch`; a
    /// `--dry-run` JSON plan in the dry-run variants) is fixed by
    /// the subcommand and may vary across flags. **Consult
    /// `examples` for the per-flag stdout form** rather than
    /// assuming JSON.
    Artifact,
    /// Under `--mode json` the command emits a structured JSON body
    /// on stdout; otherwise the human form (e.g. `info`,
    /// `list-recent`, `audit-log`, `provenance migrate`, `batch`).
    Supported,
    // NOTE: a `Deferred { tracking: &'static str }` variant was
    // sketched during #214's design phase but never instantiated by
    // any subcommand. Removed in the #215 self-review pass to avoid
    // shipping an unused wire shape; `#[non_exhaustive]` keeps the
    // door open to add it back non-breakingly when a real consumer
    // emerges.
}
198
/// One row of the environment-variable table exposed under
/// `Capabilities::env_vars`. All fields are display text for the
/// reader of the inventory; nothing here is parsed by this module.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct EnvVar {
    /// Variable name, e.g. `DOIGET_STORE_ROOT`.
    pub name: &'static str,
    /// `(none)` when no built-in default.
    pub default: &'static str,
    /// One-line description, usually with a pointer to the governing doc.
    pub help: &'static str,
}
208
/// One row of the MCP tool table exposed under
/// `Capabilities::mcp_tools`.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct McpTool {
    /// Tool name as registered with the MCP server (`doiget_*`).
    pub name: &'static str,
    /// Anchor-style reference into `docs/MCP_TOOLS.md`.
    pub schema_ref: &'static str,
}
217
/// Map from topic to the relative `docs/…` path of the spec file that
/// covers it, exposed under `Capabilities::docs`. Each field holds one
/// path string; the field name is the JSON key.
#[allow(missing_docs)] // Field names ARE the schema; documented externally in #214.
#[non_exhaustive]
#[derive(Debug, Serialize)]
pub struct Docs {
    pub config: &'static str,
    pub errors: &'static str,
    pub scope: &'static str,
    pub mcp: &'static str,
    pub sources: &'static str,
    pub redirect_allowlist: &'static str,
    pub provenance_log: &'static str,
}
230
231// ---------------------------------------------------------------------------
232// Static tables
233// ---------------------------------------------------------------------------
234
/// Accepted `--mode` values, exposed verbatim as `Capabilities::modes`.
/// The exact contents and order are pinned by the
/// `modes_field_matches_output_mode_enum` unit test.
const MODES: &[&str] = &["human", "json", "quiet", "mcp"];
236
/// Hand-maintained table behind `Capabilities::env_vars`.
///
/// `default` is display text, not a parsed value: `(none)` marks "no
/// built-in default" and the other parenthesised entries describe the
/// fallback in words. The `env_vars_exact_set_matches_expected` unit
/// test pins the exact set of names, so adding or removing an entry
/// means updating that test in lockstep.
///
/// NOTE(review): the help text for `DOIGET_CONTACT_EMAIL` cites
/// CONFIG.md §4, while the comments in the parity test group that
/// variable with the ones *not* documented in CONFIG.md §4 — confirm
/// which is right and align the two.
const ENV_VARS: &[EnvVar] = &[
    EnvVar {
        name: "DOIGET_STORE_ROOT",
        default: "$HOME/papers",
        help: "Root of the on-disk paper store. CONFIG.md §4.",
    },
    EnvVar {
        name: "DOIGET_CACHE_ROOT",
        default: "$HOME/.cache/doiget",
        help: "Root of the on-disk HTTP / metadata cache. CONFIG.md §4.",
    },
    EnvVar {
        name: "DOIGET_LOG_PATH",
        default: "<config_dir>/doiget/access.jsonl",
        help: "JSON-Lines provenance log file path (PROVENANCE_LOG.md §3).",
    },
    EnvVar {
        name: "DOIGET_LOG_RETENTION_DAYS",
        default: "90",
        help: "Rotated-segment retention window (0 disables pruning). #140 / PROVENANCE_LOG.md §6.",
    },
    EnvVar {
        name: "DOIGET_MODE",
        default: "(none)",
        help: "Output mode (`human`/`json`/`quiet`/`mcp`). ADR-0017 ladder rung 3.",
    },
    EnvVar {
        name: "DOIGET_CONTACT_EMAIL",
        default: "(none)",
        help: "Contact email for polite User-Agent header (CONFIG.md §4).",
    },
    EnvVar {
        name: "DOIGET_UNPAYWALL_EMAIL",
        default: "(falls back to DOIGET_CONTACT_EMAIL)",
        help: "Unpaywall-specific contact email.",
    },
    EnvVar {
        name: "DOIGET_USER_AGENT",
        default: "(default polite UA)",
        help: "Override the User-Agent header for all outbound requests.",
    },
    EnvVar {
        name: "DOIGET_ENABLE_OPENALEX",
        default: "(off)",
        help: "Enable the OpenAlex citation graph source (graph subcommand prerequisite).",
    },
    // The three `*_BASE` variables below override upstream API roots.
    EnvVar {
        name: "DOIGET_ARXIV_BASE",
        default: "https://export.arxiv.org/",
        help: "arXiv API base URL — primarily for testing/wiremock override.",
    },
    EnvVar {
        name: "DOIGET_CROSSREF_BASE",
        default: "https://api.crossref.org/",
        help: "Crossref API base URL.",
    },
    EnvVar {
        name: "DOIGET_UNPAYWALL_BASE",
        default: "https://api.unpaywall.org/",
        help: "Unpaywall API base URL.",
    },
];
299
/// Hand-maintained table behind `Capabilities::mcp_tools`: one entry
/// per MCP tool name, each with an anchor into `docs/MCP_TOOLS.md`.
///
/// Every entry points at the shared `#1-tool-list` anchor except
/// `doiget_metadata_only`, which has its own section anchor. The
/// `mcp_tools_all_use_doiget_prefix` unit test enforces the `doiget_`
/// name prefix.
const MCP_TOOLS: &[McpTool] = &[
    McpTool {
        name: "doiget_resolve_paper",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_fetch_paper",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_metadata_only",
        schema_ref: "docs/MCP_TOOLS.md#11-doiget_metadata_only-normative",
    },
    McpTool {
        name: "doiget_batch_fetch",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_info",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_search_local",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_list_recent",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_paper_pdf_path",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_capability_profile",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_health",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_expand_citation_graph",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_bibtex_export",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        name: "doiget_csl_export",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
    McpTool {
        // ADR-0030 D6: parse a CSL-JSON / (future) BibTeX file and
        // fetch each resolvable entry; each result row carries the
        // source bibliography's `entry_key` so a Zotero / Mendeley
        // plugin can bridge the fetched PDF back to the originating
        // reference.
        name: "doiget_batch_from_bibliography",
        schema_ref: "docs/MCP_TOOLS.md#1-tool-list",
    },
];
363
/// Relative `docs/…` paths of the canonical spec files, exposed as
/// `Capabilities::docs`. One path per topic; the field name is the
/// JSON key a consumer looks up.
const DOCS: Docs = Docs {
    config: "docs/CONFIG.md",
    errors: "docs/ERRORS.md",
    scope: "docs/SCOPE.md",
    mcp: "docs/MCP_TOOLS.md",
    sources: "docs/SOURCES.md",
    redirect_allowlist: "docs/REDIRECT_ALLOWLIST.md",
    provenance_log: "docs/PROVENANCE_LOG.md",
};
373
/// Per-subcommand hand-maintained metadata. The clap walk provides
/// name + summary + args + flags; this table adds examples,
/// `json_mode` semantics, and feature-gating that clap doesn't
/// expose. A regression unit test asserts every clap-visible
/// subcommand has an entry here (otherwise the test fails loudly).
///
/// **Maintenance:** `feature_gated` MUST be kept in sync with the
/// corresponding `#[cfg(feature = …)]` annotation in `main.rs`. There
/// is no compile-time check; the `every_test_cli_subcommand_has_metadata`
/// regression test does not cover feature-gating directly — it only
/// asserts metadata exists. Add a CI matrix entry (`--features
/// citation`) when introducing new gated subcommands so the e2e
/// assertion list catches drift (see #215). Alternatively, add a
/// unit test that asserts `metadata_for("graph").unwrap().feature_gated
/// == Some("citation")` to lock the gate at the lib-test layer.
struct SubcommandMeta {
    // Canonical invocations; copied verbatim into `SubcommandSpec::examples`.
    examples: &'static [&'static str],
    // Whether stdout is a fixed artifact or switches shape under `--mode json`.
    json_mode: JsonMode,
    // Name of the Cargo feature gating the subcommand; `None` when ungated.
    feature_gated: Option<&'static str>,
}
394
395fn metadata_for(subcommand: &str) -> Option<SubcommandMeta> {
396 let m = match subcommand {
397 "fetch" => SubcommandMeta {
398 examples: &[
399 "doiget fetch 10.1234/foo",
400 "doiget fetch arxiv:2401.12345",
401 "doiget fetch 10.1234/foo --dry-run",
402 ],
403 // The success summary is on stderr (ADR-0001); the
404 // dry-run plan is JSON product output (ADR-0022).
405 json_mode: JsonMode::Artifact,
406 feature_gated: None,
407 },
408 "batch" => SubcommandMeta {
409 examples: &[
410 "doiget batch refs.txt",
411 "doiget batch refs.txt --dry-run",
412 "doiget batch refs.txt --json",
413 ],
414 // `--json` emits the ERRORS.md §3 JSONL per-ref shape (#205).
415 json_mode: JsonMode::Supported,
416 feature_gated: None,
417 },
418 "verify" => SubcommandMeta {
419 examples: &[
420 "doiget verify refs.bib",
421 "doiget verify library.bib --strict",
422 "doiget verify refs.txt --format refs",
423 ],
424 // Emits one JSON-Lines record per entry regardless of mode;
425 // the JSONL stream is the product output.
426 json_mode: JsonMode::Artifact,
427 feature_gated: None,
428 },
429 "lint" => SubcommandMeta {
430 examples: &["doiget lint refs.bib", "doiget lint library.bib --strict"],
431 // Emits one JSON-Lines finding per issue; the JSONL stream is
432 // the product output (mirrors verify).
433 json_mode: JsonMode::Artifact,
434 feature_gated: None,
435 },
436 "info" => SubcommandMeta {
437 examples: &[
438 "doiget info 10.1234/foo",
439 "doiget info arxiv:2401.12345 --json",
440 ],
441 json_mode: JsonMode::Supported,
442 feature_gated: None,
443 },
444 "list-recent" => SubcommandMeta {
445 examples: &[
446 "doiget list-recent",
447 "doiget list-recent 20",
448 "doiget list-recent --json",
449 ],
450 json_mode: JsonMode::Supported,
451 feature_gated: None,
452 },
453 "search" => SubcommandMeta {
454 examples: &[
455 "doiget search 'quantum entanglement'",
456 "doiget search renormalization --json",
457 ],
458 json_mode: JsonMode::Supported,
459 feature_gated: None,
460 },
461 "bib" => SubcommandMeta {
462 examples: &["doiget bib 10.1234/foo", "doiget bib arxiv:2401.12345"],
463 // BibTeX output is the product; `--mode` is informational.
464 json_mode: JsonMode::Artifact,
465 feature_gated: None,
466 },
467 "cite" => SubcommandMeta {
468 examples: &["doiget cite 10.1234/foo", "doiget cite arxiv:2401.12345"],
469 // BibTeX output is the product (resolved live); `--mode` is
470 // informational, mirroring `bib`.
471 json_mode: JsonMode::Artifact,
472 feature_gated: None,
473 },
474 "csl" => SubcommandMeta {
475 examples: &["doiget csl 10.1234/foo"],
476 json_mode: JsonMode::Artifact,
477 feature_gated: None,
478 },
479 "audit-log" => SubcommandMeta {
480 examples: &[
481 "doiget audit-log --verify",
482 "doiget audit-log --verify --json",
483 "doiget --quiet audit-log --verify # exit code only",
484 ],
485 json_mode: JsonMode::Supported,
486 feature_gated: None,
487 },
488 "provenance" => SubcommandMeta {
489 examples: &[
490 "doiget provenance migrate --dry-run",
491 "doiget provenance migrate",
492 "doiget provenance migrate --dry-run --json",
493 ],
494 json_mode: JsonMode::Supported,
495 feature_gated: None,
496 },
497 "config" => SubcommandMeta {
498 examples: &[
499 "doiget config show",
500 "doiget config show --json",
501 "doiget config path",
502 "doiget config doctor",
503 ],
504 json_mode: JsonMode::Supported,
505 feature_gated: None,
506 },
507 "serve" => SubcommandMeta {
508 examples: &["doiget serve # stdio MCP server (ADR-0001)"],
509 // serve always runs in mcp mode; the protocol output is
510 // JSON-RPC, which is product.
511 json_mode: JsonMode::Artifact,
512 feature_gated: None,
513 },
514 "graph" => SubcommandMeta {
515 examples: &[
516 "DOIGET_ENABLE_OPENALEX=1 doiget graph 10.1234/foo",
517 "DOIGET_ENABLE_OPENALEX=1 doiget graph 10.1234/foo --depth 2 --total 50",
518 ],
519 json_mode: JsonMode::Artifact,
520 feature_gated: Some("citation"),
521 },
522 "version" => SubcommandMeta {
523 examples: &[
524 "doiget version",
525 "doiget version --check",
526 "doiget version --check --mode json",
527 ],
528 json_mode: JsonMode::Supported,
529 feature_gated: None,
530 },
531 "capabilities" => SubcommandMeta {
532 examples: &["doiget capabilities | jq ."],
533 // The whole point of capabilities IS JSON output.
534 json_mode: JsonMode::Artifact,
535 feature_gated: None,
536 },
537 // clap auto-adds `help`; we silently ignore it (it's not a
538 // domain subcommand).
539 "help" => return None,
540 _ => return None,
541 };
542 Some(m)
543}
544
545// ---------------------------------------------------------------------------
546// Build
547// ---------------------------------------------------------------------------
548
549/// Build the [`Capabilities`] inventory from `cli` (the clap parser
550/// for this binary, supplied by the caller because the `Cli` struct
551/// lives in `main.rs` and is not exposed in the library crate). The
552/// caller is `commands::main::run_dispatch` via `Cli::command()`.
553pub fn build_capabilities(cli: &clap::Command) -> Capabilities {
554 let global_flags = collect_global_flags(cli);
555 let subcommands = cli
556 .get_subcommands()
557 .filter_map(|sub| build_subcommand(sub, cli))
558 .collect::<Vec<_>>();
559 Capabilities {
560 version: env!("CARGO_PKG_VERSION"),
561 features: compile_time_features(),
562 modes: MODES,
563 global_flags,
564 subcommands,
565 env_vars: ENV_VARS,
566 mcp_tools: MCP_TOOLS,
567 docs: DOCS,
568 user_extension_count: user_extension_count(),
569 }
570}
571
572/// Count valid `[[network.additional_hosts]]` entries in
573/// `<config_dir>/doiget/config.toml` (ADR-0028 D2). Returns `0` on any
574/// failure — missing config, parse error, unresolvable config dir.
575/// Diagnose failures via `doiget config doctor`; here we only need a
576/// best-effort cold-boot signal for the inventory.
577fn user_extension_count() -> usize {
578 let cfg_dir = match super::fetch::config_dir_utf8() {
579 Ok(p) => p,
580 Err(_) => return 0,
581 };
582 let path = cfg_dir.join("doiget").join("config.toml");
583 match doiget_core::user_extension::load(&path) {
584 Ok(hosts) => hosts.len(),
585 Err(_) => 0,
586 }
587}
588
/// List the Cargo features this binary was compiled with, using the
/// feature names as they appear in `Cargo.toml`.
///
/// The result keeps the fixed order of the table below and is empty
/// when none of the known features is enabled.
fn compile_time_features() -> Vec<&'static str> {
    // (enabled in this build, wire name). `cfg!` needs a literal, so
    // each feature name necessarily appears twice.
    let known = [
        (cfg!(feature = "oa-only"), "oa-only"),
        (cfg!(feature = "metadata"), "metadata"),
        (cfg!(feature = "citation"), "citation"),
        (cfg!(feature = "tdm-elsevier"), "tdm-elsevier"),
        (cfg!(feature = "tdm-aps"), "tdm-aps"),
        (cfg!(feature = "tdm-springer"), "tdm-springer"),
    ];
    known
        .iter()
        .filter(|(enabled, _)| *enabled)
        .map(|&(_, name)| name)
        .collect()
}
611
612fn collect_global_flags(cmd: &clap::Command) -> Vec<FlagSpec> {
613 cmd.get_arguments()
614 .filter(|a| a.is_global_set())
615 .map(arg_to_flag_spec)
616 .collect()
617}
618
619fn build_subcommand(sub: &clap::Command, root: &clap::Command) -> Option<SubcommandSpec> {
620 let name = sub.get_name();
621 let meta = metadata_for(name)?;
622 let (args, flags) = split_args_and_flags(sub, root);
623 Some(SubcommandSpec {
624 name: name.to_string(),
625 summary: sub.get_about().map(|s| s.to_string()),
626 args,
627 flags,
628 examples: meta.examples,
629 json_mode: meta.json_mode,
630 feature_gated: meta.feature_gated,
631 })
632}
633
634fn split_args_and_flags(
635 sub: &clap::Command,
636 root: &clap::Command,
637) -> (Vec<ArgSpec>, Vec<FlagSpec>) {
638 // The root's global args appear in every subcommand's iterator;
639 // suppress them from per-subcommand `flags` (they're already in
640 // `global_flags`).
641 let global_names: std::collections::HashSet<&str> = root
642 .get_arguments()
643 .filter(|a| a.is_global_set())
644 .map(|a| a.get_id().as_str())
645 .collect();
646 let mut args = Vec::new();
647 let mut flags = Vec::new();
648 for a in sub.get_arguments() {
649 if global_names.contains(a.get_id().as_str()) {
650 continue;
651 }
652 // Clap auto-adds `--help` (and `--version` on the root) to
653 // every subcommand. They're not positional and not
654 // `is_global_set()`, so they would otherwise leak into every
655 // subcommand's `flags[]` as `kind: "string"`. Filter on the
656 // action against the known built-in variants.
657 //
658 // **Maintenance:** `clap::ArgAction` is itself
659 // `#[non_exhaustive]` upstream. A future clap release that
660 // adds a new built-in action (e.g. a hypothetical
661 // `HelpMarkdown`) would fall through this `matches!` and
662 // reappear in `flags[]`. Re-audit this filter on every clap
663 // minor-version bump.
664 if matches!(
665 a.get_action(),
666 clap::ArgAction::Help
667 | clap::ArgAction::HelpShort
668 | clap::ArgAction::HelpLong
669 | clap::ArgAction::Version
670 ) {
671 continue;
672 }
673 if a.is_positional() {
674 args.push(ArgSpec {
675 name: a.get_id().to_string(),
676 kind: ArgKind::Positional,
677 help: a.get_help().map(|s| s.to_string()),
678 required: a.is_required_set(),
679 });
680 } else {
681 flags.push(arg_to_flag_spec(a));
682 }
683 }
684 (args, flags)
685}
686
687fn arg_to_flag_spec(a: &clap::Arg) -> FlagSpec {
688 let name = a
689 .get_long()
690 .map(|s| format!("--{s}"))
691 .or_else(|| a.get_short().map(|c| format!("-{c}")))
692 .unwrap_or_else(|| a.get_id().to_string());
693 // Boolean switches → `Bool`; value-enum flags → `Enum` with the
694 // accepted values harvested from clap directly; everything else
695 // → `String`. The `possible_values()` harvest covers any future
696 // enum flag without code change (see #215).
697 let possible: Option<Vec<String>> = a
698 .get_value_parser()
699 .possible_values()
700 .map(|it| it.map(|pv| pv.get_name().to_owned()).collect());
701 let (kind, values) = if matches!(
702 a.get_action(),
703 clap::ArgAction::SetTrue | clap::ArgAction::SetFalse
704 ) {
705 (FlagKind::Bool, None)
706 } else if let Some(vs) = possible {
707 (FlagKind::Enum, Some(vs))
708 } else {
709 (FlagKind::String, None)
710 };
711 FlagSpec {
712 name,
713 kind,
714 help: a.get_help().map(|s| s.to_string()),
715 values,
716 }
717}
718
719// ---------------------------------------------------------------------------
720// Entry point
721// ---------------------------------------------------------------------------
722
723/// Run the `doiget capabilities` subcommand.
724///
725/// `capabilities` is an **artifact** command per ADR-0017 Amendment 1:
726/// its stdout output IS the deliverable (the inventory JSON an LLM
727/// reads on cold-boot). It honors only **explicit** Quiet —
728/// `--quiet` / `-q` / `--mode quiet` / `DOIGET_MODE=quiet` — and emits
729/// the inventory on every other path. The `quiet_was_explicit`
730/// discriminator is what distinguishes the two cases:
731///
732/// | mode | quiet_was_explicit | behaviour |
733/// |--------------------|--------------------|--------------------|
734/// | non-`Quiet` | - | emit |
735/// | `Quiet` (explicit) | `true` | suppress |
736/// | `Quiet` (non-TTY) | `false` | **emit** (#219) |
737///
738/// The non-TTY case is the one #219 / #220 report: an LLM tool
739/// executor captures stdout, so `stdout_is_tty()` is `false`, the
740/// resolver falls through to `Quiet`, but the caller wants the JSON
741/// inventory exactly because it's about to be machine-parsed. The
742/// table's bottom row is the fix.
743///
744/// The caller passes the live `clap::Command` so the clap walk
745/// operates on the binary's actual `Cli` tree (which the lib half of
746/// this crate can't reach directly — the `Cli` struct lives in
747/// `main.rs`).
748pub fn run(
749 cli: &clap::Command,
750 mode: super::output::OutputMode,
751 quiet_was_explicit: bool,
752) -> Result<()> {
753 // ADR-0017 Amendment 1: artifact command — suppress ONLY on
754 // explicit Quiet, never on the non-TTY implicit fallback.
755 if mode == super::output::OutputMode::Quiet && quiet_was_explicit {
756 return Ok(());
757 }
758 let caps = build_capabilities(cli);
759 let s = serde_json::to_string_pretty(&caps).context("serialise capabilities inventory")?;
760 // `print_stdout` workspace-deny; localised allow at the
761 // sanctioned product-output sink. See `commands/csl.rs`'s pattern.
762 #[allow(clippy::print_stdout)]
763 {
764 println!("{s}");
765 }
766 Ok(())
767}
768
769// ---------------------------------------------------------------------------
770// Tests
771// ---------------------------------------------------------------------------
772
773#[cfg(test)]
774#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
775mod tests {
776 use super::*;
777
778 /// Mirrors the `Cli` struct in `main.rs` for lib-test reach.
779 /// `commands::capabilities` is library-level; the binary-only
780 /// `Cli` struct can't be reached from here, so we re-derive a
781 /// shadow whose subcommand list is identical. The
782 /// `cli_shadow_matches_main_cli` integration test in
783 /// `tests/capabilities_e2e.rs` runs the real binary and asserts
784 /// the wire output matches.
785 fn test_cli() -> clap::Command {
786 use clap::{Arg, ArgAction, Command};
787 let mode_values = ["human", "json", "quiet", "mcp"];
788 let cmd = Command::new("doiget")
789 .arg(
790 Arg::new("mode")
791 .long("mode")
792 .global(true)
793 .value_parser(clap::builder::PossibleValuesParser::new(mode_values))
794 .help("Output mode (human|json|quiet|mcp)."),
795 )
796 .arg(
797 Arg::new("json")
798 .long("json")
799 .global(true)
800 .action(ArgAction::SetTrue)
801 .help("Short for `--mode json`."),
802 )
803 .arg(
804 Arg::new("quiet")
805 .long("quiet")
806 .short('q')
807 .global(true)
808 .action(ArgAction::SetTrue)
809 .help("Short for `--mode quiet`."),
810 )
811 .subcommand(
812 Command::new("fetch")
813 .about("Fetch a single paper PDF")
814 .arg(Arg::new("ref").required(true))
815 .arg(
816 Arg::new("dry-run")
817 .long("dry-run")
818 .action(ArgAction::SetTrue),
819 ),
820 )
821 .subcommand(
822 Command::new("batch")
823 .about("Fetch many refs")
824 .arg(Arg::new("path").required(true))
825 .arg(
826 Arg::new("dry-run")
827 .long("dry-run")
828 .action(ArgAction::SetTrue),
829 ),
830 )
831 .subcommand(
832 Command::new("info")
833 .about("Show metadata")
834 .arg(Arg::new("ref").required(true)),
835 )
836 .subcommand(Command::new("list-recent").about("List recent"))
837 .subcommand(
838 Command::new("search")
839 .about("Search local")
840 .arg(Arg::new("query").required(true)),
841 )
842 .subcommand(
843 Command::new("bib")
844 .about("BibTeX export")
845 .arg(Arg::new("ref").required(true)),
846 )
847 .subcommand(
848 Command::new("cite")
849 .about("Live BibTeX")
850 .arg(Arg::new("ref").required(true)),
851 )
852 .subcommand(
853 Command::new("csl")
854 .about("CSL export")
855 .arg(Arg::new("ref").required(true)),
856 )
857 .subcommand(
858 Command::new("audit-log")
859 .about("Audit log")
860 .arg(Arg::new("verify").long("verify").action(ArgAction::SetTrue)),
861 )
862 .subcommand(Command::new("provenance").about("Provenance ops"))
863 .subcommand(
864 Command::new("config")
865 .about("Config")
866 .arg(Arg::new("action").required(true)),
867 )
868 .subcommand(Command::new("serve").about("MCP server"));
869 // `graph` is `#[cfg(feature = "citation")]` in main.rs; mirror
870 // the gate so the shadow CLI matches the production surface
871 // (see #215).
872 #[cfg(feature = "citation")]
873 let cmd = cmd.subcommand(
874 Command::new("graph")
875 .about("Citation graph")
876 .arg(Arg::new("ref").required(true)),
877 );
878 cmd.subcommand(Command::new("capabilities").about("Capabilities"))
879 }
880
881 fn caps() -> Capabilities {
882 build_capabilities(&test_cli())
883 }
884
/// The inventory serialises, parses back as JSON, and exposes every
/// documented top-level key.
#[test]
fn capabilities_serialises_to_valid_json() {
    const TOP_LEVEL_KEYS: [&str; 9] = [
        "version",
        "features",
        "modes",
        "global_flags",
        "subcommands",
        "env_vars",
        "mcp_tools",
        "docs",
        "user_extension_count",
    ];

    let rendered = serde_json::to_string_pretty(&caps()).expect("serialise");
    let parsed: serde_json::Value = serde_json::from_str(&rendered).expect("parse round-trip");

    for key in TOP_LEVEL_KEYS {
        let present = parsed.get(key).is_some();
        assert!(
            present,
            "top-level key `{key}` missing from capabilities JSON: {parsed}"
        );
    }
}
906
/// `modes` must list exactly the `OutputMode { Human, Json, Quiet,
/// Mcp }` values, in that order.
#[test]
fn modes_field_matches_output_mode_enum() {
    let expected: &[&str] = &["human", "json", "quiet", "mcp"];
    let inventory = caps();
    assert_eq!(inventory.modes, expected);
}
912
/// Every row of `ENV_VARS` must be namespaced under `DOIGET_`.
#[test]
fn env_vars_all_use_doiget_prefix() {
    // Report the first offender, if any.
    let offender = ENV_VARS.iter().find(|ev| !ev.name.starts_with("DOIGET_"));
    if let Some(bad) = offender {
        panic!(
            "env var name MUST use DOIGET_ prefix, got `{}`",
            bad.name
        );
    }
}
923
/// Every entry in the `MCP_TOOLS` table must carry a name that starts
/// with `doiget_`.
#[test]
fn mcp_tools_all_use_doiget_prefix() {
    for name in MCP_TOOLS.iter().map(|tool| tool.name) {
        let prefixed = name.starts_with("doiget_");
        assert!(
            prefixed,
            "MCP tool name MUST use doiget_ prefix, got `{}`",
            name
        );
    }
}
934
/// Each example attached to a subcommand must invoke `doiget` and
/// must mention the name of the subcommand it belongs to.
#[test]
fn subcommand_examples_reference_the_subcommand_name() {
    let inventory = caps();
    for sub in inventory.subcommands.iter() {
        for example in sub.examples.iter() {
            // An example may begin with an env assignment (the `graph`
            // ones carry `DOIGET_ENABLE_OPENALEX=1`), so `doiget` is
            // accepted either at the very start or after a space.
            let invokes_doiget =
                example.starts_with("doiget ") || example.contains(" doiget ");
            assert!(
                invokes_doiget,
                "example `{ex}` for `{}` must invoke `doiget` somewhere",
                sub.name,
                ex = example
            );

            let mentions_subcommand = example.contains(&sub.name);
            assert!(
                mentions_subcommand,
                "example `{ex}` does not mention subcommand `{}`",
                sub.name,
                ex = example
            );
        }
    }
}
954
// Drift guard for the static `ENV_VARS` table (#215): the set of names
// in the table must equal the set listed in this test, exactly. The
// list below is the source of truth at test time, so a new DOIGET_*
// env var has to be added to `ENV_VARS` and to this list in lockstep.
// CHANGELOG records cross-PR changes.
#[test]
fn env_vars_exact_set_matches_expected() {
    use std::collections::BTreeSet;

    let actual: BTreeSet<&str> = ENV_VARS.iter().map(|entry| entry.name).collect();
    let expected: BTreeSet<&str> = BTreeSet::from([
        // Documented in CONFIG.md §4:
        "DOIGET_STORE_ROOT",
        "DOIGET_CACHE_ROOT",
        "DOIGET_LOG_PATH",
        "DOIGET_LOG_RETENTION_DAYS",
        "DOIGET_USER_AGENT",
        "DOIGET_UNPAYWALL_EMAIL",
        "DOIGET_MODE",
        // Reachable from code, documented in code-level docs or
        // CAPABILITY.md rather than CONFIG.md §4:
        "DOIGET_CONTACT_EMAIL",
        "DOIGET_ENABLE_OPENALEX",
        // Base-URL overrides for tests / wiremock:
        "DOIGET_ARXIV_BASE",
        "DOIGET_CROSSREF_BASE",
        "DOIGET_UNPAYWALL_BASE",
    ]);

    assert_eq!(
        actual, expected,
        "ENV_VARS table drifted from the expected canonical set; \
         update both `ENV_VARS` and this test together (and CONFIG.md §4 \
         if the new var is user-documented)."
    );
}
989
// Drift guard for the static `MCP_TOOLS` table (#215): the set of tool
// names in the table must equal the set listed in this test, exactly.
// The list is meant to track `docs/MCP_TOOLS.md` §1.
#[test]
fn mcp_tools_exact_set_matches_expected() {
    use std::collections::BTreeSet;

    let actual: BTreeSet<&str> = MCP_TOOLS.iter().map(|tool| tool.name).collect();
    let expected: BTreeSet<&str> = BTreeSet::from([
        "doiget_resolve_paper",
        "doiget_fetch_paper",
        "doiget_metadata_only",
        "doiget_batch_fetch",
        "doiget_info",
        "doiget_search_local",
        "doiget_list_recent",
        "doiget_paper_pdf_path",
        "doiget_capability_profile",
        "doiget_health",
        "doiget_expand_citation_graph",
        "doiget_bibtex_export",
        "doiget_csl_export",
        "doiget_batch_from_bibliography",
    ]);

    assert_eq!(
        actual, expected,
        "MCP_TOOLS table drifted from the expected set; update both \
         `MCP_TOOLS` and this test together (and docs/MCP_TOOLS.md §1)."
    );
}
1019
// Wire-shape pin for `JsonMode` (#215 N1): the variants checked here
// (`Artifact`, `Supported`) must serialise to a `{"status":"…"}`
// object. Losing the status tag or renaming a discriminant would
// change these strings, and this test is what notices.
#[test]
fn json_mode_serialises_with_status_discriminant() {
    let render = |mode: &JsonMode| serde_json::to_string(mode).expect("serialise");

    let artifact = render(&JsonMode::Artifact);
    assert_eq!(
        artifact, r#"{"status":"artifact"}"#,
        "Artifact must emit a status-tagged object"
    );

    let supported = render(&JsonMode::Supported);
    assert_eq!(supported, r#"{"status":"supported"}"#);
}
1035
// Contract pin for the `--mode` entry in `global_flags` (#215 N3).
// Since #215 `arg_to_flag_spec` is expected to harvest accepted values
// from clap's `PossibleValuesParser` rather than special-casing
// `--mode`; this test requires the entry to report `FlagKind::Enum`
// and to carry all four mode strings. Without it, a silent fallback
// to `kind: String, values: None` would pass every other test.
#[test]
fn mode_flag_carries_enum_kind_and_all_four_values() {
    let inventory = caps();
    let mode = inventory
        .global_flags
        .iter()
        .find(|flag| flag.name == "--mode")
        .expect("--mode flag is in global_flags");

    let is_enum = matches!(mode.kind, FlagKind::Enum);
    assert!(is_enum, "--mode kind MUST be Enum, got {:?}", mode.kind);

    // Compare order-insensitively: sort a copy before checking.
    let mut sorted = mode
        .values
        .as_ref()
        .expect("--mode carries values")
        .clone();
    sorted.sort();
    assert_eq!(sorted, vec!["human", "json", "mcp", "quiet"]);
}
1060
// `compile_time_features()` reports feature names as string literals,
// which have to match the Cargo feature names exactly: a typo such as
// `"oa_only"` for `"oa-only"` would silently misreport the inventory's
// `features` field. When `oa-only` is compiled in, the returned list
// must contain that exact literal (#215 A9).
#[test]
fn compile_time_features_contains_oa_only_under_default() {
    // If the `oa-only` feature is not active for this test build there
    // is nothing to check; the test then passes vacuously instead of
    // failing falsely.
    if !cfg!(feature = "oa-only") {
        return;
    }

    let features = compile_time_features();
    assert!(
        features.contains(&"oa-only"),
        "oa-only feature was enabled at compile time but \
         `compile_time_features()` did not list it: {f:?}",
        f = features
    );
}
1082
/// The inventory's `version` must equal the crate version that Cargo
/// exposes at compile time through `CARGO_PKG_VERSION`.
#[test]
fn version_is_cargo_pkg_version() {
    let inventory = caps();
    let crate_version = env!("CARGO_PKG_VERSION");
    assert_eq!(inventory.version, crate_version);
}
1087
/// ADR-0028 D2: `user_extension_count` must equal the number of
/// `[[network.additional_hosts]]` entries present in
/// `<config_dir>/doiget/config.toml`.
///
/// The test writes a config with two host entries under a tempdir,
/// sets `XDG_CONFIG_HOME` to that tempdir, clears `APPDATA`, `HOME`
/// and `USERPROFILE`, and expects the inventory to report `2`.
/// Drift here would silently hide user-curated allowlist hosts from
/// the cold-boot JSON.
#[test]
#[serial_test::serial]
fn user_extension_count_reflects_config_toml_entries() {
    let tmp = tempfile::TempDir::new().expect("tempdir");
    let cfg_root = camino::Utf8Path::from_path(tmp.path()).expect("utf8 tempdir");

    let doiget_dir = cfg_root.join("doiget");
    std::fs::create_dir_all(doiget_dir.as_std_path()).expect("mk dir");

    // Two `additional_hosts` tables; the second one also carries a note.
    let two_host_config = concat!(
        "[[network.additional_hosts]]\n",
        "host = \"example.org\"\n",
        "\n",
        "[[network.additional_hosts]]\n",
        "host = \"*.example.net\"\n",
        "note = \"university OA mirror\"\n",
    );
    let config_toml = doiget_dir.join("config.toml");
    std::fs::write(config_toml.as_std_path(), two_host_config).expect("write config.toml");

    // The guards restore the previous environment when they go out of
    // scope at the end of the test.
    let _xdg = EnvGuard::set("XDG_CONFIG_HOME", cfg_root.as_str());
    let _appdata = EnvGuard::unset("APPDATA");
    let _home = EnvGuard::unset("HOME");
    let _userprofile = EnvGuard::unset("USERPROFILE");

    let inventory = build_capabilities(&test_cli());
    assert_eq!(
        inventory.user_extension_count, 2,
        "expected 2 user-extension hosts, got {}",
        inventory.user_extension_count
    );
}
1126
/// Companion case: `XDG_CONFIG_HOME` is set to an empty tempdir (no
/// `doiget/config.toml` is written), and the inventory must report a
/// `user_extension_count` of `0`. Per the original note this covers
/// the `Ok(vec![])` not-found path in `user_extension::load`.
#[test]
#[serial_test::serial]
fn user_extension_count_is_zero_without_config_toml() {
    let tmp = tempfile::TempDir::new().expect("tempdir");
    let cfg_root = camino::Utf8Path::from_path(tmp.path()).expect("utf8 tempdir");

    // The guards restore the previous environment when they go out of
    // scope at the end of the test.
    let _xdg = EnvGuard::set("XDG_CONFIG_HOME", cfg_root.as_str());
    let _appdata = EnvGuard::unset("APPDATA");
    let _home = EnvGuard::unset("HOME");
    let _userprofile = EnvGuard::unset("USERPROFILE");

    let cli = test_cli();
    let inventory = build_capabilities(&cli);
    assert_eq!(inventory.user_extension_count, 0);
}
1145
/// Scoped override for one process environment variable.
///
/// Creating a guard records the variable's current value (or its
/// absence) and then sets or removes the variable; dropping the guard
/// puts the recorded state back. Kept local to this tests module; the
/// original note says `commands::config::tests` keeps its own copy of
/// the same pattern so each module stays leaf-level cheap.
struct EnvGuard {
    /// Name of the variable being overridden.
    var: &'static str,
    /// Value observed before the override; `None` if it was unset.
    prior: Option<std::ffi::OsString>,
}

impl EnvGuard {
    /// Records the current value of `var`, then sets it to `value`.
    fn set(var: &'static str, value: &str) -> Self {
        let prior = std::env::var_os(var);
        std::env::set_var(var, value);
        Self { var, prior }
    }

    /// Records the current value of `var`, then removes it from the
    /// environment.
    fn unset(var: &'static str) -> Self {
        let prior = std::env::var_os(var);
        std::env::remove_var(var);
        Self { var, prior }
    }
}

impl Drop for EnvGuard {
    /// Restores the recorded state: re-sets the old value if there was
    /// one, otherwise removes the variable.
    fn drop(&mut self) {
        if let Some(previous) = self.prior.as_ref() {
            std::env::set_var(self.var, previous);
        } else {
            std::env::remove_var(self.var);
        }
    }
}
1175
/// Lib-layer regression guard: every subcommand registered on the
/// shadow `test_cli` (other than `help`) must have an entry in
/// `metadata_for`. Per the original note, the real `Cli::command()`
/// is exercised by the e2e test in `tests/capabilities_e2e.rs`.
#[test]
fn every_test_cli_subcommand_has_metadata() {
    let cli = test_cli();
    let names = cli
        .get_subcommands()
        .map(|sub| sub.get_name())
        .filter(|name| *name != "help");

    for name in names {
        let has_metadata = metadata_for(name).is_some();
        assert!(
            has_metadata,
            "subcommand `{name}` lacks metadata in `metadata_for`"
        );
    }
}
1193}