kanade_shared/
bootstrap.rs

1//! Idempotent JetStream bootstrap (Sprint 6.x follow-up).
2//!
3//! Lists every NATS JetStream resource the kanade fleet expects —
4//! streams, KV buckets, Object Stores — and asks the broker to
5//! create-or-update them. v0.25.0 switched from `create_*` to
6//! `create_or_update_*`: the old form returned error 10058 ("name
7//! already in use with a different configuration") when a release
8//! widened a stream's subjects or changed its retention policy on
9//! a broker that still held the older config. With the new form the
10//! broker reconciles its definition to the one in this file, so
11//! version bumps no longer require operator-side data wipes.
12//!
13//! Centralising the list here means a future "we added a new
14//! bucket" change touches one place and both the operator CLI +
15//! the auto-bootstrap path pick it up.
16
17use std::time::Duration;
18
19use anyhow::{Context, Result};
20use async_nats::jetstream::{
21    self,
22    kv::Config as KvConfig,
23    object_store::Config as ObjectStoreConfig,
24    stream::{Config as StreamConfig, DiscardPolicy},
25};
26use tracing::{info, warn};
27
28use crate::kv::{
29    BUCKET_AGENT_CONFIG, BUCKET_AGENT_GROUPS, BUCKET_AGENT_GROUPS_DERIVED, BUCKET_AGENT_META,
30    BUCKET_AGENTS_STATE, BUCKET_FLEET_CONFIG, BUCKET_GROUP_CONTACTS, BUCKET_JOBS, BUCKET_JOBS_YAML,
31    BUCKET_NOTIFICATIONS_READ, BUCKET_SCHEDULES, BUCKET_SCHEDULES_YAML, BUCKET_SCRIPT_CURRENT,
32    BUCKET_SCRIPT_STATUS, BUCKET_SERVER_SETTINGS, OBJECT_AGENT_RELEASES, OBJECT_APP_PACKAGES,
33    OBJECT_COLLECTIONS, OBJECT_RESULT_OUTPUT, OBJECT_SCRIPTS, STREAM_AUDIT, STREAM_EVENTS,
34    STREAM_EXEC, STREAM_INVENTORY, STREAM_NOTIFICATIONS, STREAM_OBS_EVENTS, STREAM_RESULTS,
35};
36use crate::wire::DEFAULT_COLLECT_RETENTION_DAYS;
37
38/// Create-or-update an Object Store, but never let it wedge backend
39/// startup. `create_object_store` neither reconciles an existing
40/// store's config nor has a `create_or_update` form in async-nats
41/// 0.49, so a store whose desired config drifted — e.g. the #518
42/// `max_bytes` cap added after the bucket was first created uncapped,
43/// which the broker then rejects with error 10058 ("stream name
44/// already in use with a different configuration") — would otherwise
45/// fail `ensure_jetstream_resources` and crash the backend on boot
46/// (production outage 2026-06-11). Fall back to the existing store
47/// (uncapped, as it already was) and warn. #506 tracks real
48/// reconciliation of object-store config.
49async fn ensure_object_store(js: &jetstream::Context, cfg: ObjectStoreConfig) -> Result<()> {
50    let name = cfg.bucket.clone();
51    if let Err(e) = js.create_object_store(cfg).await {
52        // The fallback is deliberately broad — any create error is
53        // tolerated AS LONG AS the store already exists, because the
54        // alternative is a wedged backend and "never crash on boot"
55        // wins over "surface this specific error". The expected error
56        // is 10058 (config drift, the incident), but auth/network
57        // blips on an already-bootstrapped broker take this path too;
58        // they remain visible via the `warn!`. Only a genuine
59        // "can't create AND doesn't exist" is fatal.
60        if js.get_object_store(&name).await.is_err() {
61            return Err(e).with_context(|| {
62                format!("create_object_store {name} (and no existing store to fall back to)")
63            });
64        }
65        warn!(
66            store = %name, error = %e,
67            "object store exists with a different config; using it as-is (cap not reconciled)",
68        );
69    }
70    info!(store = %name, "ready");
71    Ok(())
72}
73
74/// Idempotently create every NATS JetStream resource the kanade
75/// fleet relies on. Calling repeatedly is safe — `create_*` returns
76/// the existing resource if it's already configured.
77///
78/// Returns once every resource is in place. The function is async
79/// so backends can `await` it as part of their startup sequence
80/// (one round-trip per resource — ~10 RTTs total).
81pub async fn ensure_jetstream_resources(js: &jetstream::Context) -> Result<()> {
82    // ── Streams ──────────────────────────────────────────────────
83    // #518: every stream carries a `max_bytes` cap with
84    // `Discard::Old` on top of its `max_age` window. Within their
85    // age windows the streams used to be unbounded by size, and
86    // JetStream's file store shares a disk with SQLite on the
87    // backend host — one job printing 200 KB per run fleet-wide
88    // could exhaust the store, at which point EVERY publish fails
89    // (results, obs, audit, KV puts). With the caps, worst-case
90    // degradation is "shorter history on the offending stream"
91    // instead of "broker down".
92    //
93    // Sizing: JetStream RESERVES each `max_bytes` against its
94    // available storage (min of max_file_store and free disk) at
95    // create/update time and fails with error 10047 when the sum
96    // doesn't fit, so these must stay small enough for modest
97    // hosts. That's fine: every stream here is a transport +
98    // replay buffer — the durable record is the backend's SQLite
99    // (results/inventory/obs/audit are all projected within
100    // seconds) — so the caps are runaway-output backstops, not
101    // history budgets. Total reservation ≈ 5.3 GiB including the
102    // result_output object store below.
103    const MIB: i64 = 1024 * 1024;
104    const GIB: i64 = 1024 * MIB;
105
106    // INVENTORY — 90-day rolling history (spec §2.3.1).
107    js.create_or_update_stream(StreamConfig {
108        name: STREAM_INVENTORY.into(),
109        subjects: vec!["inventory.>".into()],
110        max_age: Duration::from_secs(90 * 24 * 60 * 60),
111        max_bytes: GIB,
112        discard: DiscardPolicy::Old,
113        ..Default::default()
114    })
115    .await
116    .with_context(|| format!("create_or_update_stream {STREAM_INVENTORY}"))?;
117    info!(stream = STREAM_INVENTORY, "ready");
118
119    // RESULTS — 30-day rolling history. The biggest producer by
120    // far (every job run on every PC, with up to 256 KB of inline
121    // stdout/stderr per message), so it gets the largest slice of
122    // the disk budget.
123    js.create_or_update_stream(StreamConfig {
124        name: STREAM_RESULTS.into(),
125        subjects: vec!["results.>".into()],
126        max_age: Duration::from_secs(30 * 24 * 60 * 60),
127        max_bytes: 2 * GIB,
128        discard: DiscardPolicy::Old,
129        ..Default::default()
130    })
131    .await
132    .with_context(|| format!("create_or_update_stream {STREAM_RESULTS}"))?;
133    info!(stream = STREAM_RESULTS, "ready");
134
135    // EXEC — latest-per-subject only (spec §2.6 Layer 1). v0.22.1:
136    // catch the existing `commands.{all,group.X,pc.Y}` subjects so a
137    // single backend publish lands in BOTH the agent's live core
138    // subscription AND the stream's retention store. Reconnecting
139    // agents catch up via a durable consumer with
140    // `DeliverPolicy::LastPerSubject` — they receive the most
141    // recent Command per subject they care about, no matter how
142    // long they were offline (within `max_age`).
143    js.create_or_update_stream(StreamConfig {
144        name: STREAM_EXEC.into(),
145        subjects: vec!["commands.>".into()],
146        max_messages_per_subject: 1,
147        max_age: Duration::from_secs(7 * 24 * 60 * 60),
148        // Latest-per-subject keeps this tiny (one Command per
149        // group/pc subject); the cap is a backstop against subject
150        // cardinality bugs, not a working budget.
151        max_bytes: 64 * MIB,
152        discard: DiscardPolicy::Old,
153        ..Default::default()
154    })
155    .await
156    .with_context(|| format!("create_or_update_stream {STREAM_EXEC}"))?;
157    info!(stream = STREAM_EXEC, "ready");
158
159    // EVENTS — short-lived broadcast bus for kill / revoke / etc.
160    // 7-day window matches the EXEC spec window.
161    js.create_or_update_stream(StreamConfig {
162        name: STREAM_EVENTS.into(),
163        subjects: vec!["events.>".into()],
164        max_age: Duration::from_secs(7 * 24 * 60 * 60),
165        max_bytes: 256 * MIB,
166        discard: DiscardPolicy::Old,
167        ..Default::default()
168    })
169    .await
170    .with_context(|| format!("create_or_update_stream {STREAM_EVENTS}"))?;
171    info!(stream = STREAM_EVENTS, "ready");
172
173    // AUDIT — operator-action record (spec §2.3.1). The DURABLE
174    // copy is the backend's SQLite `audit_log` table (the projector
175    // INSERTs each message, idempotently since #501; 365-day
176    // retention since #486) — the stream is transport + replay
177    // buffer, not the archive, so it can be bounded like the rest.
178    // 90 days / 512 MiB is far more than the projector ever lags;
179    // previously this stream had NO limits at all, making it an
180    // unbounded disk leak on the broker host.
181    js.create_or_update_stream(StreamConfig {
182        name: STREAM_AUDIT.into(),
183        subjects: vec!["audit.>".into()],
184        max_age: Duration::from_secs(90 * 24 * 60 * 60),
185        max_bytes: 512 * MIB,
186        discard: DiscardPolicy::Old,
187        ..Default::default()
188    })
189    .await
190    .with_context(|| format!("create_or_update_stream {STREAM_AUDIT}"))?;
191    info!(stream = STREAM_AUDIT, "ready");
192
193    // OBS_EVENTS — per-PC observability timeline (Issue #246). The
194    // 90-day window matches `obs_events` table retention so a
195    // backend bootstrapping after long downtime can catch up but
196    // doesn't carry data the table will discard anyway. Subject
197    // filter `obs.>` catches every PC without a per-PC subscription.
198    //
199    // Days-to-seconds is spelt out once instead of `90 * 24 * 60 *
200    // 60` open-coded across bootstrap + cleanup; the matching prune
201    // window in `kanade-backend::cleanup` quotes the same number
202    // separately (SQLite-relative string syntax there, not a
203    // duration), so it can't share a constant — but a single
204    // arithmetic spell-out here makes the relationship grep-able.
205    const SECS_PER_DAY: u64 = 24 * 60 * 60;
206    const OBS_EVENTS_RETENTION_DAYS: u64 = 90;
207    js.create_or_update_stream(StreamConfig {
208        name: STREAM_OBS_EVENTS.into(),
209        subjects: vec!["obs.>".into()],
210        max_age: Duration::from_secs(OBS_EVENTS_RETENTION_DAYS * SECS_PER_DAY),
211        max_bytes: 512 * MIB,
212        discard: DiscardPolicy::Old,
213        ..Default::default()
214    })
215    .await
216    .with_context(|| format!("create_or_update_stream {STREAM_OBS_EVENTS}"))?;
217    info!(stream = STREAM_OBS_EVENTS, "ready");
218
219    // NOTIFICATIONS — end-user notification history (SPEC §2.3.1 /
220    // Phase E). 90-day window matches INVENTORY: a Client App that
221    // connects after a notification was sent fetches the missed ones
222    // via KLP `notifications.list`. Subject filter `notifications.>`
223    // catches every fan-out target (`all` / `group.X` / `pc.Y`) with
224    // one stream. Retains all messages per subject — each notification
225    // is its own history entry, not a latest-only state like EXEC.
226    // #518: 512 MiB cap + DiscardPolicy::Old, matching the other
227    // 90-day streams (AUDIT / OBS_EVENTS) — notification payloads are
228    // small, so this is generous headroom while still bounding the
229    // broker's disk lease.
230    js.create_or_update_stream(StreamConfig {
231        name: STREAM_NOTIFICATIONS.into(),
232        subjects: vec!["notifications.>".into()],
233        max_age: Duration::from_secs(90 * 24 * 60 * 60),
234        max_bytes: 512 * MIB,
235        discard: DiscardPolicy::Old,
236        ..Default::default()
237    })
238    .await
239    .with_context(|| format!("create_or_update_stream {STREAM_NOTIFICATIONS}"))?;
240    info!(stream = STREAM_NOTIFICATIONS, "ready");
241
242    // ── KV buckets ───────────────────────────────────────────────
243    // script_current — cmd_id → version (spec §2.6 Layer 2).
244    js.create_or_update_key_value(KvConfig {
245        bucket: BUCKET_SCRIPT_CURRENT.into(),
246        history: 5,
247        ..Default::default()
248    })
249    .await
250    .with_context(|| format!("create_or_update_key_value {BUCKET_SCRIPT_CURRENT}"))?;
251    info!(bucket = BUCKET_SCRIPT_CURRENT, "ready");
252
253    // script_status — cmd_id → ACTIVE / REVOKED.
254    js.create_or_update_key_value(KvConfig {
255        bucket: BUCKET_SCRIPT_STATUS.into(),
256        history: 5,
257        ..Default::default()
258    })
259    .await
260    .with_context(|| format!("create_or_update_key_value {BUCKET_SCRIPT_STATUS}"))?;
261    info!(bucket = BUCKET_SCRIPT_STATUS, "ready");
262
263    // agents_state — pc_id → latest hw snapshot (history=1).
264    js.create_or_update_key_value(KvConfig {
265        bucket: BUCKET_AGENTS_STATE.into(),
266        history: 1,
267        ..Default::default()
268    })
269    .await
270    .with_context(|| format!("create_or_update_key_value {BUCKET_AGENTS_STATE}"))?;
271    info!(bucket = BUCKET_AGENTS_STATE, "ready");
272
273    // agent_config — Sprint 6 layered scopes (global / groups.* /
274    // pcs.*) plus the legacy target_version key.
275    // history: 1 — agents only ever read the current value (the watch is
276    // DeliverPolicy::New + an initial_sync get(), never kv.history()).
277    // Retained old revisions only fed reconnect history-replay, which
278    // flapped self-update backward (#828). Operator change-history lives
279    // in the audit log, so keeping one revision loses nothing. (#830)
280    js.create_or_update_key_value(KvConfig {
281        bucket: BUCKET_AGENT_CONFIG.into(),
282        history: 1,
283        ..Default::default()
284    })
285    .await
286    .with_context(|| format!("create_or_update_key_value {BUCKET_AGENT_CONFIG}"))?;
287    info!(bucket = BUCKET_AGENT_CONFIG, "ready");
288
289    // agent_groups — Sprint 5 per-pc group membership.
290    // history: 1 — same reasoning as agent_config above: agents only need
291    // the current membership; replayed history just churned subscriptions
292    // through stale sets on every reconnect (a transient wrong membership,
293    // #830). One revision makes that replay material non-existent. (#830)
294    js.create_or_update_key_value(KvConfig {
295        bucket: BUCKET_AGENT_GROUPS.into(),
296        history: 1,
297        ..Default::default()
298    })
299    .await
300    .with_context(|| format!("create_or_update_key_value {BUCKET_AGENT_GROUPS}"))?;
301    info!(bucket = BUCKET_AGENT_GROUPS, "ready");
302
303    // agent_groups_derived — #1032①: per-pc DERIVED group membership written by
304    // the backend group-materializer (resolved from GroupDefs). Agents watch it
305    // alongside agent_groups and union. history: 1 for the same reason as
306    // agent_groups (#830 — agents read only the current value; replayed history
307    // churns subscriptions). Separate bucket so the materializer (sole writer
308    // here) never clobbers operator-set membership in agent_groups.
309    js.create_or_update_key_value(KvConfig {
310        bucket: BUCKET_AGENT_GROUPS_DERIVED.into(),
311        history: 1,
312        ..Default::default()
313    })
314    .await
315    .with_context(|| format!("create_or_update_key_value {BUCKET_AGENT_GROUPS_DERIVED}"))?;
316    info!(bucket = BUCKET_AGENT_GROUPS_DERIVED, "ready");
317
318    // agent_meta — per-PC operator-managed key/value annotations
319    // (edited via the SPA agent detail page / the `kanade meta` CLI, and
320    // typically bulk-populated by an operator AD-sync job). history: 5 —
321    // a few revisions of operator edit history, like group_contacts;
322    // nothing replays it, so the exact depth is not load-bearing.
323    js.create_or_update_key_value(KvConfig {
324        bucket: BUCKET_AGENT_META.into(),
325        history: 5,
326        ..Default::default()
327    })
328    .await
329    .with_context(|| format!("create_or_update_key_value {BUCKET_AGENT_META}"))?;
330    info!(bucket = BUCKET_AGENT_META, "ready");
331
332    // group_contacts — per-group notification email addresses
333    // (operator-managed via the SPA Groups page).
334    js.create_or_update_key_value(KvConfig {
335        bucket: BUCKET_GROUP_CONTACTS.into(),
336        history: 5,
337        ..Default::default()
338    })
339    .await
340    .with_context(|| format!("create_or_update_key_value {BUCKET_GROUP_CONTACTS}"))?;
341    info!(bucket = BUCKET_GROUP_CONTACTS, "ready");
342
343    // schedules — admin-API CRUD'd cron table (spec §2.5.3).
344    // Backend's scheduler.rs also creates this on startup; calling
345    // twice is harmless.
346    js.create_or_update_key_value(KvConfig {
347        bucket: BUCKET_SCHEDULES.into(),
348        history: 5,
349        ..Default::default()
350    })
351    .await
352    .with_context(|| format!("create_or_update_key_value {BUCKET_SCHEDULES}"))?;
353    info!(bucket = BUCKET_SCHEDULES, "ready");
354
355    // jobs — v0.15 operator-registered Manifest catalog. Schedules
356    // reference rows here by id; editing a job rewrites what future
357    // schedule fires exec.
358    js.create_or_update_key_value(KvConfig {
359        bucket: BUCKET_JOBS.into(),
360        history: 5,
361        ..Default::default()
362    })
363    .await
364    .with_context(|| format!("create_or_update_key_value {BUCKET_JOBS}"))?;
365    info!(bucket = BUCKET_JOBS, "ready");
366
367    // fleet_config — #418 Phase 5 fleet-wide singletons (the global
368    // change-freeze under KEY_FREEZE). history: 1 — only the current
369    // state matters; both schedulers watch it.
370    js.create_or_update_key_value(KvConfig {
371        bucket: BUCKET_FLEET_CONFIG.into(),
372        history: 1,
373        ..Default::default()
374    })
375    .await
376    .with_context(|| format!("create_or_update_key_value {BUCKET_FLEET_CONFIG}"))?;
377    info!(bucket = BUCKET_FLEET_CONFIG, "ready");
378
379    // server_settings — backend-side operator-editable settings (SPA
380    // Settings page "server settings" tab). A single JSON document under
381    // KEY_SERVER_SETTINGS; history: 1 since only the current state
382    // matters. First consumer is the cleanup task's dead-agent prune
383    // window.
384    js.create_or_update_key_value(KvConfig {
385        bucket: BUCKET_SERVER_SETTINGS.into(),
386        history: 1,
387        ..Default::default()
388    })
389    .await
390    .with_context(|| format!("create_or_update_key_value {BUCKET_SERVER_SETTINGS}"))?;
391    info!(bucket = BUCKET_SERVER_SETTINGS, "ready");
392
393    // notifications_read — per-(pc, user, notification) read/ack state
394    // (SPEC §2.3.2 / Phase E). The agent writes here on KLP
395    // `notifications.ack`; `notifications.list` reads it back to filter
396    // the unread bucket. history: 1 — only the latest ack per key
397    // matters.
398    js.create_or_update_key_value(KvConfig {
399        bucket: BUCKET_NOTIFICATIONS_READ.into(),
400        history: 1,
401        ..Default::default()
402    })
403    .await
404    .with_context(|| format!("create_or_update_key_value {BUCKET_NOTIFICATIONS_READ}"))?;
405    info!(bucket = BUCKET_NOTIFICATIONS_READ, "ready");
406
407    // jobs_yaml / schedules_yaml — operator source-of-truth YAML
408    // alongside the JSON catalogs above. Same key shape (manifest id
409    // / schedule id), but the value is the raw YAML bytes so the
410    // SPA's YAML editor preserves comments + script block-scalar
411    // indentation across edits. Agents/scheduler don't read these.
412    js.create_or_update_key_value(KvConfig {
413        bucket: BUCKET_JOBS_YAML.into(),
414        history: 5,
415        ..Default::default()
416    })
417    .await
418    .with_context(|| format!("create_or_update_key_value {BUCKET_JOBS_YAML}"))?;
419    info!(bucket = BUCKET_JOBS_YAML, "ready");
420
421    js.create_or_update_key_value(KvConfig {
422        bucket: BUCKET_SCHEDULES_YAML.into(),
423        history: 5,
424        ..Default::default()
425    })
426    .await
427    .with_context(|| format!("create_or_update_key_value {BUCKET_SCHEDULES_YAML}"))?;
428    info!(bucket = BUCKET_SCHEDULES_YAML, "ready");
429
430    // ── Object Store ─────────────────────────────────────────────
431    // agent_releases — one object per version, raw exe bytes.
432    ensure_object_store(
433        js,
434        ObjectStoreConfig {
435            bucket: OBJECT_AGENT_RELEASES.into(),
436            ..Default::default()
437        },
438    )
439    .await?;
440
441    // app_packages — generic operator-uploaded binary distribution
442    // (kanade-client today; third-party installers like Webex /
443    // Teams once those flows land). Object keys are
444    // `<name>/<version>`; see `kanade-shared::kv::OBJECT_APP_PACKAGES`
445    // for the full rationale.
446    ensure_object_store(
447        js,
448        ObjectStoreConfig {
449            bucket: OBJECT_APP_PACKAGES.into(),
450            ..Default::default()
451        },
452    )
453    .await?;
454
455    // scripts — manifest script bodies referenced by
456    // `Execute::script_object` (SPEC §2.4.1). Sibling of
457    // `app_packages`; see `kanade-shared::kv::OBJECT_SCRIPTS` for
458    // the bucket-split rationale (smaller payloads + manifest-
459    // coupled lifecycle vs operator-curated installers).
460    ensure_object_store(
461        js,
462        ObjectStoreConfig {
463            bucket: OBJECT_SCRIPTS.into(),
464            ..Default::default()
465        },
466    )
467    .await?;
468
469    // result_output — overflow stdout / stderr blobs for the
470    // `ExecResult` wire kind (#227). Anything larger than the agent's
471    // 256 KB inline threshold gets uploaded here under
472    // `<request_id>/{stdout,stderr}`; the backend's results
473    // projector derefs the pointer fields before INSERT so SQLite
474    // + the SPA see the full text inline. 30-day max_age matches
475    // STREAM_RESULTS so the lifetimes stay in lockstep — a row still
476    // resolvable in execution_results never points at a missing
477    // blob.
478    // #518: capped like the streams — a job whose output overflows
479    // the inline threshold writes blobs HERE instead of
480    // STREAM_RESULTS, so without its own cap this store bypasses
481    // the stream budget entirely and can still fill the file store.
482    // The projector derefs blobs within seconds of publish, so
483    // eviction only ever hits already-projected (or expired)
484    // output.
485    ensure_object_store(
486        js,
487        ObjectStoreConfig {
488            bucket: OBJECT_RESULT_OUTPUT.into(),
489            max_age: Duration::from_secs(SECS_PER_DAY * 30),
490            max_bytes: GIB,
491            ..Default::default()
492        },
493    )
494    .await?;
495
496    // #219: collected file bundles. A `collect:` job's agent zips the
497    // script's listed files and uploads the archive here under
498    // `<pc_id>/<job_id>/<rfc3339>.zip`; the SPA Collect page lists /
499    // downloads them. Default max_age = DEFAULT_COLLECT_RETENTION_DAYS —
500    // bundles are debugging / audit artifacts (not curated config like
501    // app_packages / scripts), so they auto-expire and the bucket doesn't
502    // grow unbounded. Capped at 5 GiB (DiscardPolicy::Old evicts oldest
503    // first) so a fleet's worth of bundles can't fill the file store.
504    //
505    // This is only the value a FRESH bucket is born with; the window is
506    // operator-tunable from the SPA (`ServerSettings::collect_retention_days`)
507    // and the backend reconciles the live bucket's max_age to the configured
508    // value at boot and on save — see [`reconcile_collect_retention`].
509    ensure_object_store(
510        js,
511        ObjectStoreConfig {
512            bucket: OBJECT_COLLECTIONS.into(),
513            max_age: Duration::from_secs(SECS_PER_DAY * DEFAULT_COLLECT_RETENTION_DAYS as u64),
514            max_bytes: 5 * GIB,
515            ..Default::default()
516        },
517    )
518    .await?;
519
520    Ok(())
521}
522
/// Name of the JetStream stream that backs the Object Store `bucket`.
///
/// NATS stores an Object Store in a stream called `OBJ_<bucket>` (the
/// counterpart of `KV_<bucket>` for key-value stores). The collect
/// bucket's retention is reconciled through this stream because
/// async-nats 0.49 has no create-or-update / reconcile form for Object
/// Stores themselves (the same gap [`ensure_object_store`] works around),
/// whereas the underlying stream *does* support `update_stream`.
fn object_store_stream_name(bucket: &str) -> String {
    ["OBJ_", bucket].concat()
}
532
533/// Reconcile the `collections` Object Store's retention window to
534/// `retention_days` by updating the `max_age` on its backing stream.
535///
536/// Why this exists: the bucket is created once (at bootstrap) with the
537/// built-in default, and `create_object_store` neither has a
538/// create-or-update form nor reconciles config in async-nats 0.49. So to
539/// honour an operator's `ServerSettings::collect_retention_days` change on an
540/// already-provisioned bucket, we read the backing stream's config, patch
541/// **only** `max_age` (a read-modify-write that leaves every object-store-
542/// specific stream setting untouched), and `update_stream`. `max_bytes`
543/// and the discard policy are deliberately left as-is, so extending the
544/// window never lifts the 5 GiB disk ceiling.
545///
546/// Idempotent: if the stream's `max_age` already matches, it's a no-op
547/// (skips the update round-trip and returns `false`). A missing stream (the
548/// bucket was never provisioned — e.g. a broker that predates this feature
549/// and hasn't run bootstrap) is a soft error the caller can log-and-continue:
550/// bootstrap runs before this on the backend boot path, so in practice the
551/// stream is always present.
552///
553/// Returns `Ok(true)` when it actually changed the stream, `Ok(false)` when
554/// already in sync.
555pub async fn reconcile_collect_retention(
556    js: &jetstream::Context,
557    retention_days: u32,
558) -> Result<bool> {
559    const SECS_PER_DAY: u64 = 24 * 60 * 60;
560    let desired = Duration::from_secs(SECS_PER_DAY * retention_days as u64);
561    let stream_name = object_store_stream_name(OBJECT_COLLECTIONS);
562
563    let mut stream = js
564        .get_stream(&stream_name)
565        .await
566        .with_context(|| format!("get_stream {stream_name} for collect-retention reconcile"))?;
567    let info = stream
568        .info()
569        .await
570        .with_context(|| format!("stream info {stream_name}"))?;
571    if info.config.max_age == desired {
572        return Ok(false);
573    }
574    let mut cfg = info.config.clone();
575    cfg.max_age = desired;
576    js.update_stream(cfg)
577        .await
578        .with_context(|| format!("update_stream {stream_name} max_age"))?;
579    info!(
580        stream = %stream_name,
581        retention_days,
582        "collect retention: reconciled Object Store max_age",
583    );
584    Ok(true)
585}
586
#[cfg(test)]
mod tests {
    use super::*;
    use std::process::Stdio;

    /// A disposable `nats-server -js` listening on a random port, in the
    /// same spirit as the kv_cas_live / offline_boot harnesses. Only the
    /// `#[ignore]`d tests below use it. The child process and the storage
    /// directory are held so they live as long as the `Broker` does.
    struct Broker {
        js: jetstream::Context,
        _server: tokio::process::Child,
        _storage: tempfile::TempDir,
    }

    /// Start a throwaway broker and connect to it, retrying the connect
    /// up to 50 times at 100 ms intervals before giving up.
    async fn spawn_broker() -> Broker {
        let port = portpicker::pick_unused_port().expect("pick port");
        let storage = tempfile::TempDir::new().expect("storage tempdir");

        let mut cmd = tokio::process::Command::new("nats-server");
        cmd.arg("-js")
            .arg("-p")
            .arg(port.to_string())
            .arg("-sd")
            .arg(storage.path())
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .kill_on_drop(true);
        let server = cmd.spawn().expect("spawn nats-server (is it in PATH?)");

        let url = format!("nats://127.0.0.1:{port}");
        let mut client = None;
        let mut attempts_left = 50;
        while client.is_none() && attempts_left > 0 {
            attempts_left -= 1;
            match async_nats::connect(&url).await {
                Ok(c) => client = Some(c),
                Err(_) => tokio::time::sleep(Duration::from_millis(100)).await,
            }
        }
        let client = client.expect("nats-server did not come up in 5s");

        Broker {
            js: jetstream::new(client),
            _server: server,
            _storage: storage,
        }
    }

    /// Regression test for #506 / the 2026-06-11 incident. Because
    /// `create_object_store` cannot reconcile config, adding the #518
    /// `max_bytes` cap to a store that was first created uncapped made
    /// the broker reject the create (error 10058 "name already in use
    /// with a different configuration"), which crashed the backend on
    /// boot. `ensure_object_store` has to accept the existing store so
    /// startup can proceed.
    #[tokio::test]
    #[ignore = "requires nats-server in PATH; cargo test -- --ignored"]
    async fn ensure_object_store_accepts_config_drift() {
        let b = spawn_broker().await;

        // Step 1: create uncapped, the way the pre-#518 backend did.
        let uncapped = ObjectStoreConfig {
            bucket: "result_output".into(),
            ..Default::default()
        };
        ensure_object_store(&b.js, uncapped)
            .await
            .expect("fresh create");

        // Step 2: ask again with a conflicting (capped) config. This
        // must NOT error — the existing store is accepted.
        let capped = ObjectStoreConfig {
            bucket: "result_output".into(),
            max_bytes: 1024 * 1024 * 1024,
            ..Default::default()
        };
        ensure_object_store(&b.js, capped)
            .await
            .expect("config drift must not wedge startup");

        // Step 3: the store still takes writes.
        let store = b.js.get_object_store("result_output").await.expect("store");
        let mut payload = &b"hi"[..];
        store.put("k", &mut payload).await.expect("put after drift");
    }

    /// The ordinary first-boot path: creating a brand-new capped store
    /// succeeds on a broker with room for it.
    #[tokio::test]
    #[ignore = "requires nats-server in PATH; cargo test -- --ignored"]
    async fn ensure_object_store_fresh_create_with_cap() {
        let b = spawn_broker().await;
        let cfg = ObjectStoreConfig {
            bucket: "fresh".into(),
            max_bytes: 64 * 1024 * 1024,
            ..Default::default()
        };
        ensure_object_store(&b.js, cfg)
            .await
            .expect("fresh capped create");
        b.js.get_object_store("fresh").await.expect("exists");
    }

    /// The fatal path: a create failure for a store that ALSO does not
    /// exist must propagate, since only errors with a fallback are
    /// swallowed. An invalid bucket name is used to trigger it — create
    /// fails, and no store exists to fall back to.
    #[tokio::test]
    #[ignore = "requires nats-server in PATH; cargo test -- --ignored"]
    async fn ensure_object_store_propagates_when_no_fallback() {
        let b = spawn_broker().await;
        // Spaces / '!' are rejected by the object-store name rules, so
        // create fails and the follow-up get finds nothing either.
        let invalid = ObjectStoreConfig {
            bucket: "bad name!".into(),
            ..Default::default()
        };
        let err = ensure_object_store(&b.js, invalid)
            .await
            .expect_err("a create failure with no existing store must be fatal");
        let rendered = err.to_string();
        assert!(
            rendered.contains("no existing store to fall back to"),
            "unexpected error: {err:#}",
        );
    }

    /// `reconcile_collect_retention` must move the live bucket's `max_age`
    /// (broker-side retention) while leaving the rest of the stream config
    /// alone — this is what the SPA relies on to extend collect retention
    /// beyond the 30-day default. Also covers the idempotent no-op path
    /// (`Ok(false)` when already in sync) and checks that `max_bytes`
    /// survives the update.
    #[tokio::test]
    #[ignore = "requires nats-server in PATH; cargo test -- --ignored"]
    async fn reconcile_collect_retention_updates_max_age() {
        use crate::kv::OBJECT_COLLECTIONS;
        const SECS_PER_DAY: u64 = 24 * 60 * 60;
        let b = spawn_broker().await;

        // Provision the collections bucket as bootstrap would: 30-day
        // default max_age under a 5 GiB cap.
        let provisioned = ObjectStoreConfig {
            bucket: OBJECT_COLLECTIONS.into(),
            max_age: Duration::from_secs(SECS_PER_DAY * 30),
            max_bytes: 5 * 1024 * 1024 * 1024,
            ..Default::default()
        };
        ensure_object_store(&b.js, provisioned)
            .await
            .expect("fresh collections bucket");

        let stream_name = object_store_stream_name(OBJECT_COLLECTIONS);

        // Extending to 90 days must report a change on the first call.
        let first_changed = reconcile_collect_retention(&b.js, 90)
            .await
            .expect("reconcile to 90d");
        assert!(first_changed, "first reconcile should report a change");

        let mut stream = b.js.get_stream(&stream_name).await.expect("stream");
        let info = stream.info().await.expect("info");
        assert_eq!(
            info.config.max_age,
            Duration::from_secs(SECS_PER_DAY * 90),
            "max_age must be extended to 90 days",
        );
        assert_eq!(
            info.config.max_bytes,
            5 * 1024 * 1024 * 1024,
            "the size cap must survive the max_age-only update",
        );

        // Applying the same value again is a no-op (no revision-bumping
        // update is sent).
        let second_changed = reconcile_collect_retention(&b.js, 90)
            .await
            .expect("idempotent reconcile");
        assert!(
            !second_changed,
            "second reconcile with the same value should be a no-op",
        );
    }
}
kanade_shared/bootstrap.rs

kanade_shared/
bootstrap.rs