Skip to main content

ai_memory/
metrics.rs

1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3
4//! v0.6.0.0 Prometheus metrics. Exposed at `GET /metrics` by the daemon.
5//!
6//! Minimal, non-invasive instrumentation — the process has a single
7//! default `Registry`, a handful of global counters and a couple of
8//! histograms. Callers increment via the typed helpers (`record_store`,
9//! `record_recall`) rather than poking the registry directly so a future
10//! metrics-backend swap stays internal.
11
12use std::sync::OnceLock;
13use std::sync::atomic::{AtomicU64, Ordering};
14
15use prometheus::{
16    Encoder, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, Registry,
17    TextEncoder,
18};
19
20// =====================================================================
21// pm-v3.1 PR8 (issue #1174) — HNSW eviction observability.
22//
23// Pre-PR8 lived as two free `static AtomicU64`s at the top of
24// `src/hnsw.rs`. Class A "SHOULD" extraction: the counters are
25// metrics-bound (surfaced in `/metrics`, `memory_capabilities`,
26// `memory_stats`) so the metrics registry is the natural owner.
27//
28// The Prometheus handles (`hnsw_evictions_total` IntCounter +
29// `hnsw_last_eviction_at_nanos` IntGauge in `Metrics`) carry the
30// scrape-side wiring. The atomics below carry the read-side logic
31// (`evicted_recently` 60s window) AND the test-only reset path that
// prometheus's monotonic-counter discipline does not support. The two
// sides are kept in lockstep by the `record_hnsw_eviction` sink; the
// `reset_hnsw_eviction_counters_for_test` resetter zeroes the atomics
// and the gauge but deliberately leaves the Prometheus counter alone.
35//
36// Process-local. The counters reset on restart because the index
37// itself resets on restart. Both atomics are touched only on the
38// eviction edge (rare: requires >100k vectors), so there is no
39// measurable hot-path cost.
40// =====================================================================
41
/// Process-local cumulative HNSW eviction count. Incremented by
/// `record_hnsw_eviction`, read by `hnsw_evictions_total`, and zeroed
/// by `reset_hnsw_eviction_counters_for_test`.
static HNSW_EVICTIONS_TOTAL: AtomicU64 = AtomicU64::new(0);
/// Process-local timestamp (as passed to `record_hnsw_eviction` in
/// `now_nanos`) of the most recent HNSW eviction; `0` until the first
/// eviction is recorded or after a test reset.
static HNSW_LAST_EVICTION_AT_NANOS: AtomicU64 = AtomicU64::new(0);
44
45/// Record one HNSW eviction event. Bumps the process-local cumulative
46/// counter by `count`, sets the last-eviction wall-clock nanos to
47/// `now_nanos`, and mirrors both onto the Prometheus registry handles
48/// so `/metrics` scrapes see the same signal without a separate
49/// observer thread.
50pub fn record_hnsw_eviction(count: u64, now_nanos: u64) {
51    HNSW_EVICTIONS_TOTAL.fetch_add(count, Ordering::Relaxed);
52    HNSW_LAST_EVICTION_AT_NANOS.store(now_nanos, Ordering::Relaxed);
53    let r = registry();
54    r.hnsw_evictions_total.inc_by(count);
55    // IntGauge value is i64; nanos can exceed i64::MAX in ~292 years
56    // past the UNIX epoch — saturating clamp keeps the gauge in range
57    // for any plausible operator timeline.
58    #[allow(clippy::cast_possible_wrap)]
59    let nanos_i64 = i64::try_from(now_nanos).unwrap_or(i64::MAX);
60    r.hnsw_last_eviction_at_nanos.set(nanos_i64);
61}
62
63/// Cumulative HNSW oldest-eviction count since process start. Reads
64/// from the process-local atomic; the same value is scrape-visible at
65/// `/metrics` as `ai_memory_hnsw_evictions_total`.
66#[must_use]
67pub fn hnsw_evictions_total() -> u64 {
68    HNSW_EVICTIONS_TOTAL.load(Ordering::Relaxed)
69}
70
71/// Wall-clock UNIX nanoseconds of the most recent HNSW eviction (0 if
72/// none have occurred this process). Reads from the process-local
73/// atomic; the same value is scrape-visible at `/metrics` as
74/// `ai_memory_hnsw_last_eviction_at_nanos`.
75#[must_use]
76pub fn hnsw_last_eviction_at_nanos() -> u64 {
77    HNSW_LAST_EVICTION_AT_NANOS.load(Ordering::Relaxed)
78}
79
80/// Reset the HNSW eviction counters. Test-only: production callers
81/// must never reach into the counter directly. The Prometheus
82/// monotonic-counter discipline does NOT permit decrement, so the
83/// scrape-side `ai_memory_hnsw_evictions_total` retains its
84/// cumulative value across the reset — only the process-local
85/// atomics (used by `hnsw_evictions_total()`, `evicted_recently`,
86/// and `memory_stats`) drop back to zero. This asymmetry is
87/// deliberate: `/metrics` scrapes are time-series consumers that
88/// expect monotonic counters; the in-process reset is a unit-test
89/// affordance.
90#[doc(hidden)]
91pub fn reset_hnsw_eviction_counters_for_test() {
92    HNSW_EVICTIONS_TOTAL.store(0, Ordering::Relaxed);
93    HNSW_LAST_EVICTION_AT_NANOS.store(0, Ordering::Relaxed);
94    // Mirror the gauge reset so scrape-side `last_eviction_at_nanos`
95    // also flips back to 0 (gauges, unlike counters, may decrement).
96    registry().hnsw_last_eviction_at_nanos.set(0);
97}
98
/// Handles to the registered metric families. Built once on first access
/// via `registry()`.
///
/// Fields are public so call sites in `handlers.rs`, future
/// `subscriptions.rs`, and the test module can `.inc()` / `.observe()` /
/// `.set()` directly. `#[allow(dead_code)]` covers the handles that
/// aren't wired to a caller yet — they surface in `/metrics` output
/// (see the `render_includes_registered_names` test) and will be
/// instrumented as sibling features land (hnsw gauge via the HNSW
/// module, subscriptions gauge via the webhook PR, webhook counters
/// via the dispatch path, etc.).
///
/// Every handle is a clone of the collector that `Metrics::try_new`
/// registers with `registry`; the metric names quoted on the fields
/// below are the names passed to the constructors there.
#[allow(dead_code)]
pub struct Metrics {
    /// The `Registry` that `Metrics::try_new` creates and registers
    /// every handle below with.
    pub registry: Registry,
    /// `ai_memory_store_total` — total `memory_store` calls, labeled by
    /// `tier` and `result`.
    pub store_total: IntCounterVec,
    /// `ai_memory_recall_total` — total `memory_recall` calls, labeled
    /// by `mode`.
    pub recall_total: IntCounterVec,
    /// `ai_memory_recall_latency_seconds` — recall latency in seconds,
    /// labeled by `mode`; bucket bounds run from 1 ms to 5 s.
    pub recall_latency_seconds: HistogramVec,
    /// `ai_memory_autonomy_hook_total` — post-store autonomy hook
    /// invocations, labeled by `kind` and `result`.
    pub autonomy_hook_total: IntCounterVec,
    /// `ai_memory_contradiction_detected_total` — count of
    /// contradictions the LLM hook confirmed.
    pub contradiction_detected_total: IntCounter,
    /// `ai_memory_webhook_dispatched_total` — total webhook deliveries
    /// attempted.
    pub webhook_dispatched_total: IntCounter,
    /// `ai_memory_webhook_failed_total` — webhook deliveries that
    /// failed after all retries.
    pub webhook_failed_total: IntCounter,
    /// `ai_memory_memories` — current count of non-archived memories.
    pub memories_gauge: IntGauge,
    /// `ai_memory_hnsw_size` — current HNSW vector index population.
    pub hnsw_size_gauge: IntGauge,
    /// `ai_memory_subscriptions_active` — current count of active
    /// webhook subscriptions.
    pub subscriptions_active_gauge: IntGauge,
    /// `ai_memory_curator_cycles_total` — total curator sweep cycles
    /// completed.
    pub curator_cycles_total: IntCounter,
    /// `ai_memory_curator_operations_total` — curator operations,
    /// labeled by `kind` (`auto_tag` | `contradiction` | `persist`) and
    /// `result`.
    pub curator_operations_total: IntCounterVec,
    /// `ai_memory_curator_cycle_duration_seconds` — curator sweep cycle
    /// wall-clock duration, labeled by `dry_run`; bucket bounds run
    /// from 0.1 s up to `crate::SECS_PER_HOUR`.
    pub curator_cycle_duration_seconds: HistogramVec,
    /// Ultrareview #343: count of post-quorum fanout tasks whose
    /// outcome could not be observed (shutdown, panic, or the
    /// spawned task erred). Non-zero indicates mesh divergence risk.
    /// Labeled by `reason` (`shutdown` | `panic` | `join_error`).
    pub federation_fanout_dropped_total: IntCounterVec,
    /// S40 (v0.6.2 Patch 2): count of peer POST retries, labeled by
    /// final outcome. `ok` = retry recovered the row; `fail` = both
    /// attempts failed (peer likely truly down); `id_drift` = retry
    /// observed the same peer id-drift as attempt 1.
    pub federation_fanout_retry_total: IntCounterVec,
    /// H9 (v0.7.0 round-2): count of quorum writes that the leader
    /// returned `200` for (W met) but where at least one configured
    /// peer did NOT ack inside the deadline. Operators alert on
    /// non-zero rate to detect mesh-divergence drift early — before a
    /// follow-up catchup sync surfaces the gap.
    pub federation_partial_quorum_total: IntCounter,
    /// Cluster-A COR-3 (v0.7.0): count of memory rows whose Form 4
    /// fact-provenance JSON columns (`citations`, `source_span`,
    /// `confidence_signals`, or pre-Form-4 `metadata`) failed to parse
    /// and were silently defaulted by `row_to_memory`. Non-zero
    /// indicates schema drift, writer-side corruption, or a
    /// migration that left malformed JSON in the column. Labeled by
    /// column name (`citations` | `source_span` | `confidence_signals`
    /// | `metadata`).
    pub corrupt_provenance_rows_total: IntCounterVec,
    /// v0.7-polish SEC-15 / COR-11 (issue #780): count of
    /// `post_reflect.auto_export` detached worker invocations whose
    /// outcome was a panic or a returned `Err`. Non-zero means an
    /// operator-opted-in namespace had a reflection that did NOT
    /// land on the filesystem and the failure would otherwise be
    /// silent (the worker thread is detached; the reflection itself
    /// already committed). The capabilities-v3 surface mirrors this
    /// counter so operator dashboards can alert without scraping
    /// `/metrics` directly.
    pub auto_export_spawn_failed_total: IntCounter,
    /// v0.7.0 Track D #933 — current depth of the federation push
    /// DLQ (`federation_push_dlq` table, `WHERE replayed_at IS NULL`).
    /// Refreshed on every tick of the `replay_federation_push_dlq`
    /// worker spawned alongside the catchup loop. Operators alert on
    /// non-zero sustained depth — a healthy mesh should drain back
    /// to 0 within one replay interval after the peer recovers.
    pub federation_push_dlq_depth: IntGauge,

    /// #1032 (HIGH, 2026-05-21) — monotonic counter for DLQ rows the
    /// replay worker has marked as quarantined (`attempt_count >=
    /// MAX_REPLAY_ATTEMPTS`). Pre-#1032 the replay loop retried
    /// poison messages forever; now rows past the ceiling are
    /// skipped + this counter increments per quarantined row per
    /// tick (the row stays in the DLQ until an operator drains it
    /// via `ai-memory federation dlq drain --quarantined`). Operators
    /// alert on non-zero increment rate — a healthy mesh should have
    /// zero rows reaching the quarantine threshold.
    ///
    /// Registered as `ai_memory_federation_push_dlq_quarantined_total`.
    /// NOTE(review): the registered help text says the attempt count
    /// "exceeded" the ceiling while this doc says `>=` — confirm which
    /// comparison the replay worker actually uses.
    pub federation_push_dlq_quarantined: IntCounter,

    /// pm-v3.1 PR8 (issue #1174) — cumulative HNSW oldest-eviction
    /// count since process start. Replaces the prior process-global
    /// `AtomicU64` `INDEX_EVICTIONS_TOTAL` in `src/hnsw.rs`.
    /// Non-zero means the in-memory vector index has hit
    /// `MAX_ENTRIES` and dropped older embeddings; recall quality
    /// may have degraded for evicted ids until they are re-inserted
    /// (e.g. on next access via the `recall` touch path). Surfaces in
    /// `memory_capabilities` (`hnsw.evictions_total`), `/metrics`
    /// (`ai_memory_hnsw_evictions_total`), and `memory_stats`.
    /// Incremented by `record_hnsw_eviction`, which also bumps the
    /// process-local `HNSW_EVICTIONS_TOTAL` atomic in this module.
    pub hnsw_evictions_total: IntCounter,

    /// pm-v3.1 PR8 (issue #1174) — wall-clock UNIX nanoseconds of the
    /// most recent HNSW eviction (0 if none have occurred). Replaces
    /// the prior process-global `AtomicU64` `LAST_EVICTION_AT_NANOS`
    /// in `src/hnsw.rs`. Capabilities derives `hnsw.evicted_recently`
    /// from this with a 60s rolling window. Surfaced as an `IntGauge`
    /// so the value is also readable via Prometheus scraping.
    /// Set by `record_hnsw_eviction` (saturating at `i64::MAX`) and
    /// zeroed by `reset_hnsw_eviction_counters_for_test`.
    pub hnsw_last_eviction_at_nanos: IntGauge,

    /// #1253 (MED, 2026-05-25) — monotonic counter for subscription
    /// DLQ insert attempts that were refused because the per-
    /// subscription DLQ depth had already hit
    /// [`crate::subscriptions::MAX_SUBSCRIPTION_DLQ_ROWS`]. Non-zero
    /// means a hostile (or simply-broken) webhook target is failing
    /// every delivery and would otherwise fill the operator's disk
    /// with quarantined rows. Each refused insert pairs with a
    /// `tracing::warn!` so operators see the subscription id + correlation
    /// id of the dropped row.
    pub subscription_dlq_overflow_total: IntCounter,

    /// FED-P4-e (federation-identity-at-scale §8) — federation
    /// credential-verification outcomes on the receiver path, labeled
    /// `result` (`ok` | `fail`). The verify-failure-rate SLO is
    /// `fail / (ok + fail)`. A non-zero sustained fail rate means peers
    /// are presenting credentials the local trust bundle cannot verify
    /// — an expired leaf, a revoked issuer, a clock-skew window, or a
    /// chain that fails to anchor. Healthy meshes hold this at 0 once
    /// every peer's issuer key is enrolled in the bundle.
    pub federation_cred_verify_total: IntCounterVec,

    /// FED-P4-e (federation-identity-at-scale §8) — inbound federation
    /// requests bucketed by whether they presented a signed credential
    /// at all, labeled `presence` (`signed` | `unsigned`). The
    /// signed-vs-unsigned-ratio SLO is `signed / (signed + unsigned)`.
    /// During a rollout this climbs from 0 toward 1 as peers upgrade to
    /// credential-presenting builds; operators gate the flip of
    /// `AI_MEMORY_FED_REQUIRE_PEER_ENROLLMENT` to the secure default on
    /// this ratio reaching 1.0 across the fleet.
    pub federation_inbound_cred_total: IntCounterVec,

    /// FED-P4-e (federation-identity-at-scale §8) — age in seconds of
    /// the local outbound leaf credential (now − `issued_at`),
    /// refreshed on every renewal tick. The max-cred-age SLO alerts
    /// when this approaches the leaf TTL
    /// ([`crate::federation::identity::issuer::DEFAULT_CREDENTIAL_TTL_SECS`])
    /// — a credential that ages past its TTL without a renewal means
    /// the refresh worker has stalled and outbound sync will start
    /// failing peer verification.
    pub federation_cred_max_age_seconds: IntGauge,

    /// FED-P4-e (federation-identity-at-scale §8) — seconds since the
    /// last successful outbound-credential renewal (now − last-renew
    /// wall clock), refreshed on every renewal tick. The renewal-lag
    /// SLO alerts when this exceeds the configured refresh interval by
    /// a safety margin: a healthy worker re-renews well inside the leaf
    /// TTL, so a lag larger than the interval means renewals are
    /// silently failing (bad CA reachability, key-load fault) even
    /// though the worker thread is still alive.
    pub federation_renewal_lag_seconds: IntGauge,
}
249
250/// Lazily-built process-global metrics handle.
251pub fn registry() -> &'static Metrics {
252    static HANDLE: OnceLock<Metrics> = OnceLock::new();
253    HANDLE.get_or_init(Metrics::new_or_panic)
254}
255
256impl Metrics {
257    fn new_or_panic() -> Self {
258        // Registration can only fail on duplicate-name conflict; with a
259        // fresh registry that's unreachable. Panic is acceptable because
260        // the metrics subsystem is a daemon-startup concern — a failure
261        // here means a programming bug, not a runtime condition.
262        Self::try_new().expect("prometheus registry init failed")
263    }
264
265    // COVERAGE: every `?` Err-arm closure on `IntCounterVec::new(...)?`,
266    //           `IntCounter::new(...)?`, `IntGauge::new(...)?`,
267    //           `HistogramVec::new(...)?`, and
268    //           `registry.register(Box::new(...))?` in this function
269    //           is structurally unreachable in production:
270    //
271    //           1. The function constructs a fresh `Registry::new()`
272    //              per call (no shared state). Registration can only
273    //              fail on duplicate metric name; with a fresh registry
274    //              and unique names per counter, collision is
275    //              impossible.
276    //           2. Every metric name + label name passed to the
277    //              constructors is a compile-time string literal that
278    //              already matches the Prometheus regex
279    //              `[a-zA-Z_:][a-zA-Z0-9_:]*` — construction cannot
280    //              fail on name-validation grounds.
281    //
282    //           The Err-arms exist because the prometheus crate's
283    //           API returns `Result<...>` from these constructors, and
284    //           the `?` propagation is the idiomatic Rust pattern.
285    //           Triggering coverage would require a synthetic
286    //           registry-injection layer that doesn't exist (and
287    //           shouldn't — try_new owns its registry by design).
288    //           Documented per L0.7 playbook §3c.
289    #[allow(clippy::too_many_lines)]
290    pub(crate) fn try_new() -> prometheus::Result<Self> {
291        let registry = Registry::new();
292
293        let store_total = IntCounterVec::new(
294            prometheus::Opts::new(
295                "ai_memory_store_total",
296                "Total memory_store calls, labeled by tier and result.",
297            ),
298            &["tier", "result"],
299        )?;
300        registry.register(Box::new(store_total.clone()))?;
301
302        let recall_total = IntCounterVec::new(
303            prometheus::Opts::new(
304                "ai_memory_recall_total",
305                "Total memory_recall calls, labeled by mode.",
306            ),
307            &["mode"],
308        )?;
309        registry.register(Box::new(recall_total.clone()))?;
310
311        let recall_latency_seconds = HistogramVec::new(
312            HistogramOpts::new(
313                "ai_memory_recall_latency_seconds",
314                "Recall latency in seconds, labeled by mode.",
315            )
316            .buckets(vec![
317                0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0,
318            ]),
319            &["mode"],
320        )?;
321        registry.register(Box::new(recall_latency_seconds.clone()))?;
322
323        let autonomy_hook_total = IntCounterVec::new(
324            prometheus::Opts::new(
325                "ai_memory_autonomy_hook_total",
326                "Post-store autonomy hook invocations, labeled by kind and result.",
327            ),
328            &["kind", "result"],
329        )?;
330        registry.register(Box::new(autonomy_hook_total.clone()))?;
331
332        let contradiction_detected_total = IntCounter::new(
333            "ai_memory_contradiction_detected_total",
334            "Count of contradictions the LLM hook confirmed.",
335        )?;
336        registry.register(Box::new(contradiction_detected_total.clone()))?;
337
338        let webhook_dispatched_total = IntCounter::new(
339            "ai_memory_webhook_dispatched_total",
340            "Total webhook deliveries attempted.",
341        )?;
342        registry.register(Box::new(webhook_dispatched_total.clone()))?;
343
344        let webhook_failed_total = IntCounter::new(
345            "ai_memory_webhook_failed_total",
346            "Webhook deliveries that failed after all retries.",
347        )?;
348        registry.register(Box::new(webhook_failed_total.clone()))?;
349
350        let memories_gauge = IntGauge::new(
351            "ai_memory_memories",
352            "Current count of non-archived memories.",
353        )?;
354        registry.register(Box::new(memories_gauge.clone()))?;
355
356        let hnsw_size_gauge = IntGauge::new(
357            "ai_memory_hnsw_size",
358            "Current HNSW vector index population.",
359        )?;
360        registry.register(Box::new(hnsw_size_gauge.clone()))?;
361
362        let subscriptions_active_gauge = IntGauge::new(
363            "ai_memory_subscriptions_active",
364            "Current count of active webhook subscriptions.",
365        )?;
366        registry.register(Box::new(subscriptions_active_gauge.clone()))?;
367
368        let curator_cycles_total = IntCounter::new(
369            "ai_memory_curator_cycles_total",
370            "Total curator sweep cycles completed.",
371        )?;
372        registry.register(Box::new(curator_cycles_total.clone()))?;
373
374        let curator_operations_total = IntCounterVec::new(
375            prometheus::Opts::new(
376                "ai_memory_curator_operations_total",
377                "Curator operations, labeled by kind (auto_tag|contradiction|persist) and result.",
378            ),
379            &["kind", "result"],
380        )?;
381        registry.register(Box::new(curator_operations_total.clone()))?;
382
383        let curator_cycle_duration_seconds = HistogramVec::new(
384            HistogramOpts::new(
385                "ai_memory_curator_cycle_duration_seconds",
386                "Curator sweep cycle wall-clock duration, labeled by dry_run.",
387            )
388            .buckets(vec![
389                0.1,
390                0.5,
391                1.0,
392                5.0,
393                15.0,
394                60.0,
395                300.0,
396                900.0,
397                crate::SECS_PER_HOUR as f64,
398            ]),
399            &["dry_run"],
400        )?;
401        registry.register(Box::new(curator_cycle_duration_seconds.clone()))?;
402
403        let federation_fanout_dropped_total = IntCounterVec::new(
404            prometheus::Opts::new(
405                "ai_memory_federation_fanout_dropped_total",
406                "Post-quorum fanout tasks whose outcome could not be observed. \
407                 reason=shutdown|panic|join_error. Non-zero indicates mesh divergence risk.",
408            ),
409            &["reason"],
410        )?;
411        registry.register(Box::new(federation_fanout_dropped_total.clone()))?;
412
413        let federation_fanout_retry_total = IntCounterVec::new(
414            prometheus::Opts::new(
415                "ai_memory_federation_fanout_retry_total",
416                "Peer POSTs that hit a transient failure on first attempt and \
417                 were retried once via the Idempotency-Key path. \
418                 outcome=ok|fail|id_drift. Non-zero ok indicates the retry \
419                 recovered a row that would otherwise be missing on a peer.",
420            ),
421            &["outcome"],
422        )?;
423        registry.register(Box::new(federation_fanout_retry_total.clone()))?;
424
425        // H9 (v0.7.0 round-2) — partial-quorum observability.
426        let federation_partial_quorum_total = IntCounter::new(
427            "ai_memory_federation_partial_quorum_total",
428            "Quorum writes that succeeded (W met) but where at least one \
429             configured peer did not ack inside the deadline.",
430        )?;
431        registry.register(Box::new(federation_partial_quorum_total.clone()))?;
432
433        // Cluster-A COR-3 (v0.7.0) — corrupt-provenance observability.
434        let corrupt_provenance_rows_total = IntCounterVec::new(
435            prometheus::Opts::new(
436                "ai_memory_corrupt_provenance_rows_total",
437                "Memory rows whose Form 4 fact-provenance JSON columns \
438                 failed to deserialise and were silently defaulted. \
439                 Non-zero indicates schema drift, writer-side corruption, \
440                 or a migration leaving malformed JSON.",
441            ),
442            &["column"],
443        )?;
444        registry.register(Box::new(corrupt_provenance_rows_total.clone()))?;
445
446        // v0.7-polish SEC-15 / COR-11 (issue #780) — auto-export
447        // detached-worker failure observability.
448        let auto_export_spawn_failed_total = IntCounter::new(
449            "ai_memory_auto_export_spawn_failed_total",
450            "Detached post_reflect.auto_export worker invocations whose \
451             outcome was a panic or returned Err. Non-zero means at \
452             least one reflection was committed to the DB but its \
453             on-disk markdown/json artefact did not land — operators \
454             use this to alert on otherwise-silent disk-write failures.",
455        )?;
456        registry.register(Box::new(auto_export_spawn_failed_total.clone()))?;
457
458        // v0.7.0 Track D #933 — federation push DLQ depth gauge.
459        let federation_push_dlq_depth = IntGauge::new(
460            "ai_memory_federation_push_dlq_depth",
461            "Current count of pending federation_push_dlq rows \
462             (replayed_at IS NULL). Refreshed on every replay tick. \
463             Non-zero sustained depth indicates one or more peers are \
464             persistently unreachable; healthy meshes drain back to 0 \
465             within one replay interval after peer recovery.",
466        )?;
467        registry.register(Box::new(federation_push_dlq_depth.clone()))?;
468
469        // #1032 (HIGH, 2026-05-21) — federation push DLQ quarantine counter.
470        let federation_push_dlq_quarantined = IntCounter::new(
471            "ai_memory_federation_push_dlq_quarantined_total",
472            "Monotonic counter of federation_push_dlq rows the replay \
473             worker has skipped because their attempt_count exceeded \
474             MAX_REPLAY_ATTEMPTS (currently 100). Non-zero sustained \
475             rate indicates poison-message rows that need operator \
476             intervention via `ai-memory federation dlq drain \
477             --quarantined`. Pre-#1032 the worker retried these \
478             forever, amplifying network load against rejecting peers.",
479        )?;
480        registry.register(Box::new(federation_push_dlq_quarantined.clone()))?;
481
482        // pm-v3.1 PR8 (issue #1174) — HNSW eviction observability moved
483        // from process-global atomics in `src/hnsw.rs` into the metrics
484        // registry. The counter mirrors `INDEX_EVICTIONS_TOTAL`; the
485        // gauge mirrors `LAST_EVICTION_AT_NANOS` as a UNIX-nanosecond
486        // wall-clock timestamp (0 if no eviction has occurred). Both
487        // are surfaced at `/metrics` so the eviction signal is
488        // scrape-visible without going through `memory_stats`.
489        let hnsw_evictions_total = IntCounter::new(
490            "ai_memory_hnsw_evictions_total",
491            "Cumulative HNSW oldest-eviction count since process start. \
492             Non-zero indicates the in-memory vector index has hit \
493             MAX_ENTRIES and dropped older embeddings; recall quality \
494             may have degraded for evicted ids until they are \
495             re-inserted on next access.",
496        )?;
497        registry.register(Box::new(hnsw_evictions_total.clone()))?;
498
499        let hnsw_last_eviction_at_nanos = IntGauge::new(
500            "ai_memory_hnsw_last_eviction_at_nanos",
501            "Wall-clock UNIX nanoseconds of the most recent HNSW \
502             eviction (0 if none). Capabilities derives \
503             hnsw.evicted_recently from this with a 60s rolling window.",
504        )?;
505        registry.register(Box::new(hnsw_last_eviction_at_nanos.clone()))?;
506
507        // #1253 (MED, 2026-05-25) — subscription DLQ overflow counter.
508        let subscription_dlq_overflow_total = IntCounter::new(
509            "ai_memory_subscription_dlq_overflow_total",
510            "Monotonic counter of subscription_dlq inserts refused \
511             because the per-subscription DLQ depth had already hit \
512             MAX_SUBSCRIPTION_DLQ_ROWS (10_000). Non-zero indicates a \
513             hostile or persistently-broken webhook target that would \
514             otherwise fill the operator's disk with quarantined rows. \
515             Operators drain the queue via `ai-memory subscription dlq \
516             drain <subscription_id>` before resetting.",
517        )?;
518        registry.register(Box::new(subscription_dlq_overflow_total.clone()))?;
519
520        // FED-P4-e (federation-identity-at-scale §8) — federation
521        // identity SLO surfaces: verify-failure-rate, signed-vs-unsigned
522        // ratio, max cred age, renewal lag.
523        let federation_cred_verify_total = IntCounterVec::new(
524            prometheus::Opts::new(
525                "ai_memory_federation_cred_verify_total",
526                "Federation credential-verification outcomes on the \
527                 receiver path, labeled result (ok|fail). \
528                 verify-failure-rate SLO = fail / (ok + fail). Non-zero \
529                 sustained fail rate means peers present credentials the \
530                 local trust bundle cannot verify (expired leaf, revoked \
531                 issuer, clock skew, or a chain that fails to anchor).",
532            ),
533            &["result"],
534        )?;
535        registry.register(Box::new(federation_cred_verify_total.clone()))?;
536
537        let federation_inbound_cred_total = IntCounterVec::new(
538            prometheus::Opts::new(
539                "ai_memory_federation_inbound_cred_total",
540                "Inbound federation requests bucketed by whether they \
541                 presented a signed credential, labeled presence \
542                 (signed|unsigned). signed-vs-unsigned-ratio SLO = \
543                 signed / (signed + unsigned). Climbs toward 1.0 as \
544                 peers upgrade to credential-presenting builds.",
545            ),
546            &["presence"],
547        )?;
548        registry.register(Box::new(federation_inbound_cred_total.clone()))?;
549
550        let federation_cred_max_age_seconds = IntGauge::new(
551            "ai_memory_federation_cred_max_age_seconds",
552            "Age in seconds of the local outbound leaf credential \
553             (now - issued_at), refreshed on every renewal tick. \
554             max-cred-age SLO alerts when this approaches the leaf TTL \
555             — a credential aging past its TTL without a renewal means \
556             the refresh worker has stalled and outbound sync will \
557             start failing peer verification.",
558        )?;
559        registry.register(Box::new(federation_cred_max_age_seconds.clone()))?;
560
561        let federation_renewal_lag_seconds = IntGauge::new(
562            "ai_memory_federation_renewal_lag_seconds",
563            "Seconds since the last successful outbound-credential \
564             renewal (now - last-renew wall clock), refreshed on every \
565             renewal tick. renewal-lag SLO alerts when this exceeds the \
566             configured refresh interval by a safety margin: a lag \
567             larger than the interval means renewals are silently \
568             failing even though the worker thread is still alive.",
569        )?;
570        registry.register(Box::new(federation_renewal_lag_seconds.clone()))?;
571
572        Ok(Self {
573            registry,
574            store_total,
575            recall_total,
576            recall_latency_seconds,
577            autonomy_hook_total,
578            contradiction_detected_total,
579            webhook_dispatched_total,
580            webhook_failed_total,
581            memories_gauge,
582            hnsw_size_gauge,
583            subscriptions_active_gauge,
584            curator_cycles_total,
585            curator_operations_total,
586            curator_cycle_duration_seconds,
587            federation_fanout_dropped_total,
588            federation_fanout_retry_total,
589            federation_partial_quorum_total,
590            corrupt_provenance_rows_total,
591            auto_export_spawn_failed_total,
592            federation_push_dlq_depth,
593            federation_push_dlq_quarantined,
594            hnsw_evictions_total,
595            hnsw_last_eviction_at_nanos,
596            subscription_dlq_overflow_total,
597            federation_cred_verify_total,
598            federation_inbound_cred_total,
599            federation_cred_max_age_seconds,
600            federation_renewal_lag_seconds,
601        })
602    }
603}
604
605/// #1253 (MED, 2026-05-25) — record one subscription_dlq insert that
606/// was refused because the per-subscription DLQ already held
607/// [`crate::subscriptions::MAX_SUBSCRIPTION_DLQ_ROWS`] rows. Pairs
608/// with a `tracing::warn!` at the call site so operators see the
609/// subscription id + correlation id of the dropped row.
610pub fn record_subscription_dlq_overflow() {
611    registry().subscription_dlq_overflow_total.inc();
612}
613
614/// #1253 (MED, 2026-05-25) — read the current value of the
615/// subscription DLQ overflow counter. Test-only accessor for the
616/// regression that pins this cap.
617#[must_use]
618pub fn subscription_dlq_overflow_count() -> u64 {
619    registry().subscription_dlq_overflow_total.get()
620}
621
622/// FED-P4-e (federation-identity-at-scale §8) — record one federation
623/// credential-verification outcome on the receiver path. `ok = true`
624/// means the presented credential (or chain leaf) verified against the
625/// local trust bundle; `ok = false` means it was rejected. Feeds the
626/// verify-failure-rate SLO.
627pub fn record_federation_cred_verify(ok: bool) {
628    let result = if ok { "ok" } else { "fail" };
629    registry()
630        .federation_cred_verify_total
631        .with_label_values(&[result])
632        .inc();
633}
634
635/// FED-P4-e — read the federation credential-verify counter for a given
636/// outcome (`ok` | `fail`). Test-only accessor for the SLO regression.
637#[must_use]
638pub fn federation_cred_verify_count(result: &str) -> u64 {
639    registry()
640        .federation_cred_verify_total
641        .with_label_values(&[result])
642        .get()
643}
644
645/// FED-P4-e (federation-identity-at-scale §8) — record one inbound
646/// federation request bucketed by whether it presented a signed
647/// credential. `signed = true` means a credential header was present
648/// (regardless of verify outcome); `false` means the peer sent no
649/// credential. Feeds the signed-vs-unsigned-ratio SLO.
650pub fn record_federation_inbound_cred(signed: bool) {
651    let presence = if signed { "signed" } else { "unsigned" };
652    registry()
653        .federation_inbound_cred_total
654        .with_label_values(&[presence])
655        .inc();
656}
657
658/// FED-P4-e — read the inbound-credential presence counter for a given
659/// bucket (`signed` | `unsigned`). Test-only accessor for the SLO
660/// regression.
661#[must_use]
662pub fn federation_inbound_cred_count(presence: &str) -> u64 {
663    registry()
664        .federation_inbound_cred_total
665        .with_label_values(&[presence])
666        .get()
667}
668
669/// FED-P4-e (federation-identity-at-scale §8) — set the age in seconds
670/// of the local outbound leaf credential (now − `issued_at`). Called on
671/// every renewal tick. Feeds the max-cred-age SLO.
672pub fn set_federation_cred_max_age_seconds(secs: i64) {
673    registry().federation_cred_max_age_seconds.set(secs);
674}
675
676/// FED-P4-e (federation-identity-at-scale §8) — set the seconds elapsed
677/// since the last successful outbound-credential renewal. Called on
678/// every renewal tick. Feeds the renewal-lag SLO.
679pub fn set_federation_renewal_lag_seconds(secs: i64) {
680    registry().federation_renewal_lag_seconds.set(secs);
681}
682
683/// Cluster-A COR-3 (v0.7.0) — record a single corrupt-provenance row
684/// observation. `column` is the offending JSON column name
685/// (`citations` / `source_span` / `confidence_signals` / `metadata`).
686/// Pairs with a `tracing::warn!` at the call site so operators see the
687/// row id + parse error.
688pub fn record_corrupt_provenance(column: &str) {
689    registry()
690        .corrupt_provenance_rows_total
691        .with_label_values(&[column])
692        .inc();
693}
694
695/// v0.7-polish SEC-15 / COR-11 (issue #780) — record one detached
696/// `auto_export` worker failure (panic OR returned `Err`). Pairs with
697/// a `tracing::warn!` at the call site so operators see the
698/// reflection id + failure mode. The counter is also mirrored onto the
699/// capabilities-v3 `hooks.auto_export_spawn_failed_total` field so
700/// dashboards that consume `memory_capabilities` (vs `/metrics`) see
701/// the same signal.
702pub fn record_auto_export_spawn_failed() {
703    registry().auto_export_spawn_failed_total.inc();
704}
705
706/// v0.7-polish SEC-15 / COR-11 (issue #780) — read the current value
707/// of the auto-export spawn-failure counter. Used by the
708/// capabilities-v3 builder to mirror the metric onto the
709/// `hooks.auto_export_spawn_failed_total` field without scraping
710/// `/metrics`.
711#[must_use]
712pub fn auto_export_spawn_failed_count() -> u64 {
713    registry().auto_export_spawn_failed_total.get()
714}
715
716/// Render the current registry state to the Prometheus text exposition
717/// format. Ignores errors from the encoder (unreachable in practice) and
718/// returns an empty string — the scrape returns 200 with a possibly-empty
719/// body rather than a 5xx, which Prometheus handles gracefully.
720#[must_use]
721pub fn render() -> String {
722    let encoder = TextEncoder::new();
723    let mut buf = Vec::new();
724    let _ = encoder.encode(&registry().registry.gather(), &mut buf);
725    String::from_utf8(buf).unwrap_or_default()
726}
727
728/// Convenience: record a store, labeled by tier.
729#[allow(dead_code)]
730pub fn record_store(tier: &str, ok: bool) {
731    let result = if ok { "ok" } else { "err" };
732    registry()
733        .store_total
734        .with_label_values(&[tier, result])
735        .inc();
736}
737
738/// Convenience: record a recall, labeled by mode + latency.
739#[allow(dead_code)]
740pub fn record_recall(mode: &str, latency_seconds: f64) {
741    registry().recall_total.with_label_values(&[mode]).inc();
742    registry()
743        .recall_latency_seconds
744        .with_label_values(&[mode])
745        .observe(latency_seconds);
746}
747
748/// Convenience: record an autonomy-hook invocation.
749#[allow(dead_code)]
750pub fn record_autonomy_hook(kind: &str, ok: bool) {
751    let result = if ok { "ok" } else { "err" };
752    registry()
753        .autonomy_hook_total
754        .with_label_values(&[kind, result])
755        .inc();
756}
757
758/// Convenience: record a completed curator cycle (v0.6.1).
759#[allow(dead_code)]
760pub fn curator_cycle_completed(
761    operations_attempted: usize,
762    auto_tagged: usize,
763    contradictions_found: usize,
764    errors: usize,
765) {
766    let r = registry();
767    r.curator_cycles_total.inc();
768    if auto_tagged > 0 {
769        r.curator_operations_total
770            .with_label_values(&["auto_tag", "ok"])
771            .inc_by(auto_tagged as u64);
772    }
773    if contradictions_found > 0 {
774        r.curator_operations_total
775            .with_label_values(&["contradiction", "ok"])
776            .inc_by(contradictions_found as u64);
777    }
778    let failed = operations_attempted.saturating_sub(auto_tagged + contradictions_found);
779    if failed > 0 || errors > 0 {
780        r.curator_operations_total
781            .with_label_values(&["any", "err"])
782            .inc_by(errors as u64);
783    }
784}
785
#[cfg(test)]
mod tests {
    //! Unit tests for the metrics helpers.
    //!
    //! Most tests share the process-wide singleton returned by
    //! `registry()` and the harness runs them in parallel, so:
    //!
    //! * counter assertions are written as `after >= before + 1` (another
    //!   test may bump the same counter concurrently), and
    //! * tests that write the federation credential gauges take
    //!   [`lock_federation_gauges`] so an exact-value `assert_eq!` cannot
    //!   be clobbered by a concurrent writer in this module.

    use std::sync::{Mutex, MutexGuard, PoisonError};

    use super::*;
    use crate::models::Tier;

    /// Serialises the tests in this module that write the singleton's
    /// `federation_cred_max_age_seconds` / `federation_renewal_lag_seconds`
    /// gauges. Gauges are last-writer-wins, so without this a parallel
    /// `set(0)` could land between another test's `set(1234)` and its
    /// `assert_eq!`.
    static FEDERATION_GAUGE_LOCK: Mutex<()> = Mutex::new(());

    /// Acquire [`FEDERATION_GAUGE_LOCK`], recovering from poisoning so one
    /// failed test does not cascade into unrelated lock panics.
    fn lock_federation_gauges() -> MutexGuard<'static, ()> {
        FEDERATION_GAUGE_LOCK
            .lock()
            .unwrap_or_else(PoisonError::into_inner)
    }

    #[test]
    fn registry_is_singleton() {
        let r1 = registry();
        let r2 = registry();
        // Same instance — no double-registration.
        assert!(std::ptr::eq(std::ptr::from_ref(r1), std::ptr::from_ref(r2)));
    }

    #[test]
    fn render_includes_registered_names() {
        // Tickle every series so each one has ≥1 sample.
        record_store(Tier::Short.as_str(), true);
        record_recall("hybrid", 0.042);
        record_autonomy_hook("auto_tag", true);
        registry().contradiction_detected_total.inc();
        registry().webhook_dispatched_total.inc();
        registry().memories_gauge.set(42);
        registry().hnsw_size_gauge.set(42);
        registry().subscriptions_active_gauge.set(3);
        registry().federation_push_dlq_depth.set(0);
        // FED-P4-e — federation identity SLO surfaces.
        record_federation_cred_verify(true);
        record_federation_inbound_cred(true);
        {
            // Held only for the gauge writes so the exact-value gauge
            // test cannot observe these zeros mid-assertion.
            let _gauges = lock_federation_gauges();
            set_federation_cred_max_age_seconds(0);
            set_federation_renewal_lag_seconds(0);
        }

        let text = render();
        for name in [
            "ai_memory_store_total",
            "ai_memory_recall_total",
            "ai_memory_recall_latency_seconds",
            "ai_memory_autonomy_hook_total",
            "ai_memory_contradiction_detected_total",
            "ai_memory_webhook_dispatched_total",
            "ai_memory_webhook_failed_total",
            "ai_memory_memories",
            "ai_memory_hnsw_size",
            "ai_memory_subscriptions_active",
            // v0.7.0 Track D #933 — federation push DLQ depth gauge.
            "ai_memory_federation_push_dlq_depth",
            // FED-P4-e — federation identity SLO surfaces (§8).
            "ai_memory_federation_cred_verify_total",
            "ai_memory_federation_inbound_cred_total",
            "ai_memory_federation_cred_max_age_seconds",
            "ai_memory_federation_renewal_lag_seconds",
        ] {
            assert!(text.contains(name), "/metrics missing {name}\n\n{text}");
        }
    }

    #[test]
    fn federation_cred_verify_labels_outcome() {
        let before_ok = federation_cred_verify_count("ok");
        let before_fail = federation_cred_verify_count("fail");
        record_federation_cred_verify(true);
        record_federation_cred_verify(false);
        assert!(federation_cred_verify_count("ok") >= before_ok + 1);
        assert!(federation_cred_verify_count("fail") >= before_fail + 1);
        let text = render();
        assert!(text.contains("ai_memory_federation_cred_verify_total{result=\"ok\"}"));
        assert!(text.contains("ai_memory_federation_cred_verify_total{result=\"fail\"}"));
    }

    #[test]
    fn federation_inbound_cred_labels_presence() {
        let before_signed = federation_inbound_cred_count("signed");
        let before_unsigned = federation_inbound_cred_count("unsigned");
        record_federation_inbound_cred(true);
        record_federation_inbound_cred(false);
        assert!(federation_inbound_cred_count("signed") >= before_signed + 1);
        assert!(federation_inbound_cred_count("unsigned") >= before_unsigned + 1);
    }

    #[test]
    fn federation_cred_age_and_lag_gauges_settable() {
        // Exact-value assertions on shared gauges: hold the lock across
        // the write *and* the read-back.
        let _gauges = lock_federation_gauges();
        set_federation_cred_max_age_seconds(1234);
        set_federation_renewal_lag_seconds(56);
        assert_eq!(registry().federation_cred_max_age_seconds.get(), 1234);
        assert_eq!(registry().federation_renewal_lag_seconds.get(), 56);
    }

    #[test]
    fn record_store_labels_tier() {
        record_store(Tier::Long.as_str(), true);
        let text = render();
        assert!(text.contains("ai_memory_store_total{result=\"ok\",tier=\"long\"}"));
    }

    // ---- Wave 3 (Closer T): tests for `curator_cycle_completed` and the
    // webhook_dispatched/_failed counter labels.

    #[test]
    fn curator_cycle_completed_increments_total() {
        // Other tests running in parallel may bump the same singleton
        // counter; what we own is the +1 contributed by *this* call.
        let before = registry().curator_cycles_total.get();
        curator_cycle_completed(0, 0, 0, 0);
        let after = registry().curator_cycles_total.get();
        assert!(
            after >= before + 1,
            "curator_cycles_total did not advance (before={before}, after={after})"
        );
    }

    #[test]
    fn curator_cycle_completed_records_auto_tag_ok() {
        curator_cycle_completed(5, 3, 0, 0);
        let text = render();
        assert!(
            text.contains("ai_memory_curator_operations_total"),
            "curator_operations_total counter missing from /metrics output"
        );
    }

    #[test]
    fn curator_cycle_completed_records_contradiction_ok() {
        curator_cycle_completed(2, 0, 2, 0);
        let text = render();
        assert!(text.contains("ai_memory_curator_operations_total"));
    }

    #[test]
    fn curator_cycle_completed_records_errors() {
        // operations_attempted=5, auto_tagged=2, contradictions=1 → failed=2
        // plus errors=1 → the err counter is exercised.
        curator_cycle_completed(5, 2, 1, 1);
        let text = render();
        assert!(text.contains("ai_memory_curator_operations_total"));
    }

    #[test]
    fn curator_cycle_completed_with_zero_args_is_safe() {
        // No labels emitted, no panic — a zero cycle is valid (empty DB).
        let before = registry().curator_cycles_total.get();
        curator_cycle_completed(0, 0, 0, 0);
        let after = registry().curator_cycles_total.get();
        // Same race-tolerant assertion as above.
        assert!(after >= before + 1);
    }

    // -----------------------------------------------------------------
    // W12-H — additional helpers + render shape pinning
    // -----------------------------------------------------------------

    #[test]
    fn record_store_err_path() {
        record_store(Tier::Short.as_str(), false);
        let text = render();
        assert!(text.contains("ai_memory_store_total{result=\"err\",tier=\"short\""));
    }

    #[test]
    fn record_recall_emits_latency_histogram() {
        record_recall("keyword", 0.5);
        let text = render();
        assert!(text.contains("ai_memory_recall_total{mode=\"keyword\""));
        assert!(text.contains("ai_memory_recall_latency_seconds"));
    }

    #[test]
    fn record_autonomy_hook_err_path() {
        record_autonomy_hook("contradiction", false);
        let text = render();
        assert!(
            text.contains("ai_memory_autonomy_hook_total{kind=\"contradiction\",result=\"err\"")
        );
    }

    #[test]
    fn render_emits_help_and_type_lines() {
        // Tickle one series, then render and assert prom-format HELP/TYPE lines.
        record_store(Tier::Mid.as_str(), true);
        let text = render();
        assert!(text.contains("# HELP ai_memory_store_total"));
        assert!(text.contains("# TYPE ai_memory_store_total counter"));
    }

    #[test]
    fn fanout_dropped_counter_increments() {
        registry()
            .federation_fanout_dropped_total
            .with_label_values(&["shutdown"])
            .inc();
        let text = render();
        assert!(text.contains("ai_memory_federation_fanout_dropped_total{reason=\"shutdown\""));
    }

    #[test]
    fn fanout_retry_counter_outcome_labels() {
        // All three outcome labels exercised — `ok`, `fail`, `id_drift`.
        for outcome in ["ok", "fail", "id_drift"] {
            registry()
                .federation_fanout_retry_total
                .with_label_values(&[outcome])
                .inc();
        }
        let text = render();
        assert!(text.contains("ai_memory_federation_fanout_retry_total"));
    }

    #[test]
    fn curator_cycle_duration_histogram_buckets() {
        // Just observe — confirms registry accepts the value and surfaces
        // the histogram in /metrics output.
        registry()
            .curator_cycle_duration_seconds
            .with_label_values(&["false"])
            .observe(0.42);
        let text = render();
        assert!(text.contains("ai_memory_curator_cycle_duration_seconds"));
    }

    // -----------------------------------------------------------------
    // L0.7-2 Tier A — exercise try_new() directly so the metric-builder
    // happy paths in the `try_new` body get covered. The process
    // singleton registry() builds once on first access; we need a second
    // pass for line coverage of every metric registration.
    // -----------------------------------------------------------------

    #[test]
    fn try_new_builds_a_fresh_metrics_handle() {
        // Build a second instance on top of an independent registry —
        // hits every metric-construction line in `try_new` even when
        // another test has already initialised the process-wide
        // singleton. Each call uses a fresh Registry, so register()
        // cannot collide.
        let m = super::Metrics::try_new().expect("fresh registry must succeed");
        // The handle must expose every metric family — touch each to
        // exercise the assignment side of the struct literal.
        m.store_total
            .with_label_values(&[Tier::Short.as_str(), "ok"])
            .inc();
        m.recall_total.with_label_values(&["hybrid"]).inc();
        m.recall_latency_seconds
            .with_label_values(&["hybrid"])
            .observe(0.001);
        m.autonomy_hook_total.with_label_values(&["x", "ok"]).inc();
        m.contradiction_detected_total.inc();
        m.webhook_dispatched_total.inc();
        m.webhook_failed_total.inc();
        m.memories_gauge.set(1);
        m.hnsw_size_gauge.set(1);
        m.subscriptions_active_gauge.set(1);
        m.curator_cycles_total.inc();
        m.curator_operations_total
            .with_label_values(&["auto_tag", "ok"])
            .inc();
        m.curator_cycle_duration_seconds
            .with_label_values(&["true"])
            .observe(1.0);
        m.federation_fanout_dropped_total
            .with_label_values(&["panic"])
            .inc();
        m.federation_fanout_retry_total
            .with_label_values(&["ok"])
            .inc();
        m.federation_partial_quorum_total.inc();
        m.auto_export_spawn_failed_total.inc();
    }

    #[test]
    fn try_new_can_build_two_isolated_registries() {
        // Two consecutive try_new() calls succeed because each builds
        // its own Registry — no name collision.
        let a = super::Metrics::try_new().expect("first");
        let b = super::Metrics::try_new().expect("second");
        // Tickle a counter on each so the family surfaces in gather().
        a.store_total
            .with_label_values(&[Tier::Short.as_str(), "ok"])
            .inc();
        b.store_total
            .with_label_values(&[Tier::Short.as_str(), "ok"])
            .inc();
        let mut buf_a = Vec::new();
        let mut buf_b = Vec::new();
        let enc = TextEncoder::new();
        enc.encode(&a.registry.gather(), &mut buf_a).unwrap();
        enc.encode(&b.registry.gather(), &mut buf_b).unwrap();
        assert!(String::from_utf8_lossy(&buf_a).contains("ai_memory_store_total"));
        assert!(String::from_utf8_lossy(&buf_b).contains("ai_memory_store_total"));
    }

    #[test]
    fn record_auto_export_spawn_failed_increments_singleton() {
        // v0.7-polish #780 — record_auto_export_spawn_failed() must
        // monotonically advance the process-wide counter that the
        // capabilities-v3 builder mirrors onto
        // `hooks.auto_export_spawn_failed_total`.
        let before = auto_export_spawn_failed_count();
        record_auto_export_spawn_failed();
        let after = auto_export_spawn_failed_count();
        assert!(
            after >= before + 1,
            "auto_export_spawn_failed_total did not advance \
             (before={before}, after={after})"
        );
        // The render text must mention the metric name so /metrics
        // scrapers see it.
        let text = render();
        assert!(
            text.contains("ai_memory_auto_export_spawn_failed_total"),
            "/metrics output missing auto_export counter\n\n{text}"
        );
    }

    #[test]
    fn curator_cycle_completed_no_progress_branch_skips_err_increment() {
        // operations_attempted=0, auto_tagged=0, contradictions=0,
        // errors=0 → failed = 0.saturating_sub(0+0) = 0 → the `if
        // failed > 0 || errors > 0` block does NOT execute. This drives
        // the negative branch; only the cycle counter is asserted
        // because the err series is shared with parallel tests.
        let before = registry().curator_cycles_total.get();
        curator_cycle_completed(0, 0, 0, 0);
        let after = registry().curator_cycles_total.get();
        assert!(after >= before + 1);
    }
}