use std::sync::OnceLock;
use prometheus::{
Encoder, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, Registry,
TextEncoder,
};
/// Handles to every Prometheus collector this module registers, together with
/// the registry that owns them.
///
/// Built by `Metrics::try_new` and handed out as a shared `&'static` reference
/// by `registry()`.
// NOTE(review): the blanket `dead_code` allowance presumably exists because not
// every field is read in every build configuration — confirm before removing.
#[allow(dead_code)]
pub struct Metrics {
/// Registry every collector below is registered with; `render()` gathers from it.
pub registry: Registry,
/// `ai_memory_store_total`: memory_store calls, labeled by `tier` and `result`.
pub store_total: IntCounterVec,
/// `ai_memory_recall_total`: memory_recall calls, labeled by `mode`.
pub recall_total: IntCounterVec,
/// `ai_memory_recall_latency_seconds`: recall latency in seconds, labeled by
/// `mode`, with explicit buckets from 0.001 up to 5.0.
pub recall_latency_seconds: HistogramVec,
/// `ai_memory_autonomy_hook_total`: post-store autonomy hook invocations,
/// labeled by `kind` and `result`.
pub autonomy_hook_total: IntCounterVec,
/// `ai_memory_contradiction_detected_total`: contradictions the LLM hook confirmed.
pub contradiction_detected_total: IntCounter,
/// `ai_memory_webhook_dispatched_total`: webhook deliveries attempted.
pub webhook_dispatched_total: IntCounter,
/// `ai_memory_webhook_failed_total`: webhook deliveries that failed after all retries.
pub webhook_failed_total: IntCounter,
/// `ai_memory_memories`: current count of non-archived memories.
pub memories_gauge: IntGauge,
/// `ai_memory_hnsw_size`: current HNSW vector index population.
pub hnsw_size_gauge: IntGauge,
/// `ai_memory_subscriptions_active`: current count of active webhook subscriptions.
pub subscriptions_active_gauge: IntGauge,
/// `ai_memory_curator_cycles_total`: curator sweep cycles completed.
pub curator_cycles_total: IntCounter,
/// `ai_memory_curator_operations_total`: curator operations, labeled by `kind`
/// and `result`.
pub curator_operations_total: IntCounterVec,
/// `ai_memory_curator_cycle_duration_seconds`: curator sweep wall-clock
/// duration, labeled by `dry_run`, with explicit buckets from 0.1 up to 3600.0.
pub curator_cycle_duration_seconds: HistogramVec,
/// `ai_memory_federation_fanout_dropped_total`: post-quorum fanout tasks whose
/// outcome could not be observed, labeled by `reason`.
pub federation_fanout_dropped_total: IntCounterVec,
/// `ai_memory_federation_fanout_retry_total`: peer POSTs retried once after a
/// transient first-attempt failure, labeled by `outcome`.
pub federation_fanout_retry_total: IntCounterVec,
}
/// Returns the shared [`Metrics`] instance, constructing it on the first call.
///
/// Every later call yields a reference to that same instance.
///
/// # Panics
///
/// Panics on the first call if building or registering any collector fails.
pub fn registry() -> &'static Metrics {
    // Function-local static: the only way to reach the instance is through
    // this accessor.
    static INSTANCE: OnceLock<Metrics> = OnceLock::new();

    INSTANCE.get_or_init(Metrics::new_or_panic)
}
impl Metrics {
fn new_or_panic() -> Self {
Self::try_new().expect("prometheus registry init failed")
}
#[allow(clippy::too_many_lines)]
fn try_new() -> prometheus::Result<Self> {
let registry = Registry::new();
let store_total = IntCounterVec::new(
prometheus::Opts::new(
"ai_memory_store_total",
"Total memory_store calls, labeled by tier and result.",
),
&["tier", "result"],
)?;
registry.register(Box::new(store_total.clone()))?;
let recall_total = IntCounterVec::new(
prometheus::Opts::new(
"ai_memory_recall_total",
"Total memory_recall calls, labeled by mode.",
),
&["mode"],
)?;
registry.register(Box::new(recall_total.clone()))?;
let recall_latency_seconds = HistogramVec::new(
HistogramOpts::new(
"ai_memory_recall_latency_seconds",
"Recall latency in seconds, labeled by mode.",
)
.buckets(vec![
0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0,
]),
&["mode"],
)?;
registry.register(Box::new(recall_latency_seconds.clone()))?;
let autonomy_hook_total = IntCounterVec::new(
prometheus::Opts::new(
"ai_memory_autonomy_hook_total",
"Post-store autonomy hook invocations, labeled by kind and result.",
),
&["kind", "result"],
)?;
registry.register(Box::new(autonomy_hook_total.clone()))?;
let contradiction_detected_total = IntCounter::new(
"ai_memory_contradiction_detected_total",
"Count of contradictions the LLM hook confirmed.",
)?;
registry.register(Box::new(contradiction_detected_total.clone()))?;
let webhook_dispatched_total = IntCounter::new(
"ai_memory_webhook_dispatched_total",
"Total webhook deliveries attempted.",
)?;
registry.register(Box::new(webhook_dispatched_total.clone()))?;
let webhook_failed_total = IntCounter::new(
"ai_memory_webhook_failed_total",
"Webhook deliveries that failed after all retries.",
)?;
registry.register(Box::new(webhook_failed_total.clone()))?;
let memories_gauge = IntGauge::new(
"ai_memory_memories",
"Current count of non-archived memories.",
)?;
registry.register(Box::new(memories_gauge.clone()))?;
let hnsw_size_gauge = IntGauge::new(
"ai_memory_hnsw_size",
"Current HNSW vector index population.",
)?;
registry.register(Box::new(hnsw_size_gauge.clone()))?;
let subscriptions_active_gauge = IntGauge::new(
"ai_memory_subscriptions_active",
"Current count of active webhook subscriptions.",
)?;
registry.register(Box::new(subscriptions_active_gauge.clone()))?;
let curator_cycles_total = IntCounter::new(
"ai_memory_curator_cycles_total",
"Total curator sweep cycles completed.",
)?;
registry.register(Box::new(curator_cycles_total.clone()))?;
let curator_operations_total = IntCounterVec::new(
prometheus::Opts::new(
"ai_memory_curator_operations_total",
"Curator operations, labeled by kind (auto_tag|contradiction|persist) and result.",
),
&["kind", "result"],
)?;
registry.register(Box::new(curator_operations_total.clone()))?;
let curator_cycle_duration_seconds = HistogramVec::new(
HistogramOpts::new(
"ai_memory_curator_cycle_duration_seconds",
"Curator sweep cycle wall-clock duration, labeled by dry_run.",
)
.buckets(vec![0.1, 0.5, 1.0, 5.0, 15.0, 60.0, 300.0, 900.0, 3600.0]),
&["dry_run"],
)?;
registry.register(Box::new(curator_cycle_duration_seconds.clone()))?;
let federation_fanout_dropped_total = IntCounterVec::new(
prometheus::Opts::new(
"ai_memory_federation_fanout_dropped_total",
"Post-quorum fanout tasks whose outcome could not be observed. \
reason=shutdown|panic|join_error. Non-zero indicates mesh divergence risk.",
),
&["reason"],
)?;
registry.register(Box::new(federation_fanout_dropped_total.clone()))?;
let federation_fanout_retry_total = IntCounterVec::new(
prometheus::Opts::new(
"ai_memory_federation_fanout_retry_total",
"Peer POSTs that hit a transient failure on first attempt and \
were retried once via the Idempotency-Key path. \
outcome=ok|fail|id_drift. Non-zero ok indicates the retry \
recovered a row that would otherwise be missing on a peer.",
),
&["outcome"],
)?;
registry.register(Box::new(federation_fanout_retry_total.clone()))?;
Ok(Self {
registry,
store_total,
recall_total,
recall_latency_seconds,
autonomy_hook_total,
contradiction_detected_total,
webhook_dispatched_total,
webhook_failed_total,
memories_gauge,
hnsw_size_gauge,
subscriptions_active_gauge,
curator_cycles_total,
curator_operations_total,
curator_cycle_duration_seconds,
federation_fanout_dropped_total,
federation_fanout_retry_total,
})
}
}
#[must_use]
pub fn render() -> String {
let encoder = TextEncoder::new();
let mut buf = Vec::new();
let _ = encoder.encode(®istry().registry.gather(), &mut buf);
String::from_utf8(buf).unwrap_or_default()
}
/// Counts one store call on `store_total`.
///
/// `tier` is used verbatim as the first label value; `ok` selects the second
/// (`"ok"` when true, `"err"` otherwise).
#[allow(dead_code)]
pub fn record_store(tier: &str, ok: bool) {
    let labels = [tier, if ok { "ok" } else { "err" }];
    registry().store_total.with_label_values(&labels).inc();
}
#[allow(dead_code)]
pub fn record_recall(mode: &str, latency_seconds: f64) {
registry().recall_total.with_label_values(&[mode]).inc();
registry()
.recall_latency_seconds
.with_label_values(&[mode])
.observe(latency_seconds);
}
/// Counts one autonomy hook invocation on `autonomy_hook_total`.
///
/// `kind` is used verbatim as the first label value; `ok` selects the second
/// (`"ok"` when true, `"err"` otherwise).
#[allow(dead_code)]
pub fn record_autonomy_hook(kind: &str, ok: bool) {
    let labels = [kind, if ok { "ok" } else { "err" }];
    registry()
        .autonomy_hook_total
        .with_label_values(&labels)
        .inc();
}
/// Records the outcome of one curator cycle.
///
/// Always bumps `curator_cycles_total`. On `curator_operations_total` it then
/// adds `auto_tagged` to the `auto_tag`/`ok` series and `contradictions_found`
/// to the `contradiction`/`ok` series (each only when non-zero), and adds
/// `errors` to the `any`/`err` series whenever `errors` is non-zero or
/// `operations_attempted` exceeds the two success counts combined.
#[allow(dead_code)]
pub fn curator_cycle_completed(
    operations_attempted: usize,
    auto_tagged: usize,
    contradictions_found: usize,
    errors: usize,
) {
    let metrics = registry();
    let operations = &metrics.curator_operations_total;

    metrics.curator_cycles_total.inc();

    let successes = [
        ("auto_tag", auto_tagged),
        ("contradiction", contradictions_found),
    ];
    for (kind, count) in successes {
        if count > 0 {
            operations
                .with_label_values(&[kind, "ok"])
                .inc_by(count as u64);
        }
    }

    // Attempts not accounted for by either success counter.
    let unaccounted = operations_attempted.saturating_sub(auto_tagged + contradictions_found);
    if unaccounted > 0 || errors > 0 {
        // NOTE(review): the increment is `errors`, not `unaccounted`, so a cycle
        // with unaccounted attempts but zero reported errors adds 0 here.
        // Presumably intentional — confirm.
        operations
            .with_label_values(&["any", "err"])
            .inc_by(errors as u64);
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Every test goes through the same `registry()` instance, and several of
    // them bump the same counters. Counter assertions therefore compare
    // before/after deltas with `>=` instead of pinning absolute values.

    /// Two calls to `registry()` must hand back the very same instance.
    #[test]
    fn registry_is_singleton() {
        let r1 = registry();
        let r2 = registry();
        assert!(std::ptr::eq(std::ptr::from_ref(r1), std::ptr::from_ref(r2)));
    }

    /// After touching the labeled families, every listed family name shows up
    /// in the rendered output.
    #[test]
    fn render_includes_registered_names() {
        record_store("short", true);
        record_recall("hybrid", 0.042);
        record_autonomy_hook("auto_tag", true);
        let metrics = registry();
        metrics.contradiction_detected_total.inc();
        metrics.webhook_dispatched_total.inc();
        metrics.memories_gauge.set(42);
        metrics.hnsw_size_gauge.set(42);
        metrics.subscriptions_active_gauge.set(3);

        let text = render();
        for name in [
            "ai_memory_store_total",
            "ai_memory_recall_total",
            "ai_memory_recall_latency_seconds",
            "ai_memory_autonomy_hook_total",
            "ai_memory_contradiction_detected_total",
            "ai_memory_webhook_dispatched_total",
            "ai_memory_webhook_failed_total",
            "ai_memory_memories",
            "ai_memory_hnsw_size",
            "ai_memory_subscriptions_active",
        ] {
            assert!(text.contains(name), "/metrics missing {name}\n\n{text}");
        }
    }

    #[test]
    fn record_store_labels_tier() {
        record_store("long", true);
        let text = render();
        assert!(text.contains("ai_memory_store_total{result=\"ok\",tier=\"long\"}"));
    }

    #[test]
    fn curator_cycle_completed_increments_total() {
        let before = registry().curator_cycles_total.get();
        curator_cycle_completed(0, 0, 0, 0);
        let after = registry().curator_cycles_total.get();
        assert!(
            after >= before + 1,
            "curator_cycles_total did not advance (before={before}, after={after})"
        );
    }

    /// The `auto_tag`/`ok` series must grow by the reported count and be rendered.
    #[test]
    fn curator_cycle_completed_records_auto_tag_ok() {
        let series = registry()
            .curator_operations_total
            .with_label_values(&["auto_tag", "ok"]);
        let before = series.get();
        curator_cycle_completed(5, 3, 0, 0);
        let after = series.get();
        assert!(
            after >= before + 3,
            "auto_tag/ok did not advance by 3 (before={before}, after={after})"
        );

        let text = render();
        assert!(
            text.contains("ai_memory_curator_operations_total{kind=\"auto_tag\",result=\"ok\""),
            "auto_tag/ok series missing from /metrics output\n\n{text}"
        );
    }

    /// The `contradiction`/`ok` series must grow by the reported count and be rendered.
    #[test]
    fn curator_cycle_completed_records_contradiction_ok() {
        let series = registry()
            .curator_operations_total
            .with_label_values(&["contradiction", "ok"]);
        let before = series.get();
        curator_cycle_completed(2, 0, 2, 0);
        let after = series.get();
        assert!(
            after >= before + 2,
            "contradiction/ok did not advance by 2 (before={before}, after={after})"
        );

        let text = render();
        assert!(
            text.contains(
                "ai_memory_curator_operations_total{kind=\"contradiction\",result=\"ok\""
            ),
            "contradiction/ok series missing from /metrics output\n\n{text}"
        );
    }

    /// The `any`/`err` series must grow by the reported error count and be rendered.
    #[test]
    fn curator_cycle_completed_records_errors() {
        let series = registry()
            .curator_operations_total
            .with_label_values(&["any", "err"]);
        let before = series.get();
        curator_cycle_completed(5, 2, 1, 1);
        let after = series.get();
        assert!(
            after >= before + 1,
            "any/err did not advance by 1 (before={before}, after={after})"
        );

        let text = render();
        assert!(
            text.contains("ai_memory_curator_operations_total{kind=\"any\",result=\"err\""),
            "any/err series missing from /metrics output\n\n{text}"
        );
    }

    #[test]
    fn curator_cycle_completed_with_zero_args_is_safe() {
        let before = registry().curator_cycles_total.get();
        curator_cycle_completed(0, 0, 0, 0);
        let after = registry().curator_cycles_total.get();
        assert!(after >= before + 1);
    }

    #[test]
    fn record_store_err_path() {
        record_store("short", false);
        let text = render();
        assert!(text.contains("ai_memory_store_total{result=\"err\",tier=\"short\""));
    }

    #[test]
    fn record_recall_emits_latency_histogram() {
        record_recall("keyword", 0.5);
        let text = render();
        assert!(text.contains("ai_memory_recall_total{mode=\"keyword\""));
        assert!(text.contains("ai_memory_recall_latency_seconds"));
    }

    #[test]
    fn record_autonomy_hook_err_path() {
        record_autonomy_hook("contradiction", false);
        let text = render();
        assert!(
            text.contains("ai_memory_autonomy_hook_total{kind=\"contradiction\",result=\"err\"")
        );
    }

    #[test]
    fn render_emits_help_and_type_lines() {
        record_store("mid", true);
        let text = render();
        assert!(text.contains("# HELP ai_memory_store_total"));
        assert!(text.contains("# TYPE ai_memory_store_total counter"));
    }

    #[test]
    fn fanout_dropped_counter_increments() {
        registry()
            .federation_fanout_dropped_total
            .with_label_values(&["shutdown"])
            .inc();
        let text = render();
        assert!(text.contains("ai_memory_federation_fanout_dropped_total{reason=\"shutdown\""));
    }

    /// Each documented outcome value must produce its own rendered series.
    #[test]
    fn fanout_retry_counter_outcome_labels() {
        let outcomes = ["ok", "fail", "id_drift"];
        for outcome in outcomes {
            registry()
                .federation_fanout_retry_total
                .with_label_values(&[outcome])
                .inc();
        }

        let text = render();
        for outcome in outcomes {
            let series = format!("ai_memory_federation_fanout_retry_total{{outcome=\"{outcome}\"");
            assert!(
                text.contains(&series),
                "/metrics missing {series}\n\n{text}"
            );
        }
    }

    /// An observation must be counted and the per-bucket series rendered.
    #[test]
    fn curator_cycle_duration_histogram_buckets() {
        let histogram = registry()
            .curator_cycle_duration_seconds
            .with_label_values(&["false"]);
        let before = histogram.get_sample_count();
        histogram.observe(0.42);
        let after = histogram.get_sample_count();
        assert!(
            after >= before + 1,
            "sample count did not advance (before={before}, after={after})"
        );

        let text = render();
        assert!(
            text.contains("ai_memory_curator_cycle_duration_seconds_bucket{dry_run=\"false\""),
            "bucket series missing from /metrics output\n\n{text}"
        );
    }
}