use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
use axum::{body::Body, extract::State, http::StatusCode, response::Response};
use prometheus_client::{
encoding::text::encode,
metrics::{
counter::Counter,
family::Family,
gauge::Gauge,
histogram::{exponential_buckets, Histogram},
},
registry::Registry,
};
/// Label set shared by the HTTP request counter and the HTTP duration histogram.
///
/// Fields are encoded as labels through the derived `EncodeLabelSet`
/// implementation; the declaration order below is kept as-is.
#[derive(Debug, Clone, PartialEq, Eq, Hash, prometheus_client::encoding::EncodeLabelSet)]
pub struct HttpReqLabels {
    /// Request method, stored as owned text.
    pub method: String,
    /// Request path. The `&'static str` type only admits strings with a static
    /// lifetime — presumably route templates rather than raw URLs (TODO confirm).
    pub path: &'static str,
    /// Numeric status attached to the request (presumably the HTTP response code).
    pub status: u16,
}
/// Label set carrying a single `tenant` label.
///
/// Used by the per-tenant queue gauges of `AppMetrics` and by
/// `AppMetrics::observe_tenant`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, prometheus_client::encoding::EncodeLabelSet)]
pub struct TenantLabel {
    /// Tenant identifier, stored as owned text.
    pub tenant: String,
}
/// Label set for the authentication-failure counter of `AppMetrics`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, prometheus_client::encoding::EncodeLabelSet)]
pub struct AuthFailLabel {
    /// Failure reason; restricted to strings with a static lifetime.
    pub reason: &'static str,
}
/// Label set for the curator-decision counter of `AppMetrics`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, prometheus_client::encoding::EncodeLabelSet)]
pub struct CuratorActionLabel {
    /// Action name; restricted to strings with a static lifetime.
    pub action: &'static str,
}
/// Label set for the LLM backend call counter of `AppMetrics`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, prometheus_client::encoding::EncodeLabelSet)]
pub struct LlmBackendLabel {
    /// Backend name; restricted to strings with a static lifetime.
    pub backend: &'static str,
    /// Call outcome; restricted to strings with a static lifetime.
    pub outcome: &'static str,
}
/// Handles to every metric exposed by the service, plus the registry they are
/// registered in.
///
/// The type is `Clone`; `registry` and `tenant_count` sit behind `Arc`, so
/// clones share them. `new` registers a clone of each metric handle in the
/// registry and keeps the original here.
// NOTE(review): `dead_code` is allowed for the whole struct — presumably because
// some fields are not read yet; confirm and narrow the attribute when possible.
#[allow(dead_code)]
#[derive(Clone)]
pub struct AppMetrics {
    /// Registry encoded by `metrics_handler`.
    pub registry: Arc<Registry>,
    /// Registered as `gradatum_http_requests`.
    pub http_requests: Family<HttpReqLabels, Counter>,
    /// Registered as `gradatum_http_request_duration_seconds`.
    pub http_duration: Family<HttpReqLabels, Histogram>,
    /// Registered as `gradatum_queue_depth`, one gauge per tenant label.
    pub queue_depth: Family<TenantLabel, Gauge>,
    /// Registered as `gradatum_queue_lag_seconds`, one gauge per tenant label.
    pub queue_lag: Family<TenantLabel, Gauge>,
    /// Registered as `gradatum_auth_failures`, one counter per reason.
    pub auth_failures: Family<AuthFailLabel, Counter>,
    /// Registered as `gradatum_revocation_store_size`.
    pub revocation_size: Gauge,
    /// Registered as `gradatum_curator_decisions`, one counter per action.
    pub curator_decisions: Family<CuratorActionLabel, Counter>,
    /// Registered as `gradatum_llm_backend_calls`, one counter per backend/outcome pair.
    pub llm_calls: Family<LlmBackendLabel, Counter>,
    /// Registered as `gradatum_event_log_rows`.
    pub event_log_rows: Gauge,
    /// Number of tenant slots handed out so far by `observe_tenant`.
    tenant_count: Arc<AtomicUsize>,
    /// Upper bound on `tenant_count`; `new` sets it to 100.
    cap: usize,
}
impl AppMetrics {
    /// Creates every metric, registers a clone of each one in a fresh
    /// [`Registry`], and returns the handles together with that registry.
    ///
    /// The tenant counter starts at 0 and the tenant cap is fixed at 100.
    pub fn new() -> Self {
        let http_requests: Family<HttpReqLabels, Counter> = Family::default();
        // Histograms need an explicit constructor because their buckets are not
        // defaultable: 10 exponential buckets starting at 0.001 with factor 2.
        let http_duration: Family<HttpReqLabels, Histogram> =
            Family::new_with_constructor(|| Histogram::new(exponential_buckets(0.001, 2.0, 10)));
        let queue_depth: Family<TenantLabel, Gauge> = Family::default();
        let queue_lag: Family<TenantLabel, Gauge> = Family::default();
        let auth_failures: Family<AuthFailLabel, Counter> = Family::default();
        let revocation_size: Gauge = Gauge::default();
        let curator_decisions: Family<CuratorActionLabel, Counter> = Family::default();
        let llm_calls: Family<LlmBackendLabel, Counter> = Family::default();
        let event_log_rows: Gauge = Gauge::default();

        let mut registry = Registry::default();
        registry.register(
            "gradatum_http_requests",
            "Nombre total de requêtes HTTP reçues",
            http_requests.clone(),
        );
        registry.register(
            "gradatum_http_request_duration_seconds",
            "Durée des requêtes HTTP en secondes",
            http_duration.clone(),
        );
        registry.register(
            "gradatum_queue_depth",
            "Profondeur de la file d'écriture par tenant",
            queue_depth.clone(),
        );
        registry.register(
            "gradatum_queue_lag_seconds",
            "Décalage de la file d'écriture en secondes par tenant",
            queue_lag.clone(),
        );
        registry.register(
            "gradatum_auth_failures",
            "Nombre d'échecs d'authentification par raison",
            auth_failures.clone(),
        );
        registry.register(
            "gradatum_revocation_store_size",
            "Nombre d'entrées dans le store de révocation",
            revocation_size.clone(),
        );
        registry.register(
            "gradatum_curator_decisions",
            "Décisions curator par action (stub T11)",
            curator_decisions.clone(),
        );
        registry.register(
            "gradatum_llm_backend_calls",
            "Appels LLM backend par backend+outcome (stub T11)",
            llm_calls.clone(),
        );
        registry.register(
            "gradatum_event_log_rows",
            "Nombre de lignes courantes dans event_log (mis à jour par la tâche de rétention)",
            event_log_rows.clone(),
        );

        Self {
            registry: Arc::new(registry),
            http_requests,
            http_duration,
            queue_depth,
            queue_lag,
            auth_failures,
            revocation_size,
            curator_decisions,
            llm_calls,
            event_log_rows,
            tenant_count: Arc::new(AtomicUsize::new(0)),
            cap: 100,
        }
    }

    /// Reserves one tenant slot and hands `label` back if a slot was available.
    ///
    /// Returns `Some(label)` while fewer than `cap` slots have been taken.
    /// Once the cap is reached, a warning is logged and `None` is returned so
    /// the caller can skip recording a per-tenant series.
    ///
    /// Every accepted call consumes a slot: the counter does not deduplicate
    /// tenant values, so callers are expected to invoke this once per new
    /// tenant rather than once per observation.
    #[allow(dead_code)]
    pub fn observe_tenant(&self, label: TenantLabel) -> Option<TenantLabel> {
        let cap = self.cap;
        // The check and the increment must be a single atomic step. A separate
        // `load` followed by `fetch_add` lets concurrent callers all pass the
        // check and push the counter past the cap. `Relaxed` is enough because
        // the counter does not guard any other memory.
        let reserved = self
            .tenant_count
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |taken| {
                (taken < cap).then_some(taken + 1)
            });

        if reserved.is_err() {
            tracing::warn!(
                tenant = %label.tenant,
                cap = self.cap,
                "cardinality cap atteint, label tenant ignoré"
            );
            return None;
        }
        Some(label)
    }
}
impl Default for AppMetrics {
fn default() -> Self {
Self::new()
}
}
pub async fn metrics_handler(State(m): State<AppMetrics>) -> Result<Response, StatusCode> {
let mut buf = String::new();
encode(&mut buf, &m.registry).map_err(|e| {
tracing::error!(error = %e, "échec encodage métriques Prometheus");
StatusCode::INTERNAL_SERVER_ERROR
})?;
Response::builder()
.header(
"Content-Type",
"application/openmetrics-text; version=1.0.0; charset=utf-8",
)
.body(Body::from(buf))
.map_err(|e| {
tracing::error!(error = %e, "échec construction réponse /metrics");
StatusCode::INTERNAL_SERVER_ERROR
})
}
/// Serves `GET /metrics` on `bind`, using `m` as the handler state.
///
/// The function awaits the server itself rather than spawning a task, so it
/// only returns once `axum::serve` completes.
///
/// # Errors
///
/// - `bind` is not a loopback address (checked before any socket is opened);
/// - the TCP listener cannot be bound;
/// - `axum::serve` returns an error.
pub async fn spawn_metrics_listener(
    bind: std::net::SocketAddr,
    m: AppMetrics,
) -> anyhow::Result<()> {
    use axum::{routing::get, Router};

    // Refuse anything but loopback up front.
    let is_local = bind.ip().is_loopback();
    if !is_local {
        anyhow::bail!(
            "metrics listener doit être loopback (caveat C7) : adresse refusée = {}",
            bind
        );
    }

    let router = Router::new()
        .route("/metrics", get(metrics_handler))
        .with_state(m);

    let socket = tokio::net::TcpListener::bind(bind).await?;
    tracing::info!(addr = %bind, "metrics listener en écoute");

    axum::serve(socket, router).await?;
    Ok(())
}