forest/metrics/
mod.rs

1// Copyright 2019-2025 ChainSafe Systems
2// SPDX-License-Identifier: Apache-2.0, MIT
3
4pub mod db;
5
6use crate::{db::DBStatistics, networks::ChainConfig, shim::clock::ChainEpoch};
7use axum::{Router, http::StatusCode, response::IntoResponse, routing::get};
8use parking_lot::{RwLock, RwLockWriteGuard};
9use prometheus_client::{
10    collector::Collector,
11    encoding::EncodeLabelSet,
12    metrics::{
13        counter::Counter,
14        family::Family,
15        histogram::{Histogram, exponential_buckets},
16    },
17};
18use std::sync::{Arc, LazyLock};
19use std::{path::PathBuf, time::Instant};
20use tokio::net::TcpListener;
21use tower_http::compression::CompressionLayer;
22use tracing::warn;
23
24static DEFAULT_REGISTRY: LazyLock<RwLock<prometheus_client::registry::Registry>> =
25    LazyLock::new(Default::default);
26
27static COLLECTOR_REGISTRY: LazyLock<RwLock<prometheus_client::registry::Registry>> =
28    LazyLock::new(Default::default);
29
30pub fn default_registry<'a>() -> RwLockWriteGuard<'a, prometheus_client::registry::Registry> {
31    DEFAULT_REGISTRY.write()
32}
33
34pub fn collector_registry<'a>() -> RwLockWriteGuard<'a, prometheus_client::registry::Registry> {
35    COLLECTOR_REGISTRY.write()
36}
37
38pub fn register_collector(collector: Box<dyn Collector>) {
39    #[allow(clippy::disallowed_methods)]
40    collector_registry().register_collector(collector)
41}
42
43pub fn reset_collector_registry() {
44    *collector_registry() = Default::default();
45}
46
47pub static LRU_CACHE_HIT: LazyLock<Family<KindLabel, Counter>> = LazyLock::new(|| {
48    let metric = Family::default();
49    DEFAULT_REGISTRY
50        .write()
51        .register("lru_cache_hit", "Stats of lru cache hit", metric.clone());
52    metric
53});
54pub static LRU_CACHE_MISS: LazyLock<Family<KindLabel, Counter>> = LazyLock::new(|| {
55    let metric = Family::default();
56    DEFAULT_REGISTRY
57        .write()
58        .register("lru_cache_miss", "Stats of lru cache miss", metric.clone());
59    metric
60});
61
62pub static RPC_METHOD_FAILURE: LazyLock<Family<RpcMethodLabel, Counter>> = LazyLock::new(|| {
63    let metric = Family::default();
64    DEFAULT_REGISTRY.write().register(
65        "rpc_method_failure",
66        "Number of failed RPC calls",
67        metric.clone(),
68    );
69    metric
70});
71
72pub static RPC_METHOD_TIME: LazyLock<Family<RpcMethodLabel, Histogram>> = LazyLock::new(|| {
73    let metric = Family::<RpcMethodLabel, Histogram>::new_with_constructor(|| {
74        // Histogram with 5 buckets starting from 0.1ms going to 1s, each bucket 10 times as big as the last.
75        Histogram::new(exponential_buckets(0.1, 10., 5))
76    });
77    crate::metrics::default_registry().register(
78        "rpc_processing_time",
79        "Duration of RPC method call in milliseconds",
80        metric.clone(),
81    );
82    metric
83});
84
85pub async fn init_prometheus<DB>(
86    prometheus_listener: TcpListener,
87    db_directory: PathBuf,
88    db: Arc<DB>,
89    chain_config: Arc<ChainConfig>,
90    get_chain_head_height: Arc<impl Fn() -> ChainEpoch + Send + Sync + 'static>,
91    get_chain_head_actor_version: Arc<impl Fn() -> u64 + Send + Sync + 'static>,
92) -> anyhow::Result<()>
93where
94    DB: DBStatistics + Send + Sync + 'static,
95{
96    // Add the process collector to the registry
97    if let Err(err) = kubert_prometheus_process::register(
98        collector_registry().sub_registry_with_prefix("process"),
99    ) {
100        warn!("Failed to register process metrics: {err}");
101    }
102
103    register_collector(Box::new(
104        crate::utils::version::ForestVersionCollector::new(),
105    ));
106    register_collector(Box::new(crate::metrics::db::DBCollector::new(db_directory)));
107    register_collector(Box::new(
108        crate::networks::metrics::NetworkVersionCollector::new(
109            chain_config,
110            get_chain_head_height,
111            get_chain_head_actor_version,
112        ),
113    ));
114
115    // Create an configure HTTP server
116    let app = Router::new()
117        .route("/metrics", get(collect_prometheus_metrics))
118        .route("/stats/db", get(collect_db_metrics::<DB>))
119        .layer(CompressionLayer::new())
120        .with_state(db);
121
122    // Wait for server to exit
123    Ok(axum::serve(prometheus_listener, app.into_make_service()).await?)
124}
125
126async fn collect_prometheus_metrics() -> impl IntoResponse {
127    let mut metrics = String::new();
128    if let Err(e) =
129        prometheus_client::encoding::text::encode_registry(&mut metrics, &DEFAULT_REGISTRY.read())
130    {
131        warn!("failed to encode the default metrics registry: {e}");
132    };
133    if let Err(e) =
134        prometheus_client::encoding::text::encode_registry(&mut metrics, &COLLECTOR_REGISTRY.read())
135    {
136        warn!("failed to encode the collector metrics registry: {e}");
137    };
138    if let Err(e) = prometheus_client::encoding::text::encode_eof(&mut metrics) {
139        warn!("failed to encode metrics eof {e}");
140    };
141    (
142        StatusCode::OK,
143        [("content-type", "text/plain; charset=utf-8")],
144        metrics,
145    )
146}
147
148async fn collect_db_metrics<DB>(
149    axum::extract::State(db): axum::extract::State<Arc<DB>>,
150) -> impl IntoResponse
151where
152    DB: DBStatistics,
153{
154    let mut metrics = "# DB statistics:\n".to_owned();
155    if let Some(db_stats) = db.get_statistics() {
156        metrics.push_str(&db_stats);
157    } else {
158        metrics.push_str("Not enabled. Set enable_statistics to true in config and restart daemon");
159    }
160    (
161        StatusCode::OK,
162        [("content-type", "text/plain; charset=utf-8")],
163        metrics,
164    )
165}
166
167#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
168pub struct RpcMethodLabel {
169    pub method: String,
170}
171
172#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
173pub struct KindLabel {
174    kind: &'static str,
175}
176
177impl KindLabel {
178    pub const fn new(kind: &'static str) -> Self {
179        Self { kind }
180    }
181}
182
183pub mod values {
184    use super::KindLabel;
185
186    /// `TipsetCache`.
187    pub const TIPSET: KindLabel = KindLabel::new("tipset");
188    /// tipset cache in state manager
189    pub const STATE_MANAGER_TIPSET: KindLabel = KindLabel::new("sm_tipset");
190}
191
192pub fn default_histogram() -> Histogram {
193    // Default values from go client(https://github.com/prometheus/client_golang/blob/5d584e2717ef525673736d72cd1d12e304f243d7/prometheus/histogram.go#L68)
194    Histogram::new([
195        0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
196    ])
197}
198
199pub struct HistogramTimer<'a> {
200    histogram: &'a Histogram,
201    start: Instant,
202}
203
204impl Drop for HistogramTimer<'_> {
205    fn drop(&mut self) {
206        let duration = Instant::now() - self.start;
207        self.histogram.observe(duration.as_secs_f64());
208    }
209}
210
211pub trait HistogramTimerExt {
212    fn start_timer(&self) -> HistogramTimer<'_>;
213}
214
215impl HistogramTimerExt for Histogram {
216    fn start_timer(&self) -> HistogramTimer<'_> {
217        HistogramTimer {
218            histogram: self,
219            start: Instant::now(),
220        }
221    }
222}