1pub mod db;
5
6use crate::{db::DBStatistics, networks::ChainConfig, shim::clock::ChainEpoch};
7use axum::{Router, http::StatusCode, response::IntoResponse, routing::get};
8use parking_lot::{RwLock, RwLockWriteGuard};
9use prometheus_client::{
10 collector::Collector,
11 encoding::EncodeLabelSet,
12 metrics::{
13 counter::Counter,
14 family::Family,
15 histogram::{Histogram, exponential_buckets},
16 },
17};
18use std::sync::{Arc, LazyLock};
19use std::{path::PathBuf, time::Instant};
20use tokio::net::TcpListener;
21use tower_http::compression::CompressionLayer;
22use tracing::warn;
23
24static DEFAULT_REGISTRY: LazyLock<RwLock<prometheus_client::registry::Registry>> =
25 LazyLock::new(Default::default);
26
27static COLLECTOR_REGISTRY: LazyLock<RwLock<prometheus_client::registry::Registry>> =
28 LazyLock::new(Default::default);
29
30pub fn default_registry<'a>() -> RwLockWriteGuard<'a, prometheus_client::registry::Registry> {
31 DEFAULT_REGISTRY.write()
32}
33
34pub fn collector_registry<'a>() -> RwLockWriteGuard<'a, prometheus_client::registry::Registry> {
35 COLLECTOR_REGISTRY.write()
36}
37
38pub fn register_collector(collector: Box<dyn Collector>) {
39 #[allow(clippy::disallowed_methods)]
40 collector_registry().register_collector(collector)
41}
42
43pub fn reset_collector_registry() {
44 *collector_registry() = Default::default();
45}
46
47pub static LRU_CACHE_HIT: LazyLock<Family<KindLabel, Counter>> = LazyLock::new(|| {
48 let metric = Family::default();
49 DEFAULT_REGISTRY
50 .write()
51 .register("lru_cache_hit", "Stats of lru cache hit", metric.clone());
52 metric
53});
54pub static LRU_CACHE_MISS: LazyLock<Family<KindLabel, Counter>> = LazyLock::new(|| {
55 let metric = Family::default();
56 DEFAULT_REGISTRY
57 .write()
58 .register("lru_cache_miss", "Stats of lru cache miss", metric.clone());
59 metric
60});
61
62pub static RPC_METHOD_FAILURE: LazyLock<Family<RpcMethodLabel, Counter>> = LazyLock::new(|| {
63 let metric = Family::default();
64 DEFAULT_REGISTRY.write().register(
65 "rpc_method_failure",
66 "Number of failed RPC calls",
67 metric.clone(),
68 );
69 metric
70});
71
72pub static RPC_METHOD_TIME: LazyLock<Family<RpcMethodLabel, Histogram>> = LazyLock::new(|| {
73 let metric = Family::<RpcMethodLabel, Histogram>::new_with_constructor(|| {
74 Histogram::new(exponential_buckets(0.1, 10., 5))
76 });
77 crate::metrics::default_registry().register(
78 "rpc_processing_time",
79 "Duration of RPC method call in milliseconds",
80 metric.clone(),
81 );
82 metric
83});
84
85pub async fn init_prometheus<DB>(
86 prometheus_listener: TcpListener,
87 db_directory: PathBuf,
88 db: Arc<DB>,
89 chain_config: Arc<ChainConfig>,
90 get_chain_head_height: Arc<impl Fn() -> ChainEpoch + Send + Sync + 'static>,
91 get_chain_head_actor_version: Arc<impl Fn() -> u64 + Send + Sync + 'static>,
92) -> anyhow::Result<()>
93where
94 DB: DBStatistics + Send + Sync + 'static,
95{
96 if let Err(err) = kubert_prometheus_process::register(
98 collector_registry().sub_registry_with_prefix("process"),
99 ) {
100 warn!("Failed to register process metrics: {err}");
101 }
102
103 register_collector(Box::new(
104 crate::utils::version::ForestVersionCollector::new(),
105 ));
106 register_collector(Box::new(crate::metrics::db::DBCollector::new(db_directory)));
107 register_collector(Box::new(
108 crate::networks::metrics::NetworkVersionCollector::new(
109 chain_config,
110 get_chain_head_height,
111 get_chain_head_actor_version,
112 ),
113 ));
114
115 let app = Router::new()
117 .route("/metrics", get(collect_prometheus_metrics))
118 .route("/stats/db", get(collect_db_metrics::<DB>))
119 .layer(CompressionLayer::new())
120 .with_state(db);
121
122 Ok(axum::serve(prometheus_listener, app.into_make_service()).await?)
124}
125
126async fn collect_prometheus_metrics() -> impl IntoResponse {
127 let mut metrics = String::new();
128 if let Err(e) =
129 prometheus_client::encoding::text::encode_registry(&mut metrics, &DEFAULT_REGISTRY.read())
130 {
131 warn!("failed to encode the default metrics registry: {e}");
132 };
133 if let Err(e) =
134 prometheus_client::encoding::text::encode_registry(&mut metrics, &COLLECTOR_REGISTRY.read())
135 {
136 warn!("failed to encode the collector metrics registry: {e}");
137 };
138 if let Err(e) = prometheus_client::encoding::text::encode_eof(&mut metrics) {
139 warn!("failed to encode metrics eof {e}");
140 };
141 (
142 StatusCode::OK,
143 [("content-type", "text/plain; charset=utf-8")],
144 metrics,
145 )
146}
147
148async fn collect_db_metrics<DB>(
149 axum::extract::State(db): axum::extract::State<Arc<DB>>,
150) -> impl IntoResponse
151where
152 DB: DBStatistics,
153{
154 let mut metrics = "# DB statistics:\n".to_owned();
155 if let Some(db_stats) = db.get_statistics() {
156 metrics.push_str(&db_stats);
157 } else {
158 metrics.push_str("Not enabled. Set enable_statistics to true in config and restart daemon");
159 }
160 (
161 StatusCode::OK,
162 [("content-type", "text/plain; charset=utf-8")],
163 metrics,
164 )
165}
166
/// Label set with a single `method` label.
///
/// Used as the key of the [`RPC_METHOD_FAILURE`] and [`RPC_METHOD_TIME`] metric families.
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct RpcMethodLabel {
    /// Value of the `method` label (presumably the RPC method name; confirm at call sites).
    pub method: String,
}
171
/// Label set with a single `kind` label whose value is a static string.
///
/// Used as the key of the [`LRU_CACHE_HIT`] and [`LRU_CACHE_MISS`] metric families.
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct KindLabel {
    /// Value of the `kind` label.
    kind: &'static str,
}
176
177impl KindLabel {
178 pub const fn new(kind: &'static str) -> Self {
179 Self { kind }
180 }
181}
182
/// Predefined [`KindLabel`] constants.
pub mod values {
    use super::KindLabel;

    /// Label with `kind = "tipset"`.
    pub const TIPSET: KindLabel = KindLabel::new("tipset");
    /// Label with `kind = "sm_tipset"` (presumably the state manager's tipset cache; confirm
    /// at call sites).
    pub const STATE_MANAGER_TIPSET: KindLabel = KindLabel::new("sm_tipset");
}
191
192pub fn default_histogram() -> Histogram {
193 Histogram::new([
195 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
196 ])
197}
198
199pub struct HistogramTimer<'a> {
200 histogram: &'a Histogram,
201 start: Instant,
202}
203
204impl Drop for HistogramTimer<'_> {
205 fn drop(&mut self) {
206 let duration = Instant::now() - self.start;
207 self.histogram.observe(duration.as_secs_f64());
208 }
209}
210
/// Extension trait that adds scope-based timing to a histogram.
pub trait HistogramTimerExt {
    /// Starts a timer that borrows `self`.
    ///
    /// The returned [`HistogramTimer`] observes the elapsed time, in seconds, when it is
    /// dropped.
    fn start_timer(&self) -> HistogramTimer<'_>;
}
214
215impl HistogramTimerExt for Histogram {
216 fn start_timer(&self) -> HistogramTimer<'_> {
217 HistogramTimer {
218 histogram: self,
219 start: Instant::now(),
220 }
221 }
222}