telemetry_subscribers/
span_latency_prom.rs

1// Copyright (c) 2022, Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4//! This is a module that records Tokio-tracing [span](https://docs.rs/tracing/latest/tracing/span/index.html)
5//! latencies into Prometheus histograms directly.
6//! The name of the Prometheus histogram is "tracing_span_latencies[_sum/count/bucket]"
7//!
8//! There is also the tracing-timing crate, from which this differs significantly:
9//! - tracing-timing records latencies between events (logs).  We just want to record the latencies of spans.
10//! - tracing-timing does not output to Prometheus, and extracting data from its histograms takes extra CPU
11//! - tracing-timing records latencies using HDRHistogram, which is great, but uses extra memory when one
12//!   is already using Prometheus
13//! Thus this is a much smaller and more focused module.
14//!
15//! ## Making spans visible
16//! This module can only record latencies for spans that get created.  By default, this is controlled by
17//! env_filter and logging levels.
18
19use std::time::Instant;
20
21use prometheus::{exponential_buckets, register_histogram_vec_with_registry, Registry};
22use tracing::{span, Subscriber};
23
24/// A tokio_tracing Layer that records span latencies into Prometheus histograms
25pub struct PrometheusSpanLatencyLayer {
26    span_latencies: prometheus::HistogramVec,
27}
28
29#[derive(Debug)]
30pub enum PrometheusSpanError {
31    /// num_buckets must be positive >= 1
32    ZeroOrNegativeNumBuckets,
33    PromError(prometheus::Error),
34}
35
36impl From<prometheus::Error> for PrometheusSpanError {
37    fn from(err: prometheus::Error) -> Self {
38        Self::PromError(err)
39    }
40}
41
42const TOP_LATENCY_IN_NS: f64 = 300.0 * 1.0e9;
43const LOWEST_LATENCY_IN_NS: f64 = 500.0;
44
45impl PrometheusSpanLatencyLayer {
46    /// Create a new layer, injecting latencies into the given registry.
47    /// The num_buckets controls how many buckets thus how much memory and time series one
48    /// uses up in Prometheus (and in the application).  10 is probably a minimum.
49    pub fn try_new(registry: &Registry, num_buckets: usize) -> Result<Self, PrometheusSpanError> {
50        if num_buckets < 1 {
51            return Err(PrometheusSpanError::ZeroOrNegativeNumBuckets);
52        }
53
54        // Histogram for span latencies must accommodate a wide range of possible latencies, so
55        // don't use the default Prometheus buckets.  Latencies in NS.  Calculate the multiplier
56        // to go from LOWEST to TOP in num_bucket steps, step n+1 = step n * factor.
57        let factor = (TOP_LATENCY_IN_NS / LOWEST_LATENCY_IN_NS).powf(1.0 / (num_buckets as f64));
58        let buckets = exponential_buckets(LOWEST_LATENCY_IN_NS, factor, num_buckets)?;
59        let span_latencies = register_histogram_vec_with_registry!(
60            "tracing_span_latencies",
61            "Latencies from tokio-tracing spans",
62            &["span_name"],
63            buckets,
64            registry
65        )?;
66        Ok(Self { span_latencies })
67    }
68}
69
70struct PromSpanTimestamp(Instant);
71
72impl<S> tracing_subscriber::Layer<S> for PrometheusSpanLatencyLayer
73where
74    S: Subscriber + for<'span> tracing_subscriber::registry::LookupSpan<'span>,
75{
76    fn on_new_span(
77        &self,
78        _attrs: &span::Attributes,
79        id: &span::Id,
80        ctx: tracing_subscriber::layer::Context<S>,
81    ) {
82        let span = ctx.span(id).unwrap();
83        // NOTE: there are other extensions that insert timings.  For example,
84        // tracing_subscriber's with_span_events() inserts events at open and close that contain timings.
85        // However, we cannot be guaranteed that those events would be turned on.
86        span.extensions_mut()
87            .insert(PromSpanTimestamp(Instant::now()));
88    }
89
90    fn on_close(&self, id: span::Id, ctx: tracing_subscriber::layer::Context<'_, S>) {
91        let span = ctx.span(&id).unwrap();
92        let start_time = span
93            .extensions()
94            .get::<PromSpanTimestamp>()
95            .expect("Could not find saved timestamp on span")
96            .0;
97        let elapsed_ns = start_time.elapsed().as_nanos() as u64;
98        self.span_latencies
99            .with_label_values(&[span.name()])
100            .observe(elapsed_ns as f64);
101    }
102}
103
104#[cfg(test)]
105mod tests {
106    use super::*;
107
108    #[test]
109    fn test_prom_span_latency_init() {
110        let registry = prometheus::Registry::new();
111
112        let res = PrometheusSpanLatencyLayer::try_new(&registry, 0);
113        assert!(matches!(
114            res,
115            Err(PrometheusSpanError::ZeroOrNegativeNumBuckets)
116        ));
117    }
118}