reddb_server/telemetry/mod.rs
1//! Structured logging façade over `tracing` + `tracing-subscriber`.
2//!
3//! Entry point for the `red` binary and any embedder that wants RedDB
4//! to manage logging on its behalf. Sets up two layers:
5//!
6//! - **stderr** — pretty (TTY) or JSON (piped/CI) formatted logs
7//! - **file** — optional daily-rotating file in `log_dir`, non-blocking
8//! writer backed by `tracing-appender`
9//!
10//! A background janitor purges rotated files older than
11//! `rotation_keep_days` every hour.
12//!
13//! The returned `Option<TelemetryGuard>` must live for the process
14//! lifetime — dropping it flushes the non-blocking buffer so no log
15//! lines are lost on graceful shutdown.
16
17use std::path::PathBuf;
18
19use tracing_appender::non_blocking::{NonBlockingBuilder, WorkerGuard};
20use tracing_subscriber::fmt::writer::MakeWriterExt;
21use tracing_subscriber::prelude::*;
22use tracing_subscriber::{fmt, EnvFilter, Registry};
23
/// Queue capacity, in buffered log lines, handed to
/// `NonBlockingBuilder::buffered_lines_limit` for every non-blocking
/// writer built in this module (the stderr writer and the optional
/// file writer).
///
/// Deliberately higher than the `tracing-appender` default of 128k:
/// the writers are created with `lossy(true)`, so once a queue is full
/// further lines are dropped silently, which log-heavy bursts (bulk
/// imports, CDC storms) were hitting at the default size.
///
/// 1M entries × ~200 bytes per event ≈ 200 MB worst-case RAM per
/// queue — fine for a server process, and dropped log lines are far
/// more painful than the memory. Note the limit applies to each writer
/// separately, so with the file sink enabled the theoretical worst
/// case is roughly double that figure.
const LOG_BUFFER_LINES: usize = 1_000_000;
32
33pub mod admin_intent_log;
34pub mod janitor;
35pub mod operator_event;
36pub mod operator_event_router;
37#[cfg(feature = "otel")]
38pub mod otel;
39pub mod slow_query_logger;
40pub mod slow_query_store;
41pub mod span;
42
/// Rendering applied to log output on stderr and in the file sink.
///
/// `Pretty` is the human-readable line format; `Json` emits one JSON
/// object per line (NDJSON), the shape log shippers such as Loki / ELK /
/// Datadog ingest.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LogFormat {
    Pretty,
    Json,
}

impl LogFormat {
    /// Resolve a format name, ignoring ASCII case.
    ///
    /// `pretty`, `text` and `human` select [`LogFormat::Pretty`];
    /// `json` and `ndjson` select [`LogFormat::Json`]. Any other input
    /// (including surrounding whitespace or an empty string) yields `None`.
    pub fn parse(s: &str) -> Option<Self> {
        const PRETTY_NAMES: [&str; 3] = ["pretty", "text", "human"];
        const JSON_NAMES: [&str; 2] = ["json", "ndjson"];

        // True when `s` equals one of the aliases, compared without
        // regard to ASCII case.
        let is_one_of = |names: &[&str]| names.iter().any(|name| s.eq_ignore_ascii_case(name));

        if is_one_of(&PRETTY_NAMES) {
            Some(Self::Pretty)
        } else if is_one_of(&JSON_NAMES) {
            Some(Self::Json)
        } else {
            None
        }
    }
}
60
/// Settings consumed by `init` to build the global logging subscriber,
/// plus bookkeeping flags recording which settings the operator passed
/// explicitly on the command line.
#[derive(Debug, Clone)]
pub struct TelemetryConfig {
    /// Directory for rotating log files. `None` = stderr-only (CLI
    /// one-shot / embedded default).
    pub log_dir: Option<PathBuf>,
    /// Prefix for rotated files; the `Default` impl sets `"reddb.log"`.
    /// NOTE(review): an empty prefix is meant to fall back to
    /// `"reddb.log"` — confirm where that substitution is applied.
    pub file_prefix: String,
    /// `RUST_LOG`-style filter expression. Example:
    /// `"info,reddb::wire=debug"`. `init` only uses it when no usable
    /// filter can be read from the environment.
    pub level_filter: String,
    /// stderr output format. File output always matches.
    pub format: LogFormat,
    /// Retention window for rotated files, in days; files older than
    /// this are deleted by the janitor. `init` does not start the
    /// janitor at all when this is `0`.
    pub rotation_keep_days: u16,
    /// Service name recorded under the `service` field.
    /// NOTE(review): within this file it is only attached to the
    /// startup event emitted by `init` — confirm whether other records
    /// are stamped elsewhere.
    pub service_name: &'static str,

    /// Per-invocation intent flags — set by the CLI parser to record
    /// which fields the operator explicitly passed. The config merge
    /// (red_config → CLI) uses these to decide whether a persisted
    /// `red.logging.*` value should be promoted.
    ///
    /// Not serialised; always recomputed per process start.
    pub level_explicit: bool,
    /// Intent flag for `format` (see `level_explicit`).
    pub format_explicit: bool,
    /// Intent flag for `rotation_keep_days` (see `level_explicit`).
    pub rotation_keep_days_explicit: bool,
    /// Intent flag for `file_prefix` (see `level_explicit`).
    pub file_prefix_explicit: bool,
    /// Intent flag for `log_dir` (see `level_explicit`).
    pub log_dir_explicit: bool,
    /// `--no-log-file` was passed: file sink must stay off regardless
    /// of `red.logging.dir`.
    pub log_file_disabled: bool,
}
93
94impl Default for TelemetryConfig {
95 fn default() -> Self {
96 Self {
97 log_dir: None,
98 file_prefix: "reddb.log".to_string(),
99 level_filter: "info".to_string(),
100 format: LogFormat::Pretty,
101 rotation_keep_days: 14,
102 service_name: "reddb",
103 level_explicit: false,
104 format_explicit: false,
105 rotation_keep_days_explicit: false,
106 file_prefix_explicit: false,
107 log_dir_explicit: false,
108 log_file_disabled: false,
109 }
110 }
111}
112
/// Opaque handle that keeps the non-blocking log writers alive.
/// Drop at process exit to flush the buffered records for stderr
/// AND the rotating file sink. Both writers run on their own
/// dedicated background threads — the hot path only pushes onto
/// an MPSC channel, never touches stdio syscalls directly.
///
/// Only handed out by `init` when the subscriber was actually
/// installed; if installation fails the guards are dropped inside
/// `init` instead.
pub struct TelemetryGuard {
    /// Guard for the stderr writer's worker; `init` always sets this
    /// to `Some` on the guard it returns.
    _stderr_worker: Option<WorkerGuard>,
    /// Guard for the rotating-file writer's worker; `None` when no
    /// file sink was set up.
    _file_worker: Option<WorkerGuard>,
}
122
123/// Install the global `tracing` subscriber. Idempotent: if another
124/// subscriber is already registered (e.g. an embedder set up its own),
125/// we silently return `None` and let the caller proceed.
126///
127/// RedDB embedders that want to own the subscriber should simply not
128/// call this.
129pub fn init(cfg: TelemetryConfig) -> Option<TelemetryGuard> {
130 let env_filter =
131 EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&cfg.level_filter));
132
133 // stderr — wrapped in `non_blocking` so the hot path never
134 // blocks on `write(2)` syscalls. A dedicated thread owns the
135 // stderr file descriptor and drains an MPSC channel; on a
136 // full buffer we'd rather drop a line than stall a request.
137 let (stderr_writer, stderr_worker) = NonBlockingBuilder::default()
138 .buffered_lines_limit(LOG_BUFFER_LINES)
139 .lossy(true)
140 .finish(std::io::stderr());
141
142 // Optional file layer + worker guard
143 let (file_writer_opt, file_worker) = match cfg.log_dir.as_ref() {
144 Some(dir) => {
145 if let Err(err) = std::fs::create_dir_all(dir) {
146 // Surface the failure to stderr directly — the
147 // subscriber isn't up yet, so tracing::warn! wouldn't
148 // land anywhere. Skip file logging and continue with
149 // stderr-only.
150 eprintln!(
151 "telemetry: failed to create log dir {}: {err}",
152 dir.display()
153 );
154 (None, None)
155 } else {
156 let file_appender = tracing_appender::rolling::daily(dir, &cfg.file_prefix);
157 let (writer, guard) = NonBlockingBuilder::default()
158 .buffered_lines_limit(LOG_BUFFER_LINES)
159 .lossy(true)
160 .finish(file_appender);
161
162 // Spawn retention janitor (if tokio runtime active).
163 if cfg.rotation_keep_days > 0 {
164 janitor::spawn(dir.clone(), cfg.file_prefix.clone(), cfg.rotation_keep_days);
165 }
166 (Some(writer), Some(guard))
167 }
168 }
169 None => (None, None),
170 };
171
172 // Build the subscriber. We commit to one format branch at a time —
173 // mixing pretty + json per-layer is rarely useful, and the branching
174 // keeps the type signatures tractable.
175 let result = match cfg.format {
176 LogFormat::Pretty => {
177 let stderr_layer = fmt::layer()
178 .with_writer(stderr_writer.clone())
179 .with_target(true)
180 .with_thread_ids(false)
181 .with_thread_names(false);
182 let base = Registry::default().with(env_filter).with(stderr_layer);
183 if let Some(writer) = file_writer_opt.clone() {
184 let file_layer = fmt::layer()
185 .with_writer(writer.with_max_level(tracing::Level::TRACE))
186 .with_target(true)
187 .with_ansi(false);
188 base.with(file_layer).try_init()
189 } else {
190 base.try_init()
191 }
192 }
193 LogFormat::Json => {
194 let stderr_json = fmt::layer()
195 .with_writer(stderr_writer.clone())
196 .with_target(true)
197 .with_thread_ids(false)
198 .with_thread_names(false)
199 .json()
200 .with_current_span(true)
201 .with_span_list(false);
202 let base = Registry::default().with(env_filter).with(stderr_json);
203 if let Some(writer) = file_writer_opt {
204 let file_json = fmt::layer()
205 .with_writer(writer.with_max_level(tracing::Level::TRACE))
206 .with_target(true)
207 .json()
208 .with_current_span(true)
209 .with_span_list(false);
210 base.with(file_json).try_init()
211 } else {
212 base.try_init()
213 }
214 }
215 };
216
217 if result.is_err() {
218 // Subscriber already set — library mode. That's fine.
219 return None;
220 }
221
222 // Root event so users know telemetry is alive.
223 tracing::info!(
224 service = cfg.service_name,
225 log_dir = cfg.log_dir.as_ref().map(|p| p.display().to_string()).unwrap_or_else(|| "<none>".into()),
226 format = ?cfg.format,
227 "telemetry initialised"
228 );
229
230 Some(TelemetryGuard {
231 _stderr_worker: Some(stderr_worker),
232 _file_worker: file_worker,
233 })
234}