// tailtriage_core/events.rs
1use serde::{Deserialize, Serialize};
2
3use crate::{CaptureMode, EffectiveCoreConfig};
4
/// Schema version stamped into `Run` JSON artifacts.
///
/// [`Run::new`] copies this value into the `schema_version` field of every run it creates.
pub const SCHEMA_VERSION: u64 = 1;
7
/// Logical outcome category of a request, as used by the public API.
///
/// The five built-in variants cover the common cases; [`Outcome::Other`]
/// carries any caller-defined label.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Outcome {
    /// The request finished successfully.
    Ok,
    /// The request finished with an error.
    Error,
    /// The request ran past a timeout threshold.
    Timeout,
    /// The request was cancelled before it completed.
    Cancelled,
    /// The request was rejected before normal execution began.
    Rejected,
    /// A custom outcome label supplied by the caller.
    Other(String),
}
24
25impl Outcome {
26 /// Returns the canonical string label for this outcome.
27 #[must_use]
28 pub fn as_str(&self) -> &str {
29 match self {
30 Self::Ok => "ok",
31 Self::Error => "error",
32 Self::Timeout => "timeout",
33 Self::Cancelled => "cancelled",
34 Self::Rejected => "rejected",
35 Self::Other(value) => value.as_str(),
36 }
37 }
38
39 /// Converts this outcome into an owned string label.
40 #[must_use]
41 pub fn into_string(self) -> String {
42 match self {
43 Self::Ok => "ok".to_string(),
44 Self::Error => "error".to_string(),
45 Self::Timeout => "timeout".to_string(),
46 Self::Cancelled => "cancelled".to_string(),
47 Self::Rejected => "rejected".to_string(),
48 Self::Other(value) => value,
49 }
50 }
51}
52
53/// A full output artifact for one tailtriage capture run.
54#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
55pub struct Run {
56 /// Run artifact schema version.
57 pub schema_version: u64,
58 /// Metadata for the capture session.
59 pub metadata: RunMetadata,
60 /// Request timing events.
61 pub requests: Vec<RequestEvent>,
62 /// Stage timing events.
63 pub stages: Vec<StageEvent>,
64 /// Queue wait timing events.
65 pub queues: Vec<QueueEvent>,
66 /// In-flight gauge changes over time.
67 pub inflight: Vec<InFlightSnapshot>,
68 /// Tokio runtime metrics snapshots.
69 pub runtime_snapshots: Vec<RuntimeSnapshot>,
70 /// Capture truncation summary for bounded collection.
71 #[serde(default)]
72 pub truncation: TruncationSummary,
73}
74
75impl Run {
76 /// Creates an empty run with the provided metadata.
77 #[must_use]
78 pub fn new(metadata: RunMetadata) -> Self {
79 Self {
80 schema_version: SCHEMA_VERSION,
81 metadata,
82 requests: Vec::new(),
83 stages: Vec::new(),
84 queues: Vec::new(),
85 inflight: Vec::new(),
86 runtime_snapshots: Vec::new(),
87 truncation: TruncationSummary::default(),
88 }
89 }
90}
91
92/// Per-section counters indicating dropped samples due to capture limits.
93#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
94pub struct TruncationSummary {
95 /// Whether any capture limit was reached during this run.
96 #[serde(default)]
97 pub limits_hit: bool,
98 /// Number of request events dropped after `max_requests` was reached.
99 pub dropped_requests: u64,
100 /// Number of stage events dropped after `max_stages` was reached.
101 pub dropped_stages: u64,
102 /// Number of queue events dropped after `max_queues` was reached.
103 pub dropped_queues: u64,
104 /// Number of in-flight snapshots dropped after `max_inflight_snapshots` was reached.
105 pub dropped_inflight_snapshots: u64,
106 /// Number of runtime snapshots dropped after `max_runtime_snapshots` was reached.
107 pub dropped_runtime_snapshots: u64,
108}
109
110impl TruncationSummary {
111 /// Returns true when any capture section was truncated.
112 #[must_use]
113 pub const fn is_truncated(&self) -> bool {
114 self.limits_hit
115 || self.dropped_requests > 0
116 || self.dropped_stages > 0
117 || self.dropped_queues > 0
118 || self.dropped_inflight_snapshots > 0
119 || self.dropped_runtime_snapshots > 0
120 }
121}
122
123/// Top-level metadata for one capture run.
124#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
125pub struct RunMetadata {
126 /// Identifier for the run.
127 ///
128 /// When not supplied by the caller, `tailtriage-core` generates a UUID-based
129 /// identifier.
130 pub run_id: String,
131 /// Service/application name.
132 pub service_name: String,
133 /// Optional service version.
134 pub service_version: Option<String>,
135 /// Timestamp (milliseconds since epoch UTC) when collection started.
136 pub started_at_unix_ms: u64,
137 /// Timestamp (milliseconds since epoch UTC) when collection ended.
138 ///
139 /// During active capture, in-memory snapshots may still show the start-time
140 /// placeholder. `shutdown()` writes the finalized end timestamp to the
141 /// persisted artifact.
142 pub finished_at_unix_ms: u64,
143 /// Finalization timestamp (milliseconds since epoch UTC) for persisted artifacts.
144 ///
145 /// This is `None` for active in-memory snapshots and for older artifacts
146 /// written before this field existed.
147 #[serde(default)]
148 pub finalized_at_unix_ms: Option<u64>,
149 /// Capture mode, such as "light" or "investigation".
150 pub mode: CaptureMode,
151 /// Effective resolved core configuration after applying mode defaults and overrides.
152 ///
153 /// This field may be `None` for older artifacts that predate effective config capture.
154 #[serde(default)]
155 pub effective_core_config: Option<EffectiveCoreConfig>,
156 /// Effective resolved Tokio runtime sampler configuration for this run.
157 ///
158 /// This field is set only when a Tokio sampler is configured and started.
159 /// It may be `None` for runs without Tokio sampling and for older artifacts.
160 #[serde(default)]
161 pub effective_tokio_sampler_config: Option<EffectiveTokioSamplerConfig>,
162 /// Hostname captured at run creation when available as valid UTF-8.
163 pub host: Option<String>,
164 /// Process identifier if available.
165 pub pid: Option<u32>,
166 /// Lifecycle warnings generated during shutdown validation.
167 #[serde(default)]
168 pub lifecycle_warnings: Vec<String>,
169 /// Incomplete request summary captured at shutdown.
170 #[serde(default)]
171 pub unfinished_requests: UnfinishedRequests,
172 /// Why the run lifecycle ended.
173 ///
174 /// This field may be `None` for older artifacts and for runs that do not
175 /// record an explicit end reason (including direct `tailtriage-core` runs today).
176 #[serde(default)]
177 pub run_end_reason: Option<RunEndReason>,
178}
179
180/// Run lifecycle end reason recorded in artifact metadata.
181#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
182#[serde(rename_all = "snake_case")]
183pub enum RunEndReason {
184 /// Run ended because capture was disarmed manually.
185 ManualDisarm,
186 /// Run ended because process/controller shutdown finalized capture.
187 Shutdown,
188 /// Run auto-sealed after hitting capture limits.
189 AutoSealOnLimitsHit,
190}
191
192/// Stable, resolved Tokio runtime sampler configuration used by one run.
193#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
194pub struct EffectiveTokioSamplerConfig {
195 /// Capture mode selected in `tailtriage-core` that Tokio can inherit from.
196 pub inherited_mode: CaptureMode,
197 /// Optional explicit Tokio-side mode override.
198 pub explicit_mode_override: Option<CaptureMode>,
199 /// Effective mode used to resolve Tokio sampler defaults.
200 pub resolved_mode: CaptureMode,
201 /// Effective runtime sampler cadence in milliseconds.
202 pub resolved_sampler_cadence_ms: u64,
203 /// Effective runtime snapshot retention used by Tokio sampler.
204 pub resolved_runtime_snapshot_retention: usize,
205}
206
207/// Summary of unfinished requests detected at shutdown.
208#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
209pub struct UnfinishedRequests {
210 /// Count of requests still pending when shutdown ran.
211 pub count: u64,
212 /// Small sample of unfinished requests for debugging.
213 pub sample: Vec<UnfinishedRequestSample>,
214}
215
216/// One unfinished request sample captured for lifecycle warnings.
217#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
218pub struct UnfinishedRequestSample {
219 /// Correlation ID for the unfinished request.
220 pub request_id: String,
221 /// Route or operation name associated with the unfinished request.
222 pub route: String,
223}
224
225/// Per-request timing and status.
226#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
227pub struct RequestEvent {
228 /// Correlation ID for the request.
229 pub request_id: String,
230 /// Route name, operation, or endpoint.
231 pub route: String,
232 /// Semantic request kind.
233 pub kind: Option<String>,
234 /// Request start timestamp (milliseconds since epoch UTC).
235 pub started_at_unix_ms: u64,
236 /// Request completion timestamp (milliseconds since epoch UTC).
237 pub finished_at_unix_ms: u64,
238 /// Total request latency in microseconds.
239 pub latency_us: u64,
240 /// Logical outcome such as "ok", "error", or "timeout".
241 pub outcome: String,
242}
243
244/// Timing record for one named stage.
245#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
246pub struct StageEvent {
247 /// Parent request ID.
248 pub request_id: String,
249 /// Stage identifier.
250 pub stage: String,
251 /// Stage start timestamp (milliseconds since epoch UTC).
252 pub started_at_unix_ms: u64,
253 /// Stage completion timestamp (milliseconds since epoch UTC).
254 pub finished_at_unix_ms: u64,
255 /// Stage latency in microseconds.
256 pub latency_us: u64,
257 /// Whether the stage completed successfully (`Result::is_ok()` for
258 /// `StageTimer::await_on`, always `true` for `StageTimer::await_value`).
259 pub success: bool,
260}
261
262/// Queue wait measurement for a request waiting on a queue/permit.
263#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
264pub struct QueueEvent {
265 /// Parent request ID.
266 pub request_id: String,
267 /// Queue identifier.
268 pub queue: String,
269 /// Queue wait start timestamp (milliseconds since epoch UTC).
270 pub waited_from_unix_ms: u64,
271 /// Queue wait end timestamp (milliseconds since epoch UTC).
272 pub waited_until_unix_ms: u64,
273 /// Total wait time in microseconds.
274 pub wait_us: u64,
275 /// Queue depth sample captured at wait start, if known.
276 pub depth_at_start: Option<u64>,
277}
278
279/// Point-in-time in-flight gauge reading.
280#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
281pub struct InFlightSnapshot {
282 /// Gauge name.
283 pub gauge: String,
284 /// Timestamp (milliseconds since epoch UTC).
285 pub at_unix_ms: u64,
286 /// Number of in-flight units.
287 pub count: u64,
288}
289
290/// Point-in-time runtime metrics sample.
291#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
292pub struct RuntimeSnapshot {
293 /// Timestamp (milliseconds since epoch UTC).
294 pub at_unix_ms: u64,
295 /// Number of alive tasks.
296 pub alive_tasks: Option<u64>,
297 /// Runtime global queue depth.
298 pub global_queue_depth: Option<u64>,
299 /// Aggregated runtime local queue depth across worker threads.
300 pub local_queue_depth: Option<u64>,
301 /// Runtime blocking pool queue depth.
302 pub blocking_queue_depth: Option<u64>,
303 /// Runtime remote schedule count.
304 pub remote_schedule_count: Option<u64>,
305}