// tailtriage_core/events.rs

use serde::{Deserialize, Serialize};

use crate::{CaptureMode, EffectiveCoreConfig};

/// Current schema version for `Run` JSON artifacts.
///
/// `Run::new` stamps this value into `Run::schema_version`.
pub const SCHEMA_VERSION: u64 = 1;

/// Logical request outcome categories used by the public API.
///
/// Built-in variants map to fixed lowercase labels (see [`Outcome::as_str`]);
/// [`Outcome::Other`] carries a caller-chosen label verbatim.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Outcome {
    /// Request completed successfully.
    Ok,
    /// Request completed with an error.
    Error,
    /// Request exceeded a timeout threshold.
    Timeout,
    /// Request was cancelled before completion.
    Cancelled,
    /// Request was rejected before normal execution.
    Rejected,
    /// Caller-provided custom outcome label.
    Other(String),
}

impl Outcome {
    /// Returns the canonical string label for this outcome.
    ///
    /// Built-in variants yield a fixed lowercase label; `Other` yields the
    /// caller-provided string unchanged.
    #[must_use]
    pub fn as_str(&self) -> &str {
        match self {
            Self::Ok => "ok",
            Self::Error => "error",
            Self::Timeout => "timeout",
            Self::Cancelled => "cancelled",
            Self::Rejected => "rejected",
            Self::Other(value) => value.as_str(),
        }
    }

    /// Converts this outcome into an owned string label.
    ///
    /// `Other` hands back its inner `String` as-is (no reallocation); every
    /// built-in variant produces an owned copy of its [`Outcome::as_str`] label.
    #[must_use]
    pub fn into_string(self) -> String {
        match self {
            Self::Other(label) => label,
            // `as_str` is the single source of truth for built-in labels, so the
            // borrowed and owned forms cannot drift apart when variants change.
            builtin => builtin.as_str().to_owned(),
        }
    }
}

53/// A full output artifact for one tailtriage capture run.
54#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
55pub struct Run {
56    /// Run artifact schema version.
57    pub schema_version: u64,
58    /// Metadata for the capture session.
59    pub metadata: RunMetadata,
60    /// Request timing events.
61    pub requests: Vec<RequestEvent>,
62    /// Stage timing events.
63    pub stages: Vec<StageEvent>,
64    /// Queue wait timing events.
65    pub queues: Vec<QueueEvent>,
66    /// In-flight gauge changes over time.
67    pub inflight: Vec<InFlightSnapshot>,
68    /// Tokio runtime metrics snapshots.
69    pub runtime_snapshots: Vec<RuntimeSnapshot>,
70    /// Capture truncation summary for bounded collection.
71    #[serde(default)]
72    pub truncation: TruncationSummary,
73}
74
75impl Run {
76    /// Creates an empty run with the provided metadata.
77    #[must_use]
78    pub fn new(metadata: RunMetadata) -> Self {
79        Self {
80            schema_version: SCHEMA_VERSION,
81            metadata,
82            requests: Vec::new(),
83            stages: Vec::new(),
84            queues: Vec::new(),
85            inflight: Vec::new(),
86            runtime_snapshots: Vec::new(),
87            truncation: TruncationSummary::default(),
88        }
89    }
90}
91
92/// Per-section counters indicating dropped samples due to capture limits.
93#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
94pub struct TruncationSummary {
95    /// Whether any capture limit was reached during this run.
96    #[serde(default)]
97    pub limits_hit: bool,
98    /// Number of request events dropped after `max_requests` was reached.
99    pub dropped_requests: u64,
100    /// Number of stage events dropped after `max_stages` was reached.
101    pub dropped_stages: u64,
102    /// Number of queue events dropped after `max_queues` was reached.
103    pub dropped_queues: u64,
104    /// Number of in-flight snapshots dropped after `max_inflight_snapshots` was reached.
105    pub dropped_inflight_snapshots: u64,
106    /// Number of runtime snapshots dropped after `max_runtime_snapshots` was reached.
107    pub dropped_runtime_snapshots: u64,
108}
109
110impl TruncationSummary {
111    /// Returns true when any capture section was truncated.
112    #[must_use]
113    pub const fn is_truncated(&self) -> bool {
114        self.limits_hit
115            || self.dropped_requests > 0
116            || self.dropped_stages > 0
117            || self.dropped_queues > 0
118            || self.dropped_inflight_snapshots > 0
119            || self.dropped_runtime_snapshots > 0
120    }
121}
122
123/// Top-level metadata for one capture run.
124#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
125pub struct RunMetadata {
126    /// Identifier for the run.
127    ///
128    /// When not supplied by the caller, `tailtriage-core` generates a UUID-based
129    /// identifier.
130    pub run_id: String,
131    /// Service/application name.
132    pub service_name: String,
133    /// Optional service version.
134    pub service_version: Option<String>,
135    /// Timestamp (milliseconds since epoch UTC) when collection started.
136    pub started_at_unix_ms: u64,
137    /// Timestamp (milliseconds since epoch UTC) when collection ended.
138    ///
139    /// During active capture, in-memory snapshots may still show the start-time
140    /// placeholder. `shutdown()` writes the finalized end timestamp to the
141    /// persisted artifact.
142    pub finished_at_unix_ms: u64,
143    /// Finalization timestamp (milliseconds since epoch UTC) for persisted artifacts.
144    ///
145    /// This is `None` for active in-memory snapshots and for older artifacts
146    /// written before this field existed.
147    #[serde(default)]
148    pub finalized_at_unix_ms: Option<u64>,
149    /// Capture mode, such as "light" or "investigation".
150    pub mode: CaptureMode,
151    /// Effective resolved core configuration after applying mode defaults and overrides.
152    ///
153    /// This field may be `None` for older artifacts that predate effective config capture.
154    #[serde(default)]
155    pub effective_core_config: Option<EffectiveCoreConfig>,
156    /// Effective resolved Tokio runtime sampler configuration for this run.
157    ///
158    /// This field is set only when a Tokio sampler is configured and started.
159    /// It may be `None` for runs without Tokio sampling and for older artifacts.
160    #[serde(default)]
161    pub effective_tokio_sampler_config: Option<EffectiveTokioSamplerConfig>,
162    /// Hostname captured at run creation when available as valid UTF-8.
163    pub host: Option<String>,
164    /// Process identifier if available.
165    pub pid: Option<u32>,
166    /// Lifecycle warnings generated during shutdown validation.
167    #[serde(default)]
168    pub lifecycle_warnings: Vec<String>,
169    /// Incomplete request summary captured at shutdown.
170    #[serde(default)]
171    pub unfinished_requests: UnfinishedRequests,
172    /// Why the run lifecycle ended.
173    ///
174    /// This field may be `None` for older artifacts and for runs that do not
175    /// record an explicit end reason (including direct `tailtriage-core` runs today).
176    #[serde(default)]
177    pub run_end_reason: Option<RunEndReason>,
178}
179
180/// Run lifecycle end reason recorded in artifact metadata.
181#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
182#[serde(rename_all = "snake_case")]
183pub enum RunEndReason {
184    /// Run ended because capture was disarmed manually.
185    ManualDisarm,
186    /// Run ended because process/controller shutdown finalized capture.
187    Shutdown,
188    /// Run auto-sealed after hitting capture limits.
189    AutoSealOnLimitsHit,
190}
191
192/// Stable, resolved Tokio runtime sampler configuration used by one run.
193#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
194pub struct EffectiveTokioSamplerConfig {
195    /// Capture mode selected in `tailtriage-core` that Tokio can inherit from.
196    pub inherited_mode: CaptureMode,
197    /// Optional explicit Tokio-side mode override.
198    pub explicit_mode_override: Option<CaptureMode>,
199    /// Effective mode used to resolve Tokio sampler defaults.
200    pub resolved_mode: CaptureMode,
201    /// Effective runtime sampler cadence in milliseconds.
202    pub resolved_sampler_cadence_ms: u64,
203    /// Effective runtime snapshot retention used by Tokio sampler.
204    pub resolved_runtime_snapshot_retention: usize,
205}
206
207/// Summary of unfinished requests detected at shutdown.
208#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
209pub struct UnfinishedRequests {
210    /// Count of requests still pending when shutdown ran.
211    pub count: u64,
212    /// Small sample of unfinished requests for debugging.
213    pub sample: Vec<UnfinishedRequestSample>,
214}
215
216/// One unfinished request sample captured for lifecycle warnings.
217#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
218pub struct UnfinishedRequestSample {
219    /// Correlation ID for the unfinished request.
220    pub request_id: String,
221    /// Route or operation name associated with the unfinished request.
222    pub route: String,
223}
224
225/// Per-request timing and status.
226#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
227pub struct RequestEvent {
228    /// Correlation ID for the request.
229    pub request_id: String,
230    /// Route name, operation, or endpoint.
231    pub route: String,
232    /// Semantic request kind.
233    pub kind: Option<String>,
234    /// Request start timestamp (milliseconds since epoch UTC).
235    pub started_at_unix_ms: u64,
236    /// Request completion timestamp (milliseconds since epoch UTC).
237    pub finished_at_unix_ms: u64,
238    /// Total request latency in microseconds.
239    pub latency_us: u64,
240    /// Logical outcome such as "ok", "error", or "timeout".
241    pub outcome: String,
242}
243
244/// Timing record for one named stage.
245#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
246pub struct StageEvent {
247    /// Parent request ID.
248    pub request_id: String,
249    /// Stage identifier.
250    pub stage: String,
251    /// Stage start timestamp (milliseconds since epoch UTC).
252    pub started_at_unix_ms: u64,
253    /// Stage completion timestamp (milliseconds since epoch UTC).
254    pub finished_at_unix_ms: u64,
255    /// Stage latency in microseconds.
256    pub latency_us: u64,
257    /// Whether the stage completed successfully (`Result::is_ok()` for
258    /// `StageTimer::await_on`, always `true` for `StageTimer::await_value`).
259    pub success: bool,
260}
261
262/// Queue wait measurement for a request waiting on a queue/permit.
263#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
264pub struct QueueEvent {
265    /// Parent request ID.
266    pub request_id: String,
267    /// Queue identifier.
268    pub queue: String,
269    /// Queue wait start timestamp (milliseconds since epoch UTC).
270    pub waited_from_unix_ms: u64,
271    /// Queue wait end timestamp (milliseconds since epoch UTC).
272    pub waited_until_unix_ms: u64,
273    /// Total wait time in microseconds.
274    pub wait_us: u64,
275    /// Queue depth sample captured at wait start, if known.
276    pub depth_at_start: Option<u64>,
277}
278
279/// Point-in-time in-flight gauge reading.
280#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
281pub struct InFlightSnapshot {
282    /// Gauge name.
283    pub gauge: String,
284    /// Timestamp (milliseconds since epoch UTC).
285    pub at_unix_ms: u64,
286    /// Number of in-flight units.
287    pub count: u64,
288}
289
290/// Point-in-time runtime metrics sample.
291#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
292pub struct RuntimeSnapshot {
293    /// Timestamp (milliseconds since epoch UTC).
294    pub at_unix_ms: u64,
295    /// Number of alive tasks.
296    pub alive_tasks: Option<u64>,
297    /// Runtime global queue depth.
298    pub global_queue_depth: Option<u64>,
299    /// Aggregated runtime local queue depth across worker threads.
300    pub local_queue_depth: Option<u64>,
301    /// Runtime blocking pool queue depth.
302    pub blocking_queue_depth: Option<u64>,
303    /// Runtime remote schedule count.
304    pub remote_schedule_count: Option<u64>,
305}