tasker-orchestration 0.1.4

Orchestration system for tasker workflow coordination
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
//! # Health Status Types
//!
//! TAS-75 Phase 5: Shared types for the health monitoring subsystem.
//!
//! These types are used throughout the health module for:
//! - Database health status
//! - Channel saturation status
//! - Queue depth monitoring
//! - Backpressure decisions

use std::collections::HashMap;

// =============================================================================
// Queue Depth Types (migrated from web/state.rs)
// =============================================================================

/// Severity classification for the depth of a queue.
///
/// The tier condenses a queue depth into a single value that backpressure
/// decisions can be based on.
///
/// ## Unknown Variant
///
/// `Unknown` is the default tier and is used whenever the real queue depth
/// could not be determined:
/// - Lock contention prevents cache access
/// - Configuration disables queue depth checks
/// - Database query fails
///
/// Keeping "we don't know" separate from `Normal` avoids false positives where
/// missing information gets interpreted as "looks clear". Consuming systems
/// should handle `Unknown` explicitly rather than treating it as healthy.
///
/// ## Ordering
///
/// Tiers compare by their numeric discriminant:
/// `Unknown < Normal < Warning < Critical < Overflow`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum QueueDepthTier {
    /// Queue depth could not be determined (an explicit "we don't know")
    #[default]
    Unknown = 0,
    /// Below the warning threshold - normal operation
    Normal = 1,
    /// Above the warning threshold but below critical
    Warning = 2,
    /// Above the critical threshold (API returns 503)
    Critical = 3,
    /// Above the overflow threshold (emergency)
    Overflow = 4,
}

impl Ord for QueueDepthTier {
    /// Total order over tiers, based on the numeric discriminant.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // `Unknown` carries "no information", so it sorts below `Normal`:
        // it never "wins" a worst-tier search, yet it also never compares
        // equal to `Normal`.
        let lhs = *self as u8;
        let rhs = *other as u8;
        lhs.cmp(&rhs)
    }
}

impl PartialOrd for QueueDepthTier {
    /// Always `Some`, delegating to the total order defined by [`Ord`].
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl QueueDepthTier {
    /// Returns `true` for every tier except `Unknown`, i.e. when the queue
    /// depth was actually evaluated.
    #[must_use]
    pub const fn is_evaluated(&self) -> bool {
        match self {
            Self::Unknown => false,
            Self::Normal | Self::Warning | Self::Critical | Self::Overflow => true,
        }
    }

    /// Returns `true` when the tier is `Warning` or anything more severe.
    #[must_use]
    pub const fn is_warning(&self) -> bool {
        match self {
            Self::Warning | Self::Critical | Self::Overflow => true,
            Self::Unknown | Self::Normal => false,
        }
    }

    /// Returns `true` when the tier is `Critical` or worse.
    #[must_use]
    pub const fn is_critical(&self) -> bool {
        match self {
            Self::Critical | Self::Overflow => true,
            Self::Unknown | Self::Normal | Self::Warning => false,
        }
    }

    /// Returns `true` only for the `Overflow` tier.
    #[must_use]
    pub const fn is_overflow(&self) -> bool {
        match self {
            Self::Overflow => true,
            Self::Unknown | Self::Normal | Self::Warning | Self::Critical => false,
        }
    }
}

/// Aggregated queue depth information for every monitored queue.
///
/// The derived `Default` leaves `tier` at its own default, with a zero
/// `max_depth`, an empty `worst_queue` and no per-queue entries.
#[derive(Clone, Debug, Default)]
pub struct QueueDepthStatus {
    /// Most severe tier observed across all queues
    pub tier: QueueDepthTier,
    /// Largest depth observed across all queues
    pub max_depth: i64,
    /// Name of the queue with the worst depth
    pub worst_queue: String,
    /// Depth of each individual queue, keyed by queue name
    pub queue_depths: HashMap<String, i64>,
}

// =============================================================================
// Channel Health Types
// =============================================================================

/// Saturation status of the command channel.
///
/// ## Evaluated Field
///
/// `evaluated` records whether the channel status was actually checked:
/// - `true`: Status reflects real channel metrics
/// - `false`: Status is unknown (config disabled checks, lock contention, etc.)
///
/// This avoids false positives where "we don't know" gets interpreted as "looks clear".
///
/// ## Default
///
/// The derived `Default` is the unknown state, not a healthy one:
/// `evaluated` is `false`, every metric is zero and both flags are `false`.
#[derive(Clone, Debug, Default)]
pub struct ChannelHealthStatus {
    /// `true` when this status was really evaluated, `false` for unknown/default
    pub evaluated: bool,
    /// Saturation of the command channel as a percentage (0-100)
    pub command_saturation_percent: f64,
    /// Capacity still available in the command channel
    pub command_available_capacity: usize,
    /// Total number of messages sent through the command channel
    pub command_messages_sent: u64,
    /// Total number of overflow events on the command channel
    pub command_overflow_events: u64,
    /// Whether the command channel is saturated (>80%)
    pub is_saturated: bool,
    /// Whether the command channel is critical (>95%)
    pub is_critical: bool,
}

// =============================================================================
// Database Health Types
// =============================================================================

/// Health status of the database.
///
/// ## Evaluated Field
///
/// `evaluated` records whether the database status was actually checked:
/// - `true`: Status reflects real database/circuit breaker state
/// - `false`: Status is unknown (config disabled checks, lock contention, etc.)
///
/// This avoids false positives where "we don't know" gets interpreted as "looks clear".
///
/// The derived `Default` is that unknown state: `evaluated` is `false`, all
/// flags are `false`, counters are zero and there is no error message.
#[derive(Clone, Debug, Default)]
pub struct DatabaseHealthStatus {
    /// `true` when this status was really evaluated, `false` for unknown/default
    pub evaluated: bool,
    /// Whether the database is connected and responsive
    pub is_connected: bool,
    /// Whether the circuit breaker is currently open
    pub circuit_breaker_open: bool,
    /// Current failure count recorded on the circuit breaker
    pub circuit_breaker_failures: u32,
    /// Duration of the last successful health check (milliseconds)
    pub last_check_duration_ms: u64,
    /// Error message, present when unhealthy
    pub error_message: Option<String>,
}

// =============================================================================
// Backpressure Types
// =============================================================================

/// Identifies which condition produced backpressure.
#[derive(Clone, Debug)]
pub enum BackpressureSource {
    /// The circuit breaker is open
    CircuitBreaker,
    /// A channel's saturation exceeds its threshold
    ChannelSaturation {
        /// Name of the saturated channel
        channel: String,
        /// Saturation of that channel, as a percentage
        saturation_percent: f64,
    },
    /// A queue's depth exceeds its threshold
    QueueDepth {
        /// Name of the affected queue
        queue: String,
        /// Depth of that queue
        depth: i64,
        /// Tier the depth was classified as
        tier: QueueDepthTier,
    },
}

/// Backpressure decision that has been computed ahead of time.
///
/// The StatusEvaluator computes this value and it is cached for fast access.
/// The derived `Default` is inactive with no reason, retry hint or source.
#[derive(Clone, Debug, Default)]
pub struct BackpressureStatus {
    /// Whether backpressure is currently active
    pub active: bool,
    /// Why backpressure is active (if it is)
    pub reason: Option<String>,
    /// Suggested number of seconds to wait before retrying
    pub retry_after_secs: Option<u64>,
    /// What caused the backpressure (if active)
    pub source: Option<BackpressureSource>,
}

/// Detailed backpressure metrics for health endpoint
///
/// Aggregates all health status information for monitoring and debugging.
///
/// ## Default
///
/// The default value describes a state in which nothing has been measured:
/// all counters are zero, no backpressure is active, and the queue depth tier
/// is reported as `"Unknown"` so that missing data is never presented as a
/// healthy (`"Normal"`) queue.
#[derive(Debug, Clone)]
pub struct BackpressureMetrics {
    /// Whether the web database circuit breaker is open
    pub circuit_breaker_open: bool,
    /// Current failure count on circuit breaker
    pub circuit_breaker_failures: u32,
    /// Command channel saturation percentage (0-100)
    pub command_channel_saturation_percent: f64,
    /// Available slots in command channel
    pub command_channel_available_capacity: usize,
    /// Total messages sent through command channel
    pub command_channel_messages_sent: u64,
    /// Total overflow events on command channel
    pub command_channel_overflow_events: u64,
    /// Whether any backpressure condition is active
    pub backpressure_active: bool,
    /// Queue depth tier name (Unknown/Normal/Warning/Critical/Overflow)
    pub queue_depth_tier: String,
    /// Maximum queue depth across all queues
    pub queue_depth_max: i64,
    /// Queue with worst depth
    pub queue_depth_worst_queue: String,
    /// Individual queue depths
    pub queue_depths: HashMap<String, i64>,
}

impl Default for BackpressureMetrics {
    /// Builds metrics for the "nothing evaluated yet" state.
    fn default() -> Self {
        Self {
            circuit_breaker_open: false,
            circuit_breaker_failures: 0,
            command_channel_saturation_percent: 0.0,
            command_channel_available_capacity: 0,
            command_channel_messages_sent: 0,
            command_channel_overflow_events: 0,
            backpressure_active: false,
            // Default is "we don't know", not "looks clear": this mirrors the
            // default queue depth tier, which is `Unknown` rather than `Normal`.
            queue_depth_tier: String::from("Unknown"),
            queue_depth_max: 0,
            queue_depth_worst_queue: String::new(),
            queue_depths: HashMap::new(),
        }
    }
}

// =============================================================================
// Configuration Types
// =============================================================================

/// Configuration for health status evaluation.
///
/// Loaded from `orchestration.toml` \[health\] section.
#[derive(Clone, Debug)]
pub struct HealthConfig {
    /// Interval between background evaluations (milliseconds)
    pub evaluation_interval_ms: u64,
    /// Turns database health checks on or off
    pub check_database: bool,
    /// Turns channel health checks on or off
    pub check_channels: bool,
    /// Turns queue depth checks on or off
    pub check_queues: bool,
    /// Stale data threshold (milliseconds) - log warning if cache is older
    pub stale_threshold_ms: u64,
    /// Settings for the database health check
    pub database: DatabaseHealthConfig,
    /// Settings for the channel health check
    pub channels: ChannelHealthConfig,
    /// Settings for the queue health check
    pub queues: QueueHealthConfig,
}

impl Default for HealthConfig {
    /// All checks enabled, evaluated every 5 seconds, with a 30 second stale
    /// threshold and default settings for each individual check.
    fn default() -> Self {
        let database = DatabaseHealthConfig::default();
        let channels = ChannelHealthConfig::default();
        let queues = QueueHealthConfig::default();

        Self {
            evaluation_interval_ms: 5_000, // 5 seconds
            stale_threshold_ms: 30_000,    // 30 seconds
            check_database: true,
            check_channels: true,
            check_queues: true,
            database,
            channels,
            queues,
        }
    }
}

/// Settings for the database health check.
#[derive(Clone, Debug)]
pub struct DatabaseHealthConfig {
    /// Timeout applied to the query (milliseconds)
    pub query_timeout_ms: u64,
}

impl Default for DatabaseHealthConfig {
    /// Defaults to a query timeout of 1000 milliseconds.
    fn default() -> Self {
        let query_timeout_ms = 1_000;
        Self { query_timeout_ms }
    }
}

/// Settings for the channel health check.
#[derive(Clone, Debug)]
pub struct ChannelHealthConfig {
    /// Saturation percentage at which the warning level starts
    pub warning_threshold: f64,
    /// Saturation percentage at which the critical level starts
    pub critical_threshold: f64,
    /// Saturation percentage at which the emergency level starts
    pub emergency_threshold: f64,
}

impl Default for ChannelHealthConfig {
    /// Defaults to 70% warning, 80% critical and 95% emergency.
    fn default() -> Self {
        let (warning_threshold, critical_threshold, emergency_threshold) = (70.0, 80.0, 95.0);
        Self {
            warning_threshold,
            critical_threshold,
            emergency_threshold,
        }
    }
}

/// Settings for the queue health check.
#[derive(Clone, Debug)]
pub struct QueueHealthConfig {
    /// Message count at which the warning level starts
    pub warning_threshold: i64,
    /// Message count at which the critical level starts
    pub critical_threshold: i64,
    /// Message count at which the overflow level starts
    pub overflow_threshold: i64,
}

impl Default for QueueHealthConfig {
    /// Defaults to 1000 (warning), 5000 (critical) and 10000 (overflow) messages.
    fn default() -> Self {
        let (warning_threshold, critical_threshold, overflow_threshold) = (1_000, 5_000, 10_000);
        Self {
            warning_threshold,
            critical_threshold,
            overflow_threshold,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tiers sort strictly ascending, with `Unknown` lowest: it means
    /// "we don't know", not "looks bad".
    #[test]
    fn test_queue_depth_tier_ordering() {
        let ascending = [
            QueueDepthTier::Unknown,
            QueueDepthTier::Normal,
            QueueDepthTier::Warning,
            QueueDepthTier::Critical,
            QueueDepthTier::Overflow,
        ];

        for pair in ascending.windows(2) {
            assert!(
                pair[0] < pair[1],
                "{:?} should sort below {:?}",
                pair[0],
                pair[1]
            );
        }
    }

    /// The default tier is intentionally `Unknown` ("we don't know") rather
    /// than `Normal` ("looks clear").
    #[test]
    fn test_queue_depth_tier_default() {
        let tier = QueueDepthTier::default();
        assert_eq!(tier, QueueDepthTier::Unknown);
    }

    /// A default backpressure status is inactive and carries no details.
    #[test]
    fn test_backpressure_status_default() {
        let BackpressureStatus {
            active,
            reason,
            retry_after_secs,
            source,
        } = BackpressureStatus::default();

        assert!(!active);
        assert!(reason.is_none());
        assert!(retry_after_secs.is_none());
        assert!(source.is_none());
    }

    /// Default configuration enables every check and uses the documented
    /// interval and channel threshold.
    #[test]
    fn test_health_config_default() {
        let defaults = HealthConfig::default();

        assert_eq!(defaults.evaluation_interval_ms, 5000);
        assert_eq!(defaults.channels.critical_threshold, 80.0);
        assert!(defaults.check_database);
        assert!(defaults.check_channels);
        assert!(defaults.check_queues);
    }

    /// A default database status is explicitly unevaluated (unknown).
    #[test]
    fn test_database_health_status_default_is_unknown() {
        let db = DatabaseHealthStatus::default();

        assert!(!db.evaluated);
        assert!(!db.is_connected);
        assert!(!db.circuit_breaker_open);
    }

    /// A default channel status is explicitly unevaluated (unknown).
    #[test]
    fn test_channel_health_status_default_is_unknown() {
        let channel = ChannelHealthStatus::default();

        assert!(!channel.evaluated);
        assert!(!channel.is_saturated);
        assert!(!channel.is_critical);
    }

    /// A default queue depth status reports the `Unknown` tier.
    #[test]
    fn test_queue_depth_status_default_is_unknown() {
        let QueueDepthStatus { tier, .. } = QueueDepthStatus::default();
        assert_eq!(tier, QueueDepthTier::Unknown);
    }
}