1use prometheus_client::encoding::EncodeLabelSet;
25use prometheus_client::metrics::counter::Counter;
26use prometheus_client::metrics::family::Family;
27use prometheus_client::metrics::gauge::Gauge;
28use prometheus_client::metrics::histogram::{exponential_buckets, Histogram};
29pub use prometheus_client::registry::Registry;
30
31#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
37pub struct StateLabel {
38 pub state: String,
39}
40
41#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
43pub struct TaskLabel {
44 pub state: String,
45 pub command_class: String,
46}
47
48#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
50pub struct GateLabel {
51 pub gate: String,
52}
53
54#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
56pub struct CommandLabel {
57 pub command_class: String,
58 pub exit_reason: String,
59}
60
61#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
63pub struct CommandClassLabel {
64 pub command_class: String,
65}
66
67#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
69pub struct TickStageLabel {
70 pub stage: String,
71}
72
73#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
75pub struct CommandOverheadLabel {
76 pub command_class: String,
77 pub phase: String,
78}
79
80#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
82pub struct StoreOpLabel {
83 pub operation: String,
84}
85
86#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
87pub struct RunMetricLabel {
88 pub run_id: String,
89 pub metric: String,
90}
91
92#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
94pub struct MergeOutcomeLabel {
95 pub outcome: String,
96}
97
98#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
100pub struct ConflictTypeLabel {
101 pub conflict_type: String,
102}
103
104#[derive(Clone, Debug)]
112pub struct YarliMetrics {
113 pub queue_depth: Gauge,
116 pub queue_lease_timeouts_total: Counter,
118
119 pub scheduler_tick_duration_seconds: Family<TickStageLabel, Histogram>,
122
123 pub runs_total: Family<StateLabel, Counter>,
126
127 pub tasks_total: Family<TaskLabel, Counter>,
130
131 pub gate_failures_total: Family<GateLabel, Counter>,
134
135 pub commands_total: Family<CommandLabel, Counter>,
138 pub command_duration_seconds: Family<CommandClassLabel, Histogram>,
140 pub command_overhead_duration_seconds: Family<CommandOverheadLabel, Histogram>,
142
143 pub store_duration_seconds: Family<StoreOpLabel, Histogram>,
146 pub store_slow_queries_total: Family<StoreOpLabel, Counter>,
148
149 pub worktree_state_total: Family<StateLabel, Counter>,
152
153 pub merge_attempts_total: Family<MergeOutcomeLabel, Counter>,
156 pub merge_conflicts_total: Family<ConflictTypeLabel, Counter>,
158
159 pub run_resource_usage: Family<RunMetricLabel, Gauge>,
162 pub run_token_usage: Family<RunMetricLabel, Gauge>,
164}
165
166impl YarliMetrics {
167 pub fn new(registry: &mut Registry) -> Self {
169 let queue_depth = Gauge::default();
171 registry.register(
172 "yarli_queue_depth",
173 "Current number of tasks in queue",
174 queue_depth.clone(),
175 );
176
177 let queue_lease_timeouts_total = Counter::default();
180 registry.register(
181 "yarli_queue_lease_timeouts",
182 "Total number of lease timeouts",
183 queue_lease_timeouts_total.clone(),
184 );
185
186 let scheduler_tick_duration_seconds =
188 Family::<TickStageLabel, Histogram>::new_with_constructor(|| {
189 Histogram::new(exponential_buckets(0.0001, 4.0, 8))
191 });
192 registry.register(
193 "yarli_scheduler_tick_duration_seconds",
194 "Scheduler tick duration in seconds by stage",
195 scheduler_tick_duration_seconds.clone(),
196 );
197
198 let runs_total = Family::<StateLabel, Counter>::default();
200 registry.register(
201 "yarli_runs",
202 "Total run state transitions by target state",
203 runs_total.clone(),
204 );
205
206 let tasks_total = Family::<TaskLabel, Counter>::default();
208 registry.register(
209 "yarli_tasks",
210 "Total task state transitions by state and command class",
211 tasks_total.clone(),
212 );
213
214 let gate_failures_total = Family::<GateLabel, Counter>::default();
216 registry.register(
217 "yarli_gate_failures",
218 "Total gate evaluation failures by gate name",
219 gate_failures_total.clone(),
220 );
221
222 let commands_total = Family::<CommandLabel, Counter>::default();
224 registry.register(
225 "yarli_commands",
226 "Total command executions by class and exit reason",
227 commands_total.clone(),
228 );
229
230 let command_duration_seconds =
231 Family::<CommandClassLabel, Histogram>::new_with_constructor(|| {
232 Histogram::new(exponential_buckets(0.1, 2.0, 10))
234 });
235 registry.register(
236 "yarli_command_duration_seconds",
237 "Command execution duration in seconds",
238 command_duration_seconds.clone(),
239 );
240
241 let command_overhead_duration_seconds =
242 Family::<CommandOverheadLabel, Histogram>::new_with_constructor(|| {
243 Histogram::new(exponential_buckets(0.001, 4.0, 8))
245 });
246 registry.register(
247 "yarli_command_overhead_duration_seconds",
248 "Command overhead duration in seconds by phase",
249 command_overhead_duration_seconds.clone(),
250 );
251
252 let store_duration_seconds =
254 Family::<StoreOpLabel, Histogram>::new_with_constructor(|| {
255 Histogram::new(exponential_buckets(0.001, 4.0, 8))
257 });
258 registry.register(
259 "yarli_store_duration_seconds",
260 "Store operation duration in seconds by operation",
261 store_duration_seconds.clone(),
262 );
263
264 let store_slow_queries_total = Family::<StoreOpLabel, Counter>::default();
265 registry.register(
266 "yarli_store_slow_queries",
267 "Total slow queries observed by operation",
268 store_slow_queries_total.clone(),
269 );
270
271 let worktree_state_total = Family::<StateLabel, Counter>::default();
273 registry.register(
274 "yarli_worktree_state",
275 "Total worktree state transitions by target state",
276 worktree_state_total.clone(),
277 );
278
279 let merge_attempts_total = Family::<MergeOutcomeLabel, Counter>::default();
281 registry.register(
282 "yarli_merge_attempts",
283 "Total merge attempts by outcome",
284 merge_attempts_total.clone(),
285 );
286
287 let merge_conflicts_total = Family::<ConflictTypeLabel, Counter>::default();
288 registry.register(
289 "yarli_merge_conflicts",
290 "Total merge conflicts by conflict type",
291 merge_conflicts_total.clone(),
292 );
293
294 let run_resource_usage = Family::<RunMetricLabel, Gauge>::default();
295 registry.register(
296 "yarli_run_resource_usage",
297 "Current run-level resource usage totals",
298 run_resource_usage.clone(),
299 );
300
301 let run_token_usage = Family::<RunMetricLabel, Gauge>::default();
302 registry.register(
303 "yarli_run_token_usage",
304 "Current run-level token usage totals",
305 run_token_usage.clone(),
306 );
307
308 Self {
309 queue_depth,
310 queue_lease_timeouts_total,
311 scheduler_tick_duration_seconds,
312 runs_total,
313 tasks_total,
314 gate_failures_total,
315 commands_total,
316 command_duration_seconds,
317 command_overhead_duration_seconds,
318 store_duration_seconds,
319 store_slow_queries_total,
320 worktree_state_total,
321 merge_attempts_total,
322 merge_conflicts_total,
323 run_resource_usage,
324 run_token_usage,
325 }
326 }
327
328 pub fn set_queue_depth(&self, depth: usize) {
330 self.queue_depth.set(depth as i64);
331 }
332
333 pub fn record_run_transition(&self, state: &str) {
337 self.runs_total
338 .get_or_create(&StateLabel {
339 state: state.to_string(),
340 })
341 .inc();
342 }
343
344 pub fn record_task_transition(&self, state: &str, command_class: &str) {
346 self.tasks_total
347 .get_or_create(&TaskLabel {
348 state: state.to_string(),
349 command_class: command_class.to_string(),
350 })
351 .inc();
352 }
353
354 pub fn record_gate_failure(&self, gate: &str) {
356 self.gate_failures_total
357 .get_or_create(&GateLabel {
358 gate: gate.to_string(),
359 })
360 .inc();
361 }
362
363 pub fn record_command(&self, command_class: &str, exit_reason: &str) {
365 self.commands_total
366 .get_or_create(&CommandLabel {
367 command_class: command_class.to_string(),
368 exit_reason: exit_reason.to_string(),
369 })
370 .inc();
371 }
372
373 pub fn record_command_duration(&self, command_class: &str, duration_secs: f64) {
375 self.command_duration_seconds
376 .get_or_create(&CommandClassLabel {
377 command_class: command_class.to_string(),
378 })
379 .observe(duration_secs);
380 }
381
382 pub fn record_worktree_transition(&self, state: &str) {
384 self.worktree_state_total
385 .get_or_create(&StateLabel {
386 state: state.to_string(),
387 })
388 .inc();
389 }
390
391 pub fn record_merge_attempt(&self, outcome: &str) {
393 self.merge_attempts_total
394 .get_or_create(&MergeOutcomeLabel {
395 outcome: outcome.to_string(),
396 })
397 .inc();
398 }
399
400 pub fn record_merge_conflict(&self, conflict_type: &str) {
402 self.merge_conflicts_total
403 .get_or_create(&ConflictTypeLabel {
404 conflict_type: conflict_type.to_string(),
405 })
406 .inc();
407 }
408
409 pub fn set_run_resource_usage(&self, run_id: &str, metric: &str, value: u64) {
411 self.run_resource_usage
412 .get_or_create(&RunMetricLabel {
413 run_id: run_id.to_string(),
414 metric: metric.to_string(),
415 })
416 .set(value as i64);
417 }
418
419 pub fn set_run_token_usage(&self, run_id: &str, metric: &str, value: u64) {
421 self.run_token_usage
422 .get_or_create(&RunMetricLabel {
423 run_id: run_id.to_string(),
424 metric: metric.to_string(),
425 })
426 .set(value as i64);
427 }
428
429 pub fn record_scheduler_tick_duration(&self, stage: &str, duration_secs: f64) {
431 self.scheduler_tick_duration_seconds
432 .get_or_create(&TickStageLabel {
433 stage: stage.to_string(),
434 })
435 .observe(duration_secs);
436 }
437
438 pub fn record_command_overhead_duration(
440 &self,
441 command_class: &str,
442 phase: &str,
443 duration_secs: f64,
444 ) {
445 self.command_overhead_duration_seconds
446 .get_or_create(&CommandOverheadLabel {
447 command_class: command_class.to_string(),
448 phase: phase.to_string(),
449 })
450 .observe(duration_secs);
451 }
452
453 pub fn record_store_duration(&self, operation: &str, duration_secs: f64) {
455 self.store_duration_seconds
456 .get_or_create(&StoreOpLabel {
457 operation: operation.to_string(),
458 })
459 .observe(duration_secs);
460 }
461
462 pub fn record_store_slow_query(&self, operation: &str) {
464 self.store_slow_queries_total
465 .get_or_create(&StoreOpLabel {
466 operation: operation.to_string(),
467 })
468 .inc();
469 }
470}
471
472pub fn encode_metrics(registry: &Registry) -> String {
474 let mut buf = String::new();
475 prometheus_client::encoding::text::encode(&mut buf, registry)
476 .expect("encoding to String never fails");
477 buf
478}
479
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fresh registry with all Yarli metrics registered on it.
    fn setup() -> (Registry, YarliMetrics) {
        let mut registry = Registry::default();
        let metrics = YarliMetrics::new(&mut registry);
        (registry, metrics)
    }

    // --- Registration ---

    #[test]
    fn metrics_register_without_panic() {
        let (_registry, _metrics) = setup();
    }

    #[test]
    fn encode_empty_registry() {
        let (registry, _metrics) = setup();
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_queue_depth"));
        // Counters are registered without the suffix; the encoder adds `_total`.
        assert!(output.contains("yarli_queue_lease_timeouts_total"));
    }

    // --- Queue ---

    #[test]
    fn queue_depth_gauge() {
        let (registry, metrics) = setup();
        metrics.queue_depth.set(42);
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_queue_depth 42"));
    }

    #[test]
    fn queue_depth_inc_dec() {
        let (_registry, metrics) = setup();
        metrics.queue_depth.inc();
        metrics.queue_depth.inc();
        metrics.queue_depth.dec();
        assert_eq!(metrics.queue_depth.get(), 1);
    }

    #[test]
    fn queue_lease_timeouts_counter() {
        let (registry, metrics) = setup();
        metrics.queue_lease_timeouts_total.inc();
        metrics.queue_lease_timeouts_total.inc();
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_queue_lease_timeouts_total 2"));
    }

    // --- Runs ---

    #[test]
    fn runs_total_counter() {
        let (registry, metrics) = setup();
        metrics.record_run_transition("RUN_ACTIVE");
        metrics.record_run_transition("RUN_ACTIVE");
        metrics.record_run_transition("RUN_COMPLETED");
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_runs_total"));
        assert!(output.contains("RUN_ACTIVE"));
        assert!(output.contains("RUN_COMPLETED"));
    }

    // --- Tasks ---

    #[test]
    fn tasks_total_counter() {
        let (registry, metrics) = setup();
        metrics.record_task_transition("TASK_EXECUTING", "cpu");
        metrics.record_task_transition("TASK_COMPLETE", "io");
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_tasks_total"));
        assert!(output.contains("TASK_EXECUTING"));
        assert!(output.contains("TASK_COMPLETE"));
    }

    // --- Gates ---

    #[test]
    fn gate_failures_counter() {
        let (registry, metrics) = setup();
        metrics.record_gate_failure("tests_passed");
        metrics.record_gate_failure("tests_passed");
        metrics.record_gate_failure("policy_clean");
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_gate_failures_total"));
        assert!(output.contains("tests_passed"));
        assert!(output.contains("policy_clean"));
    }

    // --- Commands ---

    #[test]
    fn commands_total_counter() {
        let (registry, metrics) = setup();
        metrics.record_command("io", "exited");
        metrics.record_command("cpu", "timed_out");
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_commands_total"));
        assert!(output.contains("exited"));
        assert!(output.contains("timed_out"));
    }

    #[test]
    fn command_duration_histogram() {
        let (registry, metrics) = setup();
        metrics.record_command_duration("io", 0.5);
        metrics.record_command_duration("io", 1.5);
        metrics.record_command_duration("cpu", 10.0);
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_command_duration_seconds"));
    }

    // --- Worktrees ---

    #[test]
    fn worktree_state_counter() {
        let (registry, metrics) = setup();
        metrics.record_worktree_transition("WT_BOUND_HOME");
        metrics.record_worktree_transition("WT_CLOSED");
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_worktree_state_total"));
        assert!(output.contains("WT_BOUND_HOME"));
    }

    // --- Merges ---

    #[test]
    fn merge_attempts_counter() {
        let (registry, metrics) = setup();
        metrics.record_merge_attempt("done");
        metrics.record_merge_attempt("conflict");
        metrics.record_merge_attempt("aborted");
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_merge_attempts_total"));
        assert!(output.contains("done"));
        assert!(output.contains("conflict"));
    }

    #[test]
    fn merge_conflicts_counter() {
        let (registry, metrics) = setup();
        metrics.record_merge_conflict("text");
        metrics.record_merge_conflict("rename_rename");
        metrics.record_merge_conflict("submodule_pointer");
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_merge_conflicts_total"));
        assert!(output.contains("text"));
        assert!(output.contains("rename_rename"));
    }

    // --- Scheduler ---

    #[test]
    fn scheduler_tick_duration_histogram() {
        let (registry, metrics) = setup();
        metrics.record_scheduler_tick_duration("scan", 0.1);
        metrics.record_scheduler_tick_duration("claim", 0.05);
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_scheduler_tick_duration_seconds"));
        assert!(output.contains("scan"));
        assert!(output.contains("claim"));
    }

    // --- Command overhead ---

    #[test]
    fn command_overhead_duration_histogram() {
        let (registry, metrics) = setup();
        metrics.record_command_overhead_duration("io", "spawn", 0.01);
        metrics.record_command_overhead_duration("cpu", "capture", 0.02);
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_command_overhead_duration_seconds"));
        assert!(output.contains("spawn"));
        assert!(output.contains("capture"));
    }

    // --- Store ---

    #[test]
    fn store_metrics() {
        let (registry, metrics) = setup();
        metrics.record_store_duration("append", 0.05);
        metrics.record_store_slow_query("query");
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_store_duration_seconds"));
        assert!(output.contains("yarli_store_slow_queries"));
        assert!(output.contains("append"));
        assert!(output.contains("query"));
    }

    // --- Per-run usage ---

    #[test]
    fn run_usage_gauges() {
        let (registry, metrics) = setup();
        metrics.set_run_resource_usage("run-1", "cpu_seconds", 7);
        metrics.set_run_token_usage("run-2", "prompt_tokens", 11);
        let output = encode_metrics(&registry);
        assert!(output.contains("yarli_run_resource_usage"));
        assert!(output.contains("yarli_run_token_usage"));
        assert!(output.contains("run-1"));
        assert!(output.contains("cpu_seconds"));
        assert!(output.contains("run-2"));
        assert!(output.contains("prompt_tokens"));
    }

    // --- Clone semantics ---

    #[test]
    fn metrics_are_cloneable_and_share_state() {
        let (_registry, metrics) = setup();
        let metrics2 = metrics.clone();
        metrics.queue_depth.set(99);
        assert_eq!(metrics2.queue_depth.get(), 99);
    }

    // --- Full encode ---

    #[test]
    fn full_encode_includes_all_metric_names() {
        let (registry, metrics) = setup();
        // Touch every metric so each one has at least one sample to encode.
        metrics.queue_depth.set(1);
        metrics.queue_lease_timeouts_total.inc();
        metrics.record_run_transition("RUN_OPEN");
        metrics.record_task_transition("TASK_OPEN", "io");
        metrics.record_gate_failure("tests_passed");
        metrics.record_command("io", "exited");
        metrics.record_command_duration("io", 1.0);
        metrics.record_worktree_transition("WT_CREATING");
        metrics.record_merge_attempt("done");
        metrics.record_merge_conflict("text");
        metrics.record_scheduler_tick_duration("scan", 0.001);
        metrics.record_command_overhead_duration("io", "spawn", 0.001);
        metrics.record_store_duration("query", 0.001);
        metrics.record_store_slow_query("append");
        metrics.set_run_resource_usage("run-1", "cpu_seconds", 1);
        metrics.set_run_token_usage("run-1", "prompt_tokens", 1);

        let output = encode_metrics(&registry);

        let expected_names = [
            "yarli_queue_depth",
            "yarli_queue_lease_timeouts_total",
            "yarli_scheduler_tick_duration_seconds",
            "yarli_runs_total",
            "yarli_tasks_total",
            "yarli_gate_failures_total",
            "yarli_commands_total",
            "yarli_command_duration_seconds",
            "yarli_command_overhead_duration_seconds",
            "yarli_store_duration_seconds",
            "yarli_store_slow_queries",
            "yarli_worktree_state_total",
            "yarli_merge_attempts_total",
            "yarli_merge_conflicts_total",
            "yarli_run_resource_usage",
            "yarli_run_token_usage",
        ];
        for name in &expected_names {
            assert!(output.contains(name), "missing metric: {name}");
        }
    }
}