entelix-core 0.5.4

entelix DAG root — IR, codecs, transports, Tool trait + ToolRegistry, auth, ExecutionContext, ModelInvocation/ToolInvocation Service spine, StreamAggregator
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
//! `RunBudget` — six-axis usage cap checked across one logical
//! run, including sub-agent fan-out.
//!
//! | Axis                    | Type      | Pre-call check | Post-call accumulation |
//! |-------------------------|-----------|----------------|------------------------|
//! | `request_limit`         | u32       | ✓              | accumulate on Ok       |
//! | `input_tokens_limit`    | u64       | —              | check on Ok            |
//! | `output_tokens_limit`   | u64       | —              | check on Ok            |
//! | `total_tokens_limit`    | u64       | —              | check on Ok            |
//! | `tool_calls_limit`      | u32       | ✓              | accumulate on Ok       |
//! | `cost_usd_limit`        | `Decimal` | —              | check on Ok            |
//!
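//! Pre-call axes (`request_limit`, `tool_calls_limit`) are checked
//! before the dispatch reaches the wire — the SDK knows the
//! caller is about to issue request `N+1` and refuses if the cap
//! is `N`. Token and cost axes are post-call: the budget sees the
//! response `Usage` (and the per-call charge) only after the codec
//! decodes, so the breach surfaces on the call that pushed the
//! cumulative total past the limit. Callers that can estimate a
//! call up front may additionally use the projective gates
//! [`RunBudget::check_pre_request_tokens`] and
//! [`RunBudget::check_pre_request_cost`], which compare
//! `observed + estimate` against the cap without mutating any
//! counter.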
//!
//! ## Sub-agent fan-out
//!
//! `RunBudget` carries an `Arc<RunBudgetState>` of atomic
//! counters. Cloning the budget — done implicitly when an
//! `ExecutionContext` flows into a `Subagent::execute` — bumps
//! the Arc refcount; the sub-agent's calls accumulate into the
//! same counters as the parent's. Compared to per-instance
//! counters, this is `(a)` cheaper (no message-passing between
//! parent and child runtimes) and `(b)` correct under tokio's
//! work-stealing executor (atomic ordering is the cross-task
//! synchronisation primitive).
//!
//! ## Wiring
//!
//! Operators attach a `RunBudget` to the `ExecutionContext` via
//! [`crate::ExecutionContext::with_run_budget`]. Every `ChatModel`
//! dispatch site (`complete_full`, `complete_typed`,
//! `stream_deltas`) reads the budget from `ctx`, calls
//! `check_pre_request` before the wire roundtrip, and calls
//! `observe_usage` on the `Ok` branch (invariant 12 — never on
//! the error branch, otherwise a network failure would still
//! drain the budget). A budget breach surfaces as
//! [`crate::Error::UsageLimitExceeded`] with the breaching axis
//! and observed value.

use std::sync::Arc;
use std::sync::Mutex;
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};

use rust_decimal::Decimal;
use serde::{Deserialize, Serialize};

use crate::error::{Error, Result};
use crate::ir::Usage;

/// One [`RunBudget`] axis breach — typed pair of axis-discriminator
/// and magnitude. Each variant carries the magnitude shape the axis
/// uses (`u64` count for token / request / tool-call axes,
/// [`Decimal`] USD for the cost axis), so axis-magnitude pairing is
/// type-enforced rather than runtime-validated.
///
/// Carried on [`crate::Error::UsageLimitExceeded`] and
/// `entelix_session::GraphEvent::UsageLimitExceeded`; emitted to
/// `AuditSink::record_usage_limit_exceeded` for compliance /
/// billing replay.
///
/// Serialised in serde's internally-tagged form: the variant name,
/// in snake_case, is stored under the `axis` key next to the
/// `limit` and `observed` fields. The tag values are the same
/// strings [`Self::axis_name`] returns.
///
/// The projective gates ([`RunBudget::check_pre_request_tokens`],
/// [`RunBudget::check_pre_request_cost`]) reuse the token / cost
/// variants; in that case `observed` holds the *projected* total
/// (current counter plus the caller's estimate), not a value that
/// was actually accumulated.
///
/// `non_exhaustive` so post-1.0 axes ship as MINOR. Construct via
/// the typed variants directly — `UsageLimitBreach::Requests {
/// limit, observed }` is the canonical shape.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "axis", rename_all = "snake_case")]
#[non_exhaustive]
pub enum UsageLimitBreach {
    /// Request-count cap breached. Raised by
    /// [`RunBudget::check_pre_request`] before the dispatch fires.
    Requests {
        /// Configured cap (model dispatches per run), widened from
        /// the `u32` the builder accepts.
        limit: u64,
        /// Counter value when the cap was hit; the refused request
        /// is not counted.
        observed: u64,
    },
    /// Cumulative-input-tokens cap breached. Raised post-call by
    /// [`RunBudget::observe_usage`], or pre-call by
    /// [`RunBudget::check_pre_request_tokens`].
    InputTokens {
        /// Configured cap (cumulative input tokens).
        limit: u64,
        /// Cumulative input tokens after the breaching call (or the
        /// projected total when raised by the pre-call gate).
        observed: u64,
    },
    /// Cumulative-output-tokens cap breached. Raised post-call by
    /// [`RunBudget::observe_usage`], or pre-call by
    /// [`RunBudget::check_pre_request_tokens`].
    OutputTokens {
        /// Configured cap (cumulative output tokens).
        limit: u64,
        /// Cumulative output tokens after the breaching call (or the
        /// projected total when raised by the pre-call gate).
        observed: u64,
    },
    /// Cumulative input + output tokens cap breached.
    TotalTokens {
        /// Configured cap (cumulative input + output tokens).
        limit: u64,
        /// Cumulative total after the breaching call (or the
        /// projected total when raised by the pre-call gate).
        observed: u64,
    },
    /// Tool-call-count cap breached. Raised by
    /// [`RunBudget::check_pre_tool_call`] before the tool runs.
    ToolCalls {
        /// Configured cap (tool dispatches per run), widened from
        /// the `u32` the builder accepts.
        limit: u64,
        /// Counter value when the cap was hit; the refused call is
        /// not counted.
        observed: u64,
    },
    /// USD cost cap breached. Raised post-call after
    /// [`RunBudget::observe_cost`] accumulated the per-call charge,
    /// or pre-call by [`RunBudget::check_pre_request_cost`].
    CostUsd {
        /// Configured cap in USD.
        limit: Decimal,
        /// Cumulative cost after the breaching charge (or the
        /// projected cost when raised by the pre-call gate).
        observed: Decimal,
    },
}

impl UsageLimitBreach {
    /// Machine-readable name of the breached axis.
    ///
    /// The returned string is identical to the snake-case `serde`
    /// tag of the variant, which makes it safe to use as an OTel
    /// attribute value, a dashboard dimension, or an `AuditSink`
    /// filter key. Prefer this over parsing the
    /// [`std::fmt::Display`] output: the Display rendering is meant
    /// for humans, whereas this is a stable `&'static str` that can
    /// be compared directly.
    #[must_use]
    pub const fn axis_name(&self) -> &'static str {
        // No bindings are taken, so matching on the dereferenced
        // place never moves out of `self`.
        match *self {
            Self::CostUsd { .. } => "cost_usd",
            Self::ToolCalls { .. } => "tool_calls",
            Self::TotalTokens { .. } => "total_tokens",
            Self::OutputTokens { .. } => "output_tokens",
            Self::InputTokens { .. } => "input_tokens",
            Self::Requests { .. } => "requests",
        }
    }
}

impl std::fmt::Display for UsageLimitBreach {
    /// Renders every axis with one grep-friendly shape:
    /// `run budget exceeded on <axis> axis: observed <N>, limit <N>`.
    ///
    /// Count-based axes (requests, tokens, tool calls) print `<N>`
    /// as a bare integer; the cost axis prints the `Decimal` in
    /// plain, un-prefixed form. A dashboard regex such as
    /// `observed (\S+), limit (\S+)` therefore extracts the
    /// magnitude on every axis without branching on the axis name.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Single source of truth for the message shape; generic so
        // the integer and `Decimal` magnitudes share it.
        fn render<M: std::fmt::Display>(
            out: &mut std::fmt::Formatter<'_>,
            axis: &str,
            observed: &M,
            limit: &M,
        ) -> std::fmt::Result {
            write!(
                out,
                "run budget exceeded on {axis} axis: observed {observed}, limit {limit}"
            )
        }

        let axis = self.axis_name();
        match self {
            Self::CostUsd { limit, observed } => render(f, axis, observed, limit),
            Self::Requests { limit, observed }
            | Self::InputTokens { limit, observed }
            | Self::OutputTokens { limit, observed }
            | Self::TotalTokens { limit, observed }
            | Self::ToolCalls { limit, observed } => render(f, axis, observed, limit),
        }
    }
}

/// Six-axis usage cap shared across one logical run (parent
/// agent + every sub-agent it dispatches). Cloning the budget —
/// done implicitly per `ExecutionContext` clone — bumps the
/// internal `Arc` refcount; sub-agent calls accumulate into the
/// same counters as the parent's.
///
/// Construct via [`Self::unlimited`] (every axis disabled — the
/// default) or via [`Self::default`] (alias). Set per-axis caps
/// with the `with_*_limit` builders; call sites read the
/// configured budget through
/// [`crate::ExecutionContext::run_budget`].
///
/// The limits are plain fields on each handle, while the counters
/// live behind the shared `Arc`. A builder call on one clone
/// therefore changes the cap that clone enforces, but never the
/// counters and never the caps held by other clones.
#[derive(Clone, Debug, Default)]
pub struct RunBudget {
    // For every `*_limit` field, `None` means the axis is unbounded.
    // Cap on model dispatches (checked pre-call).
    request_limit: Option<u32>,
    // Cap on cumulative input tokens.
    input_tokens_limit: Option<u64>,
    // Cap on cumulative output tokens.
    output_tokens_limit: Option<u64>,
    // Cap on cumulative input + output tokens.
    total_tokens_limit: Option<u64>,
    // Cap on tool dispatches (checked pre-call).
    tool_calls_limit: Option<u32>,
    // Cap on cumulative USD cost.
    cost_usd_limit: Option<Decimal>,
    // Live counters, shared by every clone of this budget.
    state: Arc<RunBudgetState>,
}

/// Live counters behind a [`RunBudget`]. One instance is shared
/// (through an `Arc`) by every clone of the budget, so all methods
/// mutate it through `&self`. `Default` starts every counter at
/// zero.
///
/// There is no separate total-tokens counter: the total is derived
/// as `input_tokens + output_tokens` wherever it is needed.
#[derive(Debug, Default)]
struct RunBudgetState {
    // Model dispatches admitted by `check_pre_request`.
    requests: AtomicU32,
    // Cumulative input tokens added by `observe_usage`.
    input_tokens: AtomicU64,
    // Cumulative output tokens added by `observe_usage`.
    output_tokens: AtomicU64,
    // Tool dispatches admitted by `check_pre_tool_call`.
    tool_calls: AtomicU32,
    // Cumulative USD cost added by `observe_cost`. Kept under a
    // mutex because std has no atomic type for `Decimal`.
    cost_usd: Mutex<Decimal>,
}

impl RunBudget {
    /// Build with every axis disabled. Caps are set via
    /// `with_*_limit` chained calls.
    #[must_use]
    pub fn unlimited() -> Self {
        Self::default()
    }

    /// Cap the number of model dispatches per run. Pre-call
    /// check — the SDK refuses request `N+1` when the cap is
    /// `N`, before the wire roundtrip (see
    /// [`Self::check_pre_request`]).
    ///
    /// Replaces any cap previously set on this handle; the shared
    /// counters are left untouched.
    #[must_use]
    pub const fn with_request_limit(mut self, n: u32) -> Self {
        self.request_limit = Some(n);
        self
    }

    /// Cap cumulative input tokens. Post-call check — the SDK
    /// observes `response.usage.input_tokens` after the codec
    /// decodes and surfaces a breach on the call that pushed the
    /// running total past the cap (see [`Self::observe_usage`]).
    ///
    /// Replaces any cap previously set on this handle; the shared
    /// counters are left untouched.
    #[must_use]
    pub const fn with_input_tokens_limit(mut self, n: u64) -> Self {
        self.input_tokens_limit = Some(n);
        self
    }

    /// Cap cumulative output tokens. Post-call check, enforced by
    /// [`Self::observe_usage`] on the call that pushes the running
    /// total past the cap.
    ///
    /// Replaces any cap previously set on this handle; the shared
    /// counters are left untouched.
    #[must_use]
    pub const fn with_output_tokens_limit(mut self, n: u64) -> Self {
        self.output_tokens_limit = Some(n);
        self
    }

    /// Cap cumulative input + output tokens. Post-call. Lets
    /// operators set one ceiling without splitting the
    /// per-direction caps. The total is derived from the input and
    /// output counters; it is not tracked separately.
    ///
    /// Replaces any cap previously set on this handle; the shared
    /// counters are left untouched.
    #[must_use]
    pub const fn with_total_tokens_limit(mut self, n: u64) -> Self {
        self.total_tokens_limit = Some(n);
        self
    }

    /// Cap the number of tool dispatches per run. Pre-call check
    /// — the SDK refuses tool call `N+1` when the cap is `N`,
    /// before the dispatched tool's [`crate::tools::Tool::execute`]
    /// runs (see [`Self::check_pre_tool_call`]).
    ///
    /// Replaces any cap previously set on this handle; the shared
    /// counters are left untouched.
    #[must_use]
    pub const fn with_tool_calls_limit(mut self, n: u32) -> Self {
        self.tool_calls_limit = Some(n);
        self
    }

    /// Cap cumulative USD cost across the run. Operators wire a
    /// [`crate::BudgetCostEstimator`] (the same one
    /// `entelix-policy::CostMeter` implements); the dispatch site
    /// calls [`Self::observe_cost`] on the `Ok` branch with the
    /// per-call charge, and the budget surfaces a breach when the
    /// running total crosses `limit`. Decimal precision matches the
    /// cost estimator's `rust_decimal` precision.
    ///
    /// Replaces any cap previously set on this handle; the shared
    /// counters are left untouched. The matching getter is
    /// [`Self::cost_usd_limit`].
    #[must_use]
    pub const fn with_cost_limit_usd(mut self, limit: Decimal) -> Self {
        self.cost_usd_limit = Some(limit);
        self
    }

    /// Pre-request gate — checks the request-count cap and, on
    /// success, increments the request counter. Call from the
    /// dispatch site **before** the wire roundtrip. Returns
    /// `Error::UsageLimitExceeded(UsageLimitBreach::Requests {.})`
    /// when the cap is hit; the counter is not incremented on
    /// failure (the request did not actually fire).
    pub fn check_pre_request(&self) -> Result<()> {
        if let Some(limit) = self.request_limit {
            // Atomic compare-and-swap loop: read current, refuse
            // when at-or-over cap, otherwise increment. Avoids
            // the race where two concurrent calls both pass a
            // `load() < limit` check and then both `fetch_add`,
            // overshooting the cap by one.
            loop {
                let current = self.state.requests.load(Ordering::Acquire);
                if u64::from(current) >= u64::from(limit) {
                    return Err(Error::UsageLimitExceeded(UsageLimitBreach::Requests {
                        limit: u64::from(limit),
                        observed: u64::from(current),
                    }));
                }
                if self
                    .state
                    .requests
                    .compare_exchange_weak(
                        current,
                        current.saturating_add(1),
                        Ordering::AcqRel,
                        Ordering::Acquire,
                    )
                    .is_ok()
                {
                    return Ok(());
                }
            }
        }
        Ok(())
    }

    /// Configured request-count cap. `None` when the axis is
    /// unbounded. Operators integrating custom pre-call gates read
    /// this alongside [`Self::snapshot`] to compute their own
    /// projected counter.
    ///
    /// Reflects the cap held by this handle, as last set through
    /// [`Self::with_request_limit`].
    #[must_use]
    pub const fn request_limit(&self) -> Option<u32> {
        self.request_limit
    }

    /// Configured input-tokens cap, as last set through
    /// [`Self::with_input_tokens_limit`]. `None` when the axis is
    /// unbounded.
    #[must_use]
    pub const fn input_tokens_limit(&self) -> Option<u64> {
        self.input_tokens_limit
    }

    /// Configured output-tokens cap, as last set through
    /// [`Self::with_output_tokens_limit`]. `None` when the axis is
    /// unbounded.
    #[must_use]
    pub const fn output_tokens_limit(&self) -> Option<u64> {
        self.output_tokens_limit
    }

    /// Configured combined (input + output) tokens cap, as last set
    /// through [`Self::with_total_tokens_limit`]. `None` when the
    /// axis is unbounded.
    #[must_use]
    pub const fn total_tokens_limit(&self) -> Option<u64> {
        self.total_tokens_limit
    }

    /// Configured tool-call-count cap, as last set through
    /// [`Self::with_tool_calls_limit`]. `None` when the axis is
    /// unbounded.
    #[must_use]
    pub const fn tool_calls_limit(&self) -> Option<u32> {
        self.tool_calls_limit
    }

    /// Configured USD-cost cap, as last set through
    /// [`Self::with_cost_limit_usd`]. `None` when the axis is
    /// unbounded. Returned by value.
    #[must_use]
    pub const fn cost_usd_limit(&self) -> Option<Decimal> {
        self.cost_usd_limit
    }

    /// Pre-call token gate — compares `(observed + estimate)` against
    /// each token axis cap and surfaces a breach before the wire
    /// roundtrip fires. Operators compute the estimate from a
    /// [`crate::TokenCounter`] over the prompt; the projected
    /// `output_tokens` is typically `ModelRequest::max_tokens`
    /// (worst-case) or a conservative heuristic when the request
    /// leaves `max_tokens` unset.
    ///
    /// Increments no counters — the post-call
    /// [`Self::observe_usage`] is the only mutation path
    /// (invariant 12: a failed call never drains the budget). The
    /// pre-call gate is purely projective and idempotent, so a
    /// retry attempt re-evaluates against the current observed
    /// total without double-charging.
    ///
    /// When multiple axes would breach, the first to fire wins in
    /// the order `InputTokens` → `OutputTokens` → `TotalTokens`,
    /// matching [`Self::observe_usage`].
    pub fn check_pre_request_tokens(
        &self,
        estimated_input: u64,
        estimated_output: u64,
    ) -> Result<()> {
        let observed_in = self.state.input_tokens.load(Ordering::Acquire);
        let observed_out = self.state.output_tokens.load(Ordering::Acquire);
        let projected_in = observed_in.saturating_add(estimated_input);
        let projected_out = observed_out.saturating_add(estimated_output);
        if let Some(limit) = self.input_tokens_limit
            && projected_in > limit
        {
            return Err(Error::UsageLimitExceeded(UsageLimitBreach::InputTokens {
                limit,
                observed: projected_in,
            }));
        }
        if let Some(limit) = self.output_tokens_limit
            && projected_out > limit
        {
            return Err(Error::UsageLimitExceeded(UsageLimitBreach::OutputTokens {
                limit,
                observed: projected_out,
            }));
        }
        if let Some(limit) = self.total_tokens_limit {
            let projected_total = projected_in.saturating_add(projected_out);
            if projected_total > limit {
                return Err(Error::UsageLimitExceeded(UsageLimitBreach::TotalTokens {
                    limit,
                    observed: projected_total,
                }));
            }
        }
        Ok(())
    }

    /// Pre-call cost gate — compares `(observed + estimated_charge)`
    /// against `cost_usd_limit` and surfaces a breach before the
    /// wire roundtrip fires. Operators wire a
    /// [`crate::BudgetCostEstimator`] to compute the worst-case charge
    /// from `(ModelRequest, vendor tariff)` and thread the result
    /// here.
    ///
    /// Increments no counters; [`Self::observe_cost`] on the `Ok`
    /// branch of the dispatch is the only mutation path. The pre
    /// gate is conservative — false-positive rejections are
    /// recoverable (the operator surfaces a budget-soon error to
    /// the caller), silent cap overrun is not (invariant 15).
    pub fn check_pre_request_cost(&self, estimated_charge: Decimal) -> Result<()> {
        if let Some(limit) = self.cost_usd_limit {
            let observed = *self
                .state
                .cost_usd
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            let projected = observed.saturating_add(estimated_charge);
            if projected > limit {
                return Err(Error::UsageLimitExceeded(UsageLimitBreach::CostUsd {
                    limit,
                    observed: projected,
                }));
            }
        }
        Ok(())
    }

    /// Pre-tool-call gate — same shape as
    /// [`Self::check_pre_request`] but for the `tool_calls_limit`
    /// axis. Call from the tool dispatch site
    /// (`ToolRegistry::dispatch_*`) **before** the tool's
    /// `execute` runs.
    pub fn check_pre_tool_call(&self) -> Result<()> {
        if let Some(limit) = self.tool_calls_limit {
            loop {
                let current = self.state.tool_calls.load(Ordering::Acquire);
                if u64::from(current) >= u64::from(limit) {
                    return Err(Error::UsageLimitExceeded(UsageLimitBreach::ToolCalls {
                        limit: u64::from(limit),
                        observed: u64::from(current),
                    }));
                }
                if self
                    .state
                    .tool_calls
                    .compare_exchange_weak(
                        current,
                        current.saturating_add(1),
                        Ordering::AcqRel,
                        Ordering::Acquire,
                    )
                    .is_ok()
                {
                    return Ok(());
                }
            }
        }
        Ok(())
    }

    /// Post-call accumulation — adds the observed usage to the
    /// token counters and surfaces a breach on the axis that
    /// crossed its cap. Call from the dispatch site on the
    /// **`Ok` branch only** (invariant 12 — failed calls never
    /// drain the budget).
    ///
    /// When multiple axes breach simultaneously (e.g. a single
    /// large response trips both `output_tokens_limit` and
    /// `total_tokens_limit`), the function reports the first
    /// axis it encounters in the order `InputTokens` →
    /// `OutputTokens` → `TotalTokens`. Operators that need
    /// every breach surface attach observers via
    /// [`Self::snapshot`].
    pub fn observe_usage(&self, usage: &Usage) -> Result<()> {
        let new_in = self
            .state
            .input_tokens
            .fetch_add(u64::from(usage.input_tokens), Ordering::AcqRel)
            .saturating_add(u64::from(usage.input_tokens));
        let new_out = self
            .state
            .output_tokens
            .fetch_add(u64::from(usage.output_tokens), Ordering::AcqRel)
            .saturating_add(u64::from(usage.output_tokens));
        if let Some(limit) = self.input_tokens_limit
            && new_in > limit
        {
            return Err(Error::UsageLimitExceeded(UsageLimitBreach::InputTokens {
                limit,
                observed: new_in,
            }));
        }
        if let Some(limit) = self.output_tokens_limit
            && new_out > limit
        {
            return Err(Error::UsageLimitExceeded(UsageLimitBreach::OutputTokens {
                limit,
                observed: new_out,
            }));
        }
        if let Some(limit) = self.total_tokens_limit {
            let total = new_in.saturating_add(new_out);
            if total > limit {
                return Err(Error::UsageLimitExceeded(UsageLimitBreach::TotalTokens {
                    limit,
                    observed: total,
                }));
            }
        }
        Ok(())
    }

    /// Post-call cost accumulation — adds the per-call USD charge
    /// to the running total and surfaces a breach if it crossed
    /// `cost_usd_limit`. Call from the dispatch site on the **`Ok`
    /// branch only** (invariant 12 — failed calls never drain the
    /// budget). Operators integrate by computing the charge from a
    /// [`crate::BudgetCostEstimator`] and threading the result here.
    pub fn observe_cost(&self, charge_usd: Decimal) -> Result<()> {
        let observed = {
            let mut accumulated = self
                .state
                .cost_usd
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            *accumulated = accumulated.saturating_add(charge_usd);
            *accumulated
        };
        if let Some(limit) = self.cost_usd_limit
            && observed > limit
        {
            return Err(Error::UsageLimitExceeded(UsageLimitBreach::CostUsd {
                limit,
                observed,
            }));
        }
        Ok(())
    }

    /// Snapshot the current counter state. Returns owned values
    /// at a single point in time; subsequent mutations on the
    /// budget do not affect the returned snapshot. Used by
    /// [`crate::ExecutionContext`] consumers and by the
    /// `AgentRunResult<S>` envelope (B-5) to expose the final
    /// usage to callers without leaking the live `Arc`.
    #[must_use]
    pub fn snapshot(&self) -> UsageSnapshot {
        let cost_usd = *self
            .state
            .cost_usd
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        UsageSnapshot {
            requests: self.state.requests.load(Ordering::Acquire),
            input_tokens: self.state.input_tokens.load(Ordering::Acquire),
            output_tokens: self.state.output_tokens.load(Ordering::Acquire),
            tool_calls: self.state.tool_calls.load(Ordering::Acquire),
            cost_usd,
        }
    }
}

/// Frozen snapshot of [`RunBudget`] counters at one point in
/// time. Carried in `AgentRunResult<S>::usage` (B-5) so callers
/// see the final tally without needing to clone the budget.
///
/// Plain owned data (`Copy`), detached from the live counters.
/// Produced by [`RunBudget::snapshot`]; `Default` yields an
/// all-zero snapshot. The combined token count is not stored —
/// use [`Self::total_tokens`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct UsageSnapshot {
    /// Total model dispatches the run made (requests admitted by
    /// [`RunBudget::check_pre_request`]).
    pub requests: u32,
    /// Cumulative input tokens.
    pub input_tokens: u64,
    /// Cumulative output tokens.
    pub output_tokens: u64,
    /// Total tool dispatches (calls admitted by
    /// [`RunBudget::check_pre_tool_call`]).
    pub tool_calls: u32,
    /// Cumulative USD cost across the run (sum of every
    /// `observe_cost` charge). Operators that don't wire a
    /// [`crate::BudgetCostEstimator`] see [`Decimal::ZERO`].
    pub cost_usd: Decimal,
}

impl UsageSnapshot {
    /// Combined token count: [`Self::input_tokens`] plus
    /// [`Self::output_tokens`]. The addition saturates, so the
    /// result clamps at `u64::MAX` instead of overflowing.
    #[must_use]
    pub const fn total_tokens(&self) -> u64 {
        let (input, output) = (self.input_tokens, self.output_tokens);
        input.saturating_add(output)
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::ir::Usage;

    #[test]
    fn unlimited_budget_passes_every_check() {
        // No caps configured: both pre-call gates and a large
        // usage observation must all succeed.
        let uncapped = RunBudget::unlimited();
        uncapped.check_pre_request().unwrap();
        uncapped.check_pre_tool_call().unwrap();
        let heavy_turn = Usage::new(1_000_000, 1_000_000);
        uncapped.observe_usage(&heavy_turn).unwrap();
    }

    #[test]
    fn request_limit_pre_check_increments_then_breaks() {
        // Cap of two: both slots can be claimed, and the third
        // pre-check is refused with `observed` equal to the cap.
        let capped = RunBudget::unlimited().with_request_limit(2);
        for _ in 0..2 {
            capped.check_pre_request().unwrap();
        }
        match capped.check_pre_request().unwrap_err() {
            Error::UsageLimitExceeded(UsageLimitBreach::Requests {
                limit: 2,
                observed: 2,
            }) => {}
            other => panic!("unexpected: {other:?}"),
        }
        // The refused attempt left the counter sitting at the cap.
        assert_eq!(capped.snapshot().requests, 2);
    }

    #[test]
    fn tool_calls_limit_pre_check_breaks() {
        // One tool call allowed; the second pre-check must breach
        // on the tool-calls axis.
        let single_call = RunBudget::unlimited().with_tool_calls_limit(1);
        single_call.check_pre_tool_call().unwrap();
        let rejection = single_call.check_pre_tool_call().unwrap_err();
        assert!(matches!(
            rejection,
            Error::UsageLimitExceeded(UsageLimitBreach::ToolCalls { .. })
        ));
    }

    #[test]
    fn input_tokens_limit_post_observe_breaks() {
        // 50 + 60 input tokens against a cap of 100: the second
        // observation breaches and reports the summed total.
        let capped = RunBudget::unlimited().with_input_tokens_limit(100);
        let first_turn = Usage::new(50, 0);
        capped.observe_usage(&first_turn).unwrap();
        let second_turn = Usage::new(60, 0);
        match capped.observe_usage(&second_turn).unwrap_err() {
            Error::UsageLimitExceeded(UsageLimitBreach::InputTokens {
                limit: 100,
                observed: 110,
            }) => {}
            other => panic!("unexpected: {other:?}"),
        }
    }

    #[test]
    fn output_tokens_limit_post_observe_breaks() {
        // 99 output tokens fit under the cap of 100; two more tip
        // the running total over it.
        let capped = RunBudget::unlimited().with_output_tokens_limit(100);
        let under_cap = Usage::new(0, 99);
        capped.observe_usage(&under_cap).unwrap();
        let tipping = Usage::new(0, 2);
        let rejection = capped.observe_usage(&tipping).unwrap_err();
        assert!(matches!(
            rejection,
            Error::UsageLimitExceeded(UsageLimitBreach::OutputTokens { .. })
        ));
    }

    #[test]
    fn total_tokens_limit_combines_input_and_output() {
        // The total axis sums both directions: 80 tokens after the
        // first turn, 120 after the second — over the cap of 100.
        let capped = RunBudget::unlimited().with_total_tokens_limit(100);
        let first_turn = Usage::new(40, 40);
        capped.observe_usage(&first_turn).unwrap();
        let second_turn = Usage::new(20, 20);
        match capped.observe_usage(&second_turn).unwrap_err() {
            Error::UsageLimitExceeded(UsageLimitBreach::TotalTokens {
                limit: 100,
                observed: 120,
            }) => {}
            other => panic!("unexpected: {other:?}"),
        }
    }

    #[test]
    fn cost_usd_limit_post_observe_breaks() {
        use rust_decimal::Decimal;
        // Every amount in this test is whole cents (scale 2).
        let cents = |amount: i64| Decimal::new(amount, 2);
        let ceiling = cents(50); // $0.50
        let metered = RunBudget::unlimited().with_cost_limit_usd(ceiling);
        metered.observe_cost(cents(30)).unwrap(); // $0.30
        let (limit, observed) = match metered.observe_cost(cents(25)).unwrap_err() {
            Error::UsageLimitExceeded(UsageLimitBreach::CostUsd { limit, observed }) => {
                (limit, observed)
            }
            other => panic!("unexpected: {other:?}"),
        };
        assert_eq!(limit, ceiling);
        assert_eq!(observed, cents(55)); // $0.55
        // The breaching charge is still reflected in the snapshot.
        assert_eq!(metered.snapshot().cost_usd, cents(55));
    }

    #[test]
    fn cost_unlimited_accumulates_without_breaching() {
        use rust_decimal::Decimal;
        // No cost cap: charges never breach but are still summed
        // into the snapshot.
        let uncapped = RunBudget::unlimited();
        // $1.00, then $2.00
        for cents in [100, 200] {
            uncapped.observe_cost(Decimal::new(cents, 2)).unwrap();
        }
        let expected_total = Decimal::new(300, 2);
        assert_eq!(uncapped.snapshot().cost_usd, expected_total);
    }

    #[test]
    fn clone_shares_atomic_state() {
        // Sub-agent fan-out invariant: a cloned budget is another
        // handle onto the same counters (shared via Arc), so the
        // parent and the sub-agent draw down one logical run.
        let origin = RunBudget::unlimited().with_request_limit(2);
        let fanned_out = origin.clone();
        origin.check_pre_request().unwrap();
        fanned_out.check_pre_request().unwrap();
        // Two slots consumed across both handles — a third
        // pre-check on either one breaches.
        let rejection = origin.check_pre_request().unwrap_err();
        assert!(matches!(
            rejection,
            Error::UsageLimitExceeded(UsageLimitBreach::Requests { .. })
        ));
    }

    #[test]
    fn cost_clone_shares_arc_state() {
        // Cost counterpart of the clone-sharing invariant: a
        // parent and its clone feed one `Mutex<Decimal>`
        // accumulator through the shared `Arc`, so charges from
        // either handle add up to a single logical-run total and
        // the cap trips on whichever handle crosses it (audit gap
        // noted in post-S104 review).
        use rust_decimal::Decimal;
        let cents = |amount: i64| Decimal::new(amount, 2);
        let ceiling = cents(100); // $1.00
        let parent = RunBudget::unlimited().with_cost_limit_usd(ceiling);
        let child = parent.clone();
        parent.observe_cost(cents(60)).unwrap(); // $0.60
        child.observe_cost(cents(30)).unwrap(); // $0.30 — total $0.90, under
        let (limit, observed) = match child.observe_cost(cents(20)).unwrap_err() {
            Error::UsageLimitExceeded(UsageLimitBreach::CostUsd { limit, observed }) => {
                (limit, observed)
            }
            other => panic!("unexpected: {other:?}"),
        };
        assert_eq!(limit, ceiling);
        // $0.60 + $0.30 + $0.20 = $1.10
        assert_eq!(observed, cents(110));
        // Both handles report the same accumulated total.
        for handle in [&parent, &child] {
            assert_eq!(handle.snapshot().cost_usd, cents(110));
        }
    }

    // ── pre-call gates (Slice A) ────────────────────────────────

    #[test]
    fn limit_accessors_reflect_configuration() {
        // Each `with_*` builder value must read back through the
        // matching accessor.
        let cost_cap = Decimal::new(150, 2);
        let configured = RunBudget::unlimited()
            .with_request_limit(10)
            .with_input_tokens_limit(100)
            .with_output_tokens_limit(50)
            .with_total_tokens_limit(140)
            .with_tool_calls_limit(5)
            .with_cost_limit_usd(cost_cap);
        assert_eq!(configured.request_limit(), Some(10));
        assert_eq!(configured.input_tokens_limit(), Some(100));
        assert_eq!(configured.output_tokens_limit(), Some(50));
        assert_eq!(configured.total_tokens_limit(), Some(140));
        assert_eq!(configured.tool_calls_limit(), Some(5));
        assert_eq!(configured.cost_usd_limit(), Some(cost_cap));
        // A budget with no builder calls reports `None`.
        let bare = RunBudget::unlimited();
        assert_eq!(bare.request_limit(), None);
        assert_eq!(bare.cost_usd_limit(), None);
    }

    #[test]
    fn check_pre_request_cost_blocks_when_estimate_overshoots() {
        let ceiling = Decimal::new(100, 2); // $1.00
        let metered = RunBudget::unlimited().with_cost_limit_usd(ceiling);
        metered.observe_cost(Decimal::new(98, 2)).unwrap(); // $0.98 observed
        // An estimate of $0.05 would land on $1.03 — over the cap.
        let estimate = Decimal::new(5, 2);
        let (limit, observed) = match metered.check_pre_request_cost(estimate).unwrap_err() {
            Error::UsageLimitExceeded(UsageLimitBreach::CostUsd { limit, observed }) => {
                (limit, observed)
            }
            other => panic!("unexpected: {other:?}"),
        };
        assert_eq!(limit, ceiling);
        assert_eq!(observed, Decimal::new(103, 2));
        // Pre-call check is read-only — the accumulator still
        // holds only the observed $0.98.
        assert_eq!(metered.snapshot().cost_usd, Decimal::new(98, 2));
    }

    #[test]
    fn check_pre_request_cost_passes_when_estimate_fits() {
        // $0.50 spent plus a $0.30 estimate stays under the $1.00
        // cap, and the check itself charges nothing.
        let already_spent = Decimal::new(50, 2);
        let metered = RunBudget::unlimited().with_cost_limit_usd(Decimal::new(100, 2));
        metered.observe_cost(already_spent).unwrap();
        metered.check_pre_request_cost(Decimal::new(30, 2)).unwrap();
        assert_eq!(metered.snapshot().cost_usd, already_spent);
    }

    #[test]
    fn check_pre_request_cost_no_op_when_axis_unbounded() {
        // Without a cost cap even a ten-million-dollar estimate
        // is accepted.
        let uncapped = RunBudget::unlimited();
        let huge_estimate = Decimal::new(10_000_000, 0);
        uncapped.check_pre_request_cost(huge_estimate).unwrap();
    }

    #[test]
    fn check_pre_request_tokens_blocks_on_input_axis() {
        // 80 input tokens already observed; an estimate of 30 more
        // projects to 110 against a cap of 100.
        let capped = RunBudget::unlimited().with_input_tokens_limit(100);
        capped.observe_usage(&Usage::new(80, 0)).unwrap();
        let (limit, observed) = match capped.check_pre_request_tokens(30, 0).unwrap_err() {
            Error::UsageLimitExceeded(UsageLimitBreach::InputTokens { limit, observed }) => {
                (limit, observed)
            }
            other => panic!("unexpected: {other:?}"),
        };
        assert_eq!(limit, 100);
        assert_eq!(observed, 110);
        // Pre-call check is read-only: the counter still shows
        // only what was observed.
        assert_eq!(capped.snapshot().input_tokens, 80);
    }

    #[test]
    fn check_pre_request_tokens_blocks_on_output_axis() {
        // 80 output tokens already observed; an estimate of 30
        // more would exceed the cap of 100.
        let capped = RunBudget::unlimited().with_output_tokens_limit(100);
        let prior_turn = Usage::new(0, 80);
        capped.observe_usage(&prior_turn).unwrap();
        let err = capped.check_pre_request_tokens(0, 30).unwrap_err();
        assert!(
            matches!(
                err,
                Error::UsageLimitExceeded(UsageLimitBreach::OutputTokens { .. })
            ),
            "got: {err:?}"
        );
    }

    #[test]
    fn check_pre_request_tokens_blocks_on_total_axis() {
        // 100 tokens observed in total; an estimate of 40 + 40
        // projects to 180 against a cap of 150.
        let capped = RunBudget::unlimited().with_total_tokens_limit(150);
        capped.observe_usage(&Usage::new(50, 50)).unwrap();
        let (limit, observed) = match capped.check_pre_request_tokens(40, 40).unwrap_err() {
            Error::UsageLimitExceeded(UsageLimitBreach::TotalTokens { limit, observed }) => {
                (limit, observed)
            }
            other => panic!("unexpected: {other:?}"),
        };
        assert_eq!(limit, 150);
        assert_eq!(observed, 180);
    }

    #[test]
    fn check_pre_request_tokens_input_fires_before_total() {
        // Both the input axis (40 + 20 > 50) and the total axis
        // (40 + 20 + 20 > 60) would breach here. The reported
        // breach must follow the deterministic order
        // input → output → total, so consumers branching on the
        // first breach get a stable signal.
        let capped = RunBudget::unlimited()
            .with_input_tokens_limit(50)
            .with_total_tokens_limit(60);
        let prior_turn = Usage::new(40, 0);
        capped.observe_usage(&prior_turn).unwrap();
        let rejection = capped.check_pre_request_tokens(20, 20).unwrap_err();
        assert!(matches!(
            rejection,
            Error::UsageLimitExceeded(UsageLimitBreach::InputTokens { .. })
        ));
    }

    #[test]
    fn check_pre_request_tokens_no_op_when_all_axes_unbounded() {
        // With no token caps, even estimates of half of `u64::MAX`
        // on each axis are accepted.
        let uncapped = RunBudget::unlimited();
        let half_range = u64::MAX / 2;
        uncapped
            .check_pre_request_tokens(half_range, half_range)
            .unwrap();
    }

    #[test]
    fn snapshot_returns_owned_values() {
        // `with_request_limit` so the pre-call CAS actually
        // increments — `check_pre_request` early-returns when no
        // cap is set (`unlimited` alone leaves every counter at
        // zero, hiding the snapshot's frozen-at-call contract).
        let budget = RunBudget::unlimited().with_request_limit(100);
        budget.check_pre_request().unwrap();
        budget.observe_usage(&Usage::new(10, 5)).unwrap();
        let snap = budget.snapshot();
        assert_eq!(snap.requests, 1);
        assert_eq!(snap.input_tokens, 10);
        assert_eq!(snap.output_tokens, 5);
        assert_eq!(snap.total_tokens(), 15);
        // Mutate the live budget again and confirm the mutation
        // really landed — without this the frozen-snapshot check
        // below would pass even if nothing had changed.
        budget.check_pre_request().unwrap();
        assert_eq!(budget.snapshot().requests, 2);
        // Subsequent mutations don't reflect on the snapshot.
        assert_eq!(snap.requests, 1);
    }
}