use crate::repl::conversation::ConversationHistory;
use crate::session::DisplayMessage;
#[cfg(test)]
mod tests {
use super::*;
use crate::api::Usage;
use crate::repl::conversation::ConversationHistory;
fn usage_with_inputs(input_tokens: u32, output_tokens: u32) -> Usage {
Usage {
input_tokens,
output_tokens,
cache_read_input_tokens: None,
cache_creation_input_tokens: None,
}
}
#[test]
fn add_usage_saturates_at_u32_ceiling() {
// A long-running session that crosses 2^32 tokens used to wrap
// silently in release and panic in debug. Saturation keeps the
// displayed total truthful as a lower bound.
let mut state = SessionState::new("test".to_string(), ConversationHistory::new());
state.total_input_tokens = u32::MAX - 5;
state.total_output_tokens = u32::MAX - 5;
state.add_usage(&usage_with_inputs(10, 10));
assert_eq!(state.total_input_tokens, u32::MAX);
assert_eq!(state.total_output_tokens, u32::MAX);
}
#[test]
fn add_usage_normal_path_unchanged() {
// The non-saturating path keeps its previous semantics so the
// shift to `saturating_add` doesn't perturb cost reporting in
// the common case.
let mut state = SessionState::new("test".to_string(), ConversationHistory::new());
state.add_usage(&usage_with_inputs(1_000, 200));
state.add_usage(&usage_with_inputs(2_500, 600));
assert_eq!(state.total_input_tokens, 3_500);
assert_eq!(state.total_output_tokens, 800);
assert_eq!(state.peak_single_turn_input_tokens, 2_500);
}
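// A hedged addition beyond the original pair above: this sketch
// exercises the `add_usage` paths they leave uncovered, namely
// cache-counter accumulation when the provider reports the optional
// fields, and the per-call peak staying put on a smaller later turn.
#[test]
fn add_usage_tracks_cache_counters_and_peak_high_water_mark() {
    let mut state = SessionState::new("test".to_string(), ConversationHistory::new());
    state.add_usage(&Usage {
        input_tokens: 400,
        output_tokens: 50,
        cache_read_input_tokens: Some(300),
        cache_creation_input_tokens: Some(100),
    });
    // A later, smaller turn where the provider omits both cache fields.
    state.add_usage(&usage_with_inputs(150, 25));
    assert_eq!(state.total_input_tokens, 550);
    assert_eq!(state.total_output_tokens, 75);
    assert_eq!(state.total_cache_read_tokens, 300);
    assert_eq!(state.total_cache_creation_tokens, 100);
    // The high-water mark remains at the larger first turn.
    assert_eq!(state.peak_single_turn_input_tokens, 400);
}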
}
/// Manages the state of a single REPL session
#[derive(Clone)]
pub struct SessionState {
/// Unique identifier for this session
pub session_id: String,
/// Conversation history for API
pub conversation: ConversationHistory,
/// Display-friendly message history for UI
pub display_messages: Vec<DisplayMessage>,
/// Total input tokens consumed in this session.
/// Provider semantics differ:
///
/// - OpenAI Responses API: this is the **total** count, of which
/// `total_cache_read_tokens` is a subset.
/// - Anthropic Messages API: this is **uncached** new tokens only;
/// cache read/creation are tracked separately and disjoint.
///
/// `calculate_cost` normalizes this when computing the bill.
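///
/// A rough sketch of that normalization (illustrative only; the real
/// arithmetic lives in `calculate_cost`, and the rate names below are
/// assumptions, not items from this crate):
///
/// ```ignore
/// // Anthropic: the three input counters are disjoint, so billable
/// // input is their sum, each at its own rate.
/// let anthropic_input_cost = total_input_tokens as f64 * input_rate
///     + total_cache_read_tokens as f64 * cache_read_rate
///     + total_cache_creation_tokens as f64 * cache_write_rate;
///
/// // OpenAI: cache reads are a subset of the total, so the uncached
/// // portion is the difference.
/// let uncached = total_input_tokens.saturating_sub(total_cache_read_tokens);
/// let openai_input_cost = uncached as f64 * input_rate
///     + total_cache_read_tokens as f64 * cache_read_rate;
/// ```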
pub total_input_tokens: u32,
/// Total output tokens generated in this session
pub total_output_tokens: u32,
/// Tokens served from the provider prompt cache (charged at a
/// reduced rate). Both providers report this; semantics relative to
/// `total_input_tokens` differ as documented above.
pub total_cache_read_tokens: u32,
/// Tokens written to the Anthropic prompt cache (charged at a
/// premium). OpenAI does not surface a creation counter, so this
/// stays at 0 for OpenAI sessions.
pub total_cache_creation_tokens: u32,
/// Largest input-token count observed on any single API call this
/// session. Used to detect tiered-pricing cliffs (gpt-5.4/5.5
/// flip the entire session to premium rates once any prompt
/// crosses 272K input tokens). Compared against
/// `ModelInfo::premium_tier.input_threshold` in `calculate_cost`
/// so the displayed cost reflects what the provider actually
/// bills, not the standard-tier rate.
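///
/// A minimal sketch of that comparison (illustrative only; whether
/// `premium_tier` is optional and what it carries besides the
/// threshold are assumptions):
///
/// ```ignore
/// if let Some(tier) = &model_info.premium_tier {
///     if peak_single_turn_input_tokens > tier.input_threshold {
///         // Re-price the whole session at premium-tier rates.
///     }
/// }
/// ```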
///
/// All five counters above are persisted through
/// [`SessionTokenCounters`](crate::session::SessionTokenCounters)
/// so a `--resume` keeps the cost summary accurate and the cliff
/// detector remembers whether the threshold had already been
/// crossed. Session files written before persistence was added
/// default every counter to 0 via `#[serde(default)]`.
pub peak_single_turn_input_tokens: u32,
}
impl SessionState {
pub fn new(session_id: String, conversation: ConversationHistory) -> Self {
Self {
session_id,
conversation,
display_messages: Vec::new(),
total_input_tokens: 0,
total_output_tokens: 0,
total_cache_read_tokens: 0,
total_cache_creation_tokens: 0,
peak_single_turn_input_tokens: 0,
}
}
pub fn clear(&mut self, new_session_id: String) {
self.session_id = new_session_id;
self.conversation.clear();
self.display_messages.clear();
self.total_input_tokens = 0;
self.total_output_tokens = 0;
self.total_cache_read_tokens = 0;
self.total_cache_creation_tokens = 0;
self.peak_single_turn_input_tokens = 0;
}
pub fn add_usage(&mut self, usage: &crate::api::Usage) {
// `saturating_add` instead of `+=`: each counter is `u32`, and a
// session that survives across `--resume` invocations
// accumulates over multiple turns. The 4.29-billion ceiling is
// well above any realistic single session, but a wraparound
// would silently corrupt the displayed cost summary, and a
// debug build would panic. Saturating at the ceiling keeps the
// displayed total honest about "at least this many" rather
// than wrapping to a tiny number.
self.total_input_tokens = self.total_input_tokens.saturating_add(usage.input_tokens);
self.total_output_tokens = self.total_output_tokens.saturating_add(usage.output_tokens);
self.total_cache_read_tokens = self
.total_cache_read_tokens
.saturating_add(usage.cache_read_input_tokens.unwrap_or(0));
self.total_cache_creation_tokens = self
.total_cache_creation_tokens
.saturating_add(usage.cache_creation_input_tokens.unwrap_or(0));
// Per-call high-water mark on input tokens. For OpenAI, the
// figure already includes cached input (the provider's
// documented basis for the 272K premium cliff); for Anthropic,
// cache reads come on a separate counter, so this is uncached
// input only. Neither Anthropic model has a documented cliff,
// so the asymmetry doesn't matter today.
if usage.input_tokens > self.peak_single_turn_input_tokens {
self.peak_single_turn_input_tokens = usage.input_tokens;
}
}
}