1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
//! Configuration for the token optimization engine
use serde::{Deserialize, Serialize};
/// Configuration for the token optimization engine.
///
/// Controls context window budgeting, compaction triggers, summarization
/// limits, and output stream monitoring. Loaded from the `[token_optimization]`
/// section in `config.toml`.
///
/// Every field carries a serde default, so a partial (or empty) section
/// deserializes successfully; keys that are absent fall back to the default
/// stated on each field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenOptimizationConfig {
/// Whether token optimization is enabled (default: true)
#[serde(default = "default_true")]
pub enabled: bool,
/// Total context window size in tokens (default: 8192)
///
/// Should match the `num_ctx` value configured for the active Ollama model.
/// Replaced by the detected value when
/// [`auto_detect_context_window`](Self::auto_detect_context_window) succeeds.
#[serde(default = "default_context_window")]
pub context_window_tokens: u32,
/// Fraction of context window reserved for the LLM response (default: 0.25)
///
/// A value of 0.25 means 25% of the context window is kept free for output
/// generation. The remaining 75% is available for system prompt, RAG, history,
/// and tool definitions.
#[serde(default = "default_response_headroom")]
pub response_headroom_ratio: f32,
/// Fraction of context window usage that triggers compaction (default: 0.70)
///
/// When the estimated input tokens exceed this ratio of the available budget
/// (context window minus response headroom), conversation history is compacted.
///
/// Overwritten by [`apply_hardware_profile`](Self::apply_hardware_profile).
#[serde(default = "default_compaction_trigger")]
pub compaction_trigger_ratio: f32,
/// Maximum tokens for the rolling conversation summary (default: 256)
#[serde(default = "default_max_summary_tokens")]
pub max_summary_tokens: u32,
/// Maximum fraction of available budget for the system prompt (default: 0.15)
#[serde(default = "default_system_prompt_budget")]
pub system_prompt_budget_ratio: f32,
/// Maximum fraction of available budget for RAG context (default: 0.15)
#[serde(default = "default_rag_budget")]
pub rag_budget_ratio: f32,
/// Whether output stream repetition detection is enabled (default: true)
#[serde(default = "default_true")]
pub repetition_detection_enabled: bool,
/// N-gram size for repetition detection (default: 3)
#[serde(default = "default_ngram_size")]
pub repetition_ngram_size: usize,
/// Threshold ratio of repeated n-grams to trigger early stream abort (default: 0.3)
///
/// When 30% or more of recent n-grams in the output stream are repeats,
/// the stream is terminated early to prevent wasting tokens.
#[serde(default = "default_repetition_threshold")]
pub repetition_threshold: f32,
/// Maximum number of tools to send to the LLM per request (default: 8)
///
/// Overwritten by [`apply_hardware_profile`](Self::apply_hardware_profile).
#[serde(default = "default_max_tools")]
pub max_tools_per_request: usize,
/// HuggingFace tokenizer model identifier for exact token counting.
///
/// When set, the optimizer loads the specified tokenizer from the
/// HuggingFace Hub (or a local file path) for exact token counts.
/// Falls back to heuristic estimation on failure.
///
/// Examples: `"meta-llama/Llama-3.2-3B"`, `"/path/to/tokenizer.json"`
///
/// Requires the `hf-tokenizer` feature (enabled by default).
#[serde(default)]
pub tokenizer_model: Option<String>,
/// Hard cap on the dynamic output token budget (default: None = uncapped).
///
/// When set, the `recommended_max_tokens` produced by the output budget
/// calculator is clamped to this value. Useful for keeping responses
/// concise even when the context window would allow larger output.
#[serde(default)]
pub output_max_tokens: Option<u32>,
/// Repetition penalty sent to Ollama as `repeat_penalty` (default: None).
///
/// Values > 1.0 discourage the model from repeating tokens it has already
/// generated. Ollama's own default is `1.1`. Recommended: `1.1`–`1.3`.
/// `None` means the Ollama default is used.
///
/// NOTE(review): the field (and its config key) is named `frequency_penalty`
/// while this text describes Ollama's `repeat_penalty` — confirm which
/// request option the value is actually mapped to.
#[serde(default)]
pub frequency_penalty: Option<f32>,
/// Presence penalty sent to Ollama as `presence_penalty` (default: None).
///
/// Adds a constant penalty for each unique token that has appeared in the
/// output so far, promoting topic diversity. Recommended: `0.4`–`0.8`.
/// `None` means the Ollama default (`0.0`) is used.
#[serde(default)]
pub presence_penalty: Option<f32>,
/// Whether progressive tool compression is enabled (default: true).
///
/// When enabled, tool definitions that the LLM has already seen in the
/// current conversation have their descriptions stripped on subsequent
/// turns, reducing token usage while preserving the tool schema.
///
/// Overwritten by [`apply_hardware_profile`](Self::apply_hardware_profile).
#[serde(default = "default_true")]
pub progressive_tool_compression: bool,
/// Token pressure threshold for injecting conciseness directives (default: 0.7).
///
/// When the ratio of estimated input tokens to budget exceeds this value,
/// a brevity instruction is appended to the system prompt. At > 0.9,
/// an additional structured-format hint is injected.
#[serde(default = "default_conciseness_threshold")]
pub conciseness_pressure_threshold: f32,
/// Maximum tokens for tool result content in historical messages (default: 100).
///
/// Tool results from previous turns are truncated to this budget using
/// extractive summarization (key JSON fields, priority text lines).
/// Current-turn tool results are left intact so the LLM can reason
/// over the full output.
#[serde(default = "default_tool_result_max_tokens")]
pub tool_result_max_tokens: u32,
/// Token budget for conversation history windowing (default: auto-computed).
///
/// When set, the application layer trims conversation history to
/// fit within this budget before the token-optimization decorator
/// performs precise compaction. Computed automatically from context
/// window, system prompt, RAG, and response headroom when `None`; see
/// [`effective_max_history_tokens`](Self::effective_max_history_tokens).
#[serde(default)]
pub max_history_tokens: Option<u32>,
/// Maximum tokens for agent profile prompt content (default: 300).
///
/// Caps the token budget consumed by agent profile descriptions when
/// building the system prompt for agentic sub-tasks. Prevents verbose
/// profile definitions from crowding out other context.
#[serde(default = "default_max_profile_prompt_tokens")]
pub max_profile_prompt_tokens: u32,
/// Directory path for runtime prompt template overrides (default: None).
///
/// When set, the [`TemplateLoader`](crate::prompt::template_loader::TemplateLoader)
/// attempts to read `{prompt_template_dir}/{name}.prompt.txt` before falling
/// back to the compiled-in templates bundled via `build.rs`.
///
/// Set to a writable directory to customise individual prompt templates
/// without recompiling the binary.
#[serde(default)]
pub prompt_template_dir: Option<String>,
}
impl TokenOptimizationConfig {
    /// Refresh `context_window_tokens` from a model metadata source.
    ///
    /// Asks `port` for the metadata of `model` and stores the reported
    /// `context_length` as the new context window. When the lookup fails the
    /// error is propagated before anything is written, so the configured
    /// value stays untouched.
    ///
    /// Returns the fetched [`ModelInfo`](crate::profile::ModelInfo) so the
    /// caller can reuse it, e.g. as the argument to
    /// [`apply_hardware_profile`](Self::apply_hardware_profile).
    ///
    /// # Errors
    ///
    /// Returns [`TokenOptError`](crate::error::TokenOptError) if the
    /// [`ModelInfoPort`](crate::ports::ModelInfoPort) call fails.
    pub async fn auto_detect_context_window(
        &mut self,
        port: &dyn crate::ports::ModelInfoPort,
        model: &str,
    ) -> Result<crate::profile::ModelInfo, crate::error::TokenOptError> {
        let model_info = port.get_model_info(model).await?;
        let detected = model_info.context_length;

        // Log before overwriting so `previous` still shows the old setting.
        tracing::info!(
            model,
            detected,
            previous = self.context_window_tokens,
            "Auto-detected context window from model metadata"
        );

        self.context_window_tokens = detected;
        Ok(model_info)
    }

    /// Overwrite the hardware-sensitive settings with profile values.
    ///
    /// Starts from the profile returned by
    /// [`detect_profile`](crate::profile::detect_profile); when `model_info`
    /// is supplied the profile is first passed through
    /// [`adjust_profile`](crate::profile::adjust_profile).
    ///
    /// The resulting profile's configuration then unconditionally replaces
    /// `compaction_trigger_ratio`, `max_tools_per_request`, and
    /// `progressive_tool_compression`. No other field is touched.
    pub fn apply_hardware_profile(&mut self, model_info: Option<&crate::profile::ModelInfo>) {
        let detected = crate::profile::detect_profile();
        let profile = match model_info {
            Some(mi) => crate::profile::adjust_profile(detected, mi),
            None => detected,
        };
        let settings = profile.config();

        // The profile's threshold is narrowed to this struct's `f32` field;
        // losing precision on a ratio is acceptable here.
        #[allow(clippy::cast_possible_truncation)]
        let compaction_trigger = settings.compaction_threshold as f32;

        self.compaction_trigger_ratio = compaction_trigger;
        self.max_tools_per_request = settings.max_tools;
        self.progressive_tool_compression = settings.progressive_tools;

        tracing::info!(
            ?profile,
            compaction_trigger = self.compaction_trigger_ratio,
            max_tools = self.max_tools_per_request,
            progressive_tools = self.progressive_tool_compression,
            "Applied hardware profile defaults"
        );
    }

    /// Resolve the token budget available for conversation history.
    ///
    /// An explicit `max_history_tokens` is returned as-is. Otherwise the
    /// budget is derived in two truncating steps:
    ///
    /// 1. `available = context_window_tokens * (1 - response_headroom_ratio)`
    /// 2. `reserved  = available * (system_prompt_budget_ratio + rag_budget_ratio)`
    ///
    /// and the result is `available - reserved`, saturating at zero. With the
    /// default settings this yields `6144 - 1843 = 4301` tokens.
    #[must_use]
    pub fn effective_max_history_tokens(&self) -> u32 {
        self.max_history_tokens.unwrap_or_else(|| {
            // Share of the window usable for input once output headroom is
            // set aside.
            let input_share = 1.0 - self.response_headroom_ratio;
            // Float-to-int `as` truncates toward zero and saturates at the
            // `u32` bounds, so out-of-range ratios cannot wrap around.
            #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
            let available =
                (f64::from(self.context_window_tokens) * f64::from(input_share)) as u32;

            // Share of the input budget claimed by system prompt and RAG.
            let fixed_share = self.system_prompt_budget_ratio + self.rag_budget_ratio;
            #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
            let reserved = (f64::from(available) * f64::from(fixed_share)) as u32;

            available.saturating_sub(reserved)
        })
    }
}
impl Default for TokenOptimizationConfig {
fn default() -> Self {
Self {
enabled: true,
context_window_tokens: default_context_window(),
response_headroom_ratio: default_response_headroom(),
compaction_trigger_ratio: default_compaction_trigger(),
max_summary_tokens: default_max_summary_tokens(),
system_prompt_budget_ratio: default_system_prompt_budget(),
rag_budget_ratio: default_rag_budget(),
repetition_detection_enabled: true,
repetition_ngram_size: default_ngram_size(),
repetition_threshold: default_repetition_threshold(),
max_tools_per_request: default_max_tools(),
tokenizer_model: None,
output_max_tokens: None,
frequency_penalty: None,
presence_penalty: None,
progressive_tool_compression: true,
conciseness_pressure_threshold: default_conciseness_threshold(),
tool_result_max_tokens: default_tool_result_max_tokens(),
max_history_tokens: None,
max_profile_prompt_tokens: default_max_profile_prompt_tokens(),
prompt_template_dir: None,
}
}
}
/// Serde default for the boolean switches that are on unless disabled
/// (`enabled`, `repetition_detection_enabled`, `progressive_tool_compression`).
const fn default_true() -> bool {
    true
}
/// Serde default for `context_window_tokens`: 8192 tokens.
const fn default_context_window() -> u32 {
    const CONTEXT_WINDOW_TOKENS: u32 = 8192;
    CONTEXT_WINDOW_TOKENS
}
/// Serde default for `response_headroom_ratio`: a quarter of the window.
const fn default_response_headroom() -> f32 {
    const RESPONSE_HEADROOM_RATIO: f32 = 0.25;
    RESPONSE_HEADROOM_RATIO
}
/// Serde default for `compaction_trigger_ratio`: 0.70.
const fn default_compaction_trigger() -> f32 {
    const COMPACTION_TRIGGER_RATIO: f32 = 0.70;
    COMPACTION_TRIGGER_RATIO
}
/// Serde default for `max_summary_tokens`: 256 tokens.
const fn default_max_summary_tokens() -> u32 {
    const MAX_SUMMARY_TOKENS: u32 = 256;
    MAX_SUMMARY_TOKENS
}
/// Serde default for `system_prompt_budget_ratio`: 0.15.
const fn default_system_prompt_budget() -> f32 {
    const SYSTEM_PROMPT_BUDGET_RATIO: f32 = 0.15;
    SYSTEM_PROMPT_BUDGET_RATIO
}
/// Serde default for `rag_budget_ratio`: 0.15.
const fn default_rag_budget() -> f32 {
    const RAG_BUDGET_RATIO: f32 = 0.15;
    RAG_BUDGET_RATIO
}
/// Serde default for `repetition_ngram_size`: 3.
const fn default_ngram_size() -> usize {
    const REPETITION_NGRAM_SIZE: usize = 3;
    REPETITION_NGRAM_SIZE
}
/// Serde default for `repetition_threshold`: 0.3.
const fn default_repetition_threshold() -> f32 {
    const REPETITION_THRESHOLD: f32 = 0.3;
    REPETITION_THRESHOLD
}
/// Serde default for `max_tools_per_request`: 8.
const fn default_max_tools() -> usize {
    const MAX_TOOLS_PER_REQUEST: usize = 8;
    MAX_TOOLS_PER_REQUEST
}
/// Serde default for `conciseness_pressure_threshold`: 0.7.
const fn default_conciseness_threshold() -> f32 {
    const CONCISENESS_PRESSURE_THRESHOLD: f32 = 0.7;
    CONCISENESS_PRESSURE_THRESHOLD
}
/// Serde default for `tool_result_max_tokens`: 100 tokens.
const fn default_tool_result_max_tokens() -> u32 {
    const TOOL_RESULT_MAX_TOKENS: u32 = 100;
    TOOL_RESULT_MAX_TOKENS
}
/// Serde default for `max_profile_prompt_tokens`: 300 tokens.
const fn default_max_profile_prompt_tokens() -> u32 {
    const MAX_PROFILE_PROMPT_TOKENS: u32 = 300;
    MAX_PROFILE_PROMPT_TOKENS
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Deserialize a TOML snippet into a config, panicking on parse errors.
    fn parse(src: &str) -> TokenOptimizationConfig {
        toml::from_str(src).unwrap()
    }

    /// `true` when `actual` is within `f32::EPSILON` of `expected`.
    fn approx_eq(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < f32::EPSILON
    }

    #[test]
    fn default_config_has_sane_values() {
        let cfg = TokenOptimizationConfig::default();

        // Switches.
        assert!(cfg.enabled);
        assert!(cfg.repetition_detection_enabled);

        // Integer settings.
        assert_eq!(cfg.context_window_tokens, 8192);
        assert_eq!(cfg.max_summary_tokens, 256);
        assert_eq!(cfg.repetition_ngram_size, 3);

        // Ratios.
        assert!(approx_eq(cfg.response_headroom_ratio, 0.25));
        assert!(approx_eq(cfg.compaction_trigger_ratio, 0.70));
    }

    #[test]
    fn budget_ratios_leave_room_for_history() {
        let TokenOptimizationConfig {
            response_headroom_ratio,
            system_prompt_budget_ratio,
            rag_budget_ratio,
            ..
        } = TokenOptimizationConfig::default();

        let fixed_ratio = response_headroom_ratio + system_prompt_budget_ratio + rag_budget_ratio;

        // Fixed allocations should leave at least 40% for history
        assert!(fixed_ratio < 0.60, "fixed ratio {fixed_ratio} too high");
    }

    #[test]
    fn deserialization_with_defaults() {
        let src = r"
enabled = true
context_window_tokens = 4096
";
        let cfg = parse(src);

        // The explicitly configured key wins.
        assert_eq!(cfg.context_window_tokens, 4096);

        // Non-specified fields should use defaults
        assert!(approx_eq(cfg.response_headroom_ratio, 0.25));
        assert_eq!(cfg.max_summary_tokens, 256);
        assert!(cfg.output_max_tokens.is_none());
    }

    #[test]
    fn default_output_max_tokens_is_none() {
        assert!(TokenOptimizationConfig::default()
            .output_max_tokens
            .is_none());
    }

    #[test]
    fn output_max_tokens_deserialization() {
        let src = r"
enabled = true
context_window_tokens = 8192
output_max_tokens = 1024
";
        let cfg = parse(src);

        assert_eq!(cfg.output_max_tokens, Some(1024));
    }

    #[test]
    fn sampling_params_deserialization() {
        let src = r"
enabled = true
frequency_penalty = 1.2
presence_penalty = 0.6
";
        let cfg = parse(src);

        let frequency = cfg.frequency_penalty.unwrap();
        assert!(approx_eq(frequency, 1.2));

        let presence = cfg.presence_penalty.unwrap();
        assert!(approx_eq(presence, 0.6));
    }

    #[test]
    fn default_sampling_params_are_none() {
        let cfg = TokenOptimizationConfig::default();

        assert!(cfg.frequency_penalty.is_none());
        assert!(cfg.presence_penalty.is_none());
    }

    #[test]
    fn default_progressive_tool_compression_is_true() {
        assert!(TokenOptimizationConfig::default().progressive_tool_compression);
    }

    #[test]
    fn progressive_tool_compression_deserialization() {
        let src = r"
enabled = true
progressive_tool_compression = false
";
        let cfg = parse(src);

        assert!(!cfg.progressive_tool_compression);
    }
}