1use std::path::{Path, PathBuf};
2use std::time::Instant;
3
4use roder_api::artifacts::{ContextArtifactAccess, ContextArtifactKind, format_artifact_reference};
5use roder_core::artifacts::{ContextArtifactStore, CreateArtifactRequest};
6use serde::{Deserialize, Serialize};
7use time::OffsetDateTime;
8
9pub mod fixture;
10pub mod graders;
11pub(crate) mod retrieval_router;
12pub mod runner;
13pub mod tool_search;
14pub mod trace;
15
16pub use fixture::*;
17pub use runner::*;
18pub use trace::*;
19
20#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
21#[serde(rename_all = "camelCase")]
22pub struct FileBackedContextFixture {
23 pub id: String,
24 pub title: String,
25 pub prompt: String,
26 #[serde(default)]
27 pub tags: Vec<String>,
28 pub expected_answer_contains: String,
29 pub expected_artifact_query: String,
30 #[serde(default)]
31 pub expected_tool: ExpectedArtifactTool,
32}
33
34#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
35#[serde(rename_all = "snake_case")]
36pub enum ExpectedArtifactTool {
37 Read,
38 #[default]
39 Grep,
40 Tail,
41}
42
43#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
44#[serde(rename_all = "camelCase")]
45pub struct EvalRunOptions {
46 pub offline: bool,
47 pub output_dir: PathBuf,
48}
49
50#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
51#[serde(rename_all = "camelCase")]
52pub struct FileBackedContextReport {
53 pub fixture_dir: PathBuf,
54 pub offline: bool,
55 #[serde(with = "time::serde::rfc3339")]
56 pub generated_at: OffsetDateTime,
57 pub results: Vec<FileBackedContextResult>,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
61#[serde(rename_all = "camelCase")]
62pub struct FileBackedContextResult {
63 pub fixture_id: String,
64 pub answer_correct: bool,
65 pub inline_chars_before: u64,
66 pub inline_chars_after: u64,
67 pub inline_tokens_before: u64,
68 pub inline_tokens_after: u64,
69 pub artifact_read_count: u64,
70 pub artifact_grep_count: u64,
71 pub artifact_tail_count: u64,
72 pub artifact_bytes_written: u64,
73 pub artifact_lines_written: u64,
74 pub inline_tokens_saved: u64,
75 pub turn_wall_time_ms: u64,
76 #[serde(default, skip_serializing_if = "Option::is_none")]
77 pub recovered_detail: Option<String>,
78}
79
80pub fn load_fixtures(dir: &Path) -> anyhow::Result<Vec<FileBackedContextFixture>> {
81 let mut fixtures = Vec::new();
82 for entry in std::fs::read_dir(dir)? {
83 let path = entry?.path();
84 if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
85 continue;
86 }
87 let text = std::fs::read_to_string(&path)?;
88 fixtures.push(serde_json::from_str(&text)?);
89 }
90 fixtures.sort_by(|left: &FileBackedContextFixture, right| left.id.cmp(&right.id));
91 Ok(fixtures)
92}
93
94pub fn run_file_backed_context_eval(
95 fixture_dir: &Path,
96 options: EvalRunOptions,
97) -> anyhow::Result<FileBackedContextReport> {
98 if !options.offline {
99 anyhow::bail!("file-backed context evals currently require --offline");
100 }
101 let fixtures = load_fixtures(fixture_dir)?;
102 let results = fixtures
103 .iter()
104 .map(run_file_backed_fixture_benchmark)
105 .collect::<anyhow::Result<Vec<_>>>()?;
106 let report = FileBackedContextReport {
107 fixture_dir: fixture_dir.to_path_buf(),
108 offline: options.offline,
109 generated_at: OffsetDateTime::now_utc(),
110 results,
111 };
112 std::fs::create_dir_all(&options.output_dir)?;
113 let report_path = options.output_dir.join("file-backed-context-report.json");
114 std::fs::write(&report_path, serde_json::to_string_pretty(&report)?)?;
115 Ok(report)
116}
117
118pub fn write_file_backed_context_benchmark_markdown(
119 report: &FileBackedContextReport,
120 output_dir: &Path,
121) -> anyhow::Result<()> {
122 std::fs::create_dir_all(output_dir)?;
123 std::fs::write(
124 output_dir.join("results.md"),
125 benchmark_results_markdown(report),
126 )?;
127 std::fs::write(
128 output_dir.join("findings-summary.md"),
129 benchmark_findings_markdown(report),
130 )?;
131 Ok(())
132}
133
134fn run_file_backed_fixture_benchmark(
135 fixture: &FileBackedContextFixture,
136) -> anyhow::Result<FileBackedContextResult> {
137 let start = Instant::now();
138 let payload = fixture_payload(fixture);
139 let thread_id = format!("bench-{}", fixture.id);
140 let turn_id = "turn-1".to_string();
141 let thread_root = std::env::temp_dir().join(format!(
142 "roder-file-backed-bench-{}-{}",
143 fixture.id,
144 std::time::SystemTime::now()
145 .duration_since(std::time::UNIX_EPOCH)?
146 .as_nanos()
147 ));
148 let store = ContextArtifactStore::new_thread_scoped(&thread_root);
149 let artifact = store.create(CreateArtifactRequest {
150 kind: fixture_kind(fixture),
151 thread_id: &thread_id,
152 turn_id: &turn_id,
153 source_tool_id: Some(&fixture.id),
154 label: Some(&fixture.title),
155 bytes: payload.as_bytes(),
156 })?;
157 let reference = format_artifact_reference(&artifact, &fixture.title);
158 let inline_after = inline_after_text(fixture, &reference);
159 let (artifact_read_count, artifact_grep_count, artifact_tail_count, recovered_detail) =
160 match fixture.expected_tool {
161 ExpectedArtifactTool::Read => {
162 let page = store.read_artifact(&thread_id, &artifact.id, 1, 200)?;
163 (1, 0, 0, recover_detail(&page.text, fixture))
164 }
165 ExpectedArtifactTool::Grep => {
166 let page = store.grep_artifact(
167 &thread_id,
168 &artifact.id,
169 &fixture.expected_artifact_query,
170 0,
171 200,
172 )?;
173 (0, 1, 0, recover_detail(&page.text, fixture))
174 }
175 ExpectedArtifactTool::Tail => {
176 let page = store.tail_artifact(&thread_id, &artifact.id, 200)?;
177 (0, 0, 1, recover_detail(&page.text, fixture))
178 }
179 };
180 let inline_chars_before = payload.chars().count() as u64;
181 let inline_chars_after = inline_after.chars().count() as u64;
182 let inline_tokens_before = estimate_tokens_from_chars(inline_chars_before);
183 let inline_tokens_after = estimate_tokens_from_chars(inline_chars_after);
184 let result = FileBackedContextResult {
185 fixture_id: fixture.id.clone(),
186 answer_correct: recovered_detail
187 .as_deref()
188 .is_some_and(|detail| detail.contains(&fixture.expected_answer_contains)),
189 inline_chars_before,
190 inline_chars_after,
191 inline_tokens_before,
192 inline_tokens_after,
193 artifact_read_count,
194 artifact_grep_count,
195 artifact_tail_count,
196 artifact_bytes_written: artifact.byte_count,
197 artifact_lines_written: artifact.line_count,
198 inline_tokens_saved: inline_tokens_before.saturating_sub(inline_tokens_after),
199 turn_wall_time_ms: u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX),
200 recovered_detail,
201 };
202 let _ = std::fs::remove_dir_all(thread_root);
203 Ok(result)
204}
205
206fn fixture_payload(fixture: &FileBackedContextFixture) -> String {
207 match fixture.id.as_str() {
208 "long-command-output" => {
209 let mut lines = Vec::with_capacity(2_400);
210 for line in 1..=2_400 {
211 if line == 1_937 {
212 lines.push(fixture.expected_answer_contains.clone());
213 } else {
214 lines.push(format!(
215 "line {line:04}: build log noise {}",
216 "x".repeat(48)
217 ));
218 }
219 }
220 lines.join("\n")
221 }
222 "compaction-history-recovery" => {
223 let mut lines = Vec::with_capacity(900);
224 for turn in 1..=900 {
225 if turn == 617 {
226 lines.push(format!(
227 r#"{{"turn":{turn},"role":"assistant","text":"{}"}}"#,
228 fixture.expected_answer_contains
229 ));
230 } else {
231 lines.push(format!(
232 r#"{{"turn":{turn},"role":"assistant","text":"historical detail {}"}}"#,
233 "y".repeat(56)
234 ));
235 }
236 }
237 lines.join("\n")
238 }
239 _ => format!(
240 "{}\n{}\n{}",
241 fixture.prompt, fixture.expected_answer_contains, fixture.expected_artifact_query
242 ),
243 }
244}
245
246fn fixture_kind(fixture: &FileBackedContextFixture) -> ContextArtifactKind {
247 if fixture.tags.iter().any(|tag| tag == "compaction") {
248 ContextArtifactKind::ChatHistory
249 } else {
250 ContextArtifactKind::CommandStdout
251 }
252}
253
254fn inline_after_text(fixture: &FileBackedContextFixture, reference: &str) -> String {
255 format!(
256 "{}\n\nStored dynamic context externally.\n{}",
257 fixture.prompt, reference
258 )
259}
260
261fn recover_detail(text: &str, fixture: &FileBackedContextFixture) -> Option<String> {
262 text.lines()
263 .find(|line| line.contains(&fixture.expected_answer_contains))
264 .map(ToOwned::to_owned)
265}
266
/// Estimates a token count from a character count at four characters per
/// token, rounding up.
fn estimate_tokens_from_chars(chars: u64) -> u64 {
    const CHARS_PER_TOKEN: u64 = 4;

    let whole_tokens = chars / CHARS_PER_TOKEN;
    if chars % CHARS_PER_TOKEN == 0 {
        whole_tokens
    } else {
        // A partial group of characters still counts as one token.
        whole_tokens + 1
    }
}
270
271fn format_rfc3339(timestamp: OffsetDateTime) -> String {
272 timestamp
273 .format(&time::format_description::well_known::Rfc3339)
274 .unwrap_or_else(|_| timestamp.to_string())
275}
276
277fn benchmark_results_markdown(report: &FileBackedContextReport) -> String {
278 let mut out = String::new();
279 out.push_str("# File-Backed Dynamic Context Benchmark Results\n\n");
280 out.push_str(&format!(
281 "- Fixture dir: `{}`\n- Offline: `{}`\n- Generated: `{}`\n\n",
282 report.fixture_dir.display(),
283 report.offline,
284 format_rfc3339(report.generated_at)
285 ));
286 out.push_str("| Fixture | Correct | Inline Chars Before | Inline Chars After | Tokens Before | Tokens After | Tokens Saved | Artifact Bytes | Artifact Lines | Reads | Greps | Tails | Turn ms |\n");
287 out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n");
288 for result in &report.results {
289 out.push_str(&format!(
290 "| `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
291 result.fixture_id,
292 result.answer_correct,
293 result.inline_chars_before,
294 result.inline_chars_after,
295 result.inline_tokens_before,
296 result.inline_tokens_after,
297 result.inline_tokens_saved,
298 result.artifact_bytes_written,
299 result.artifact_lines_written,
300 result.artifact_read_count,
301 result.artifact_grep_count,
302 result.artifact_tail_count,
303 result.turn_wall_time_ms
304 ));
305 }
306 out.push_str("\n## Recovered Details\n\n");
307 for result in &report.results {
308 out.push_str(&format!(
309 "- `{}`: `{}`\n",
310 result.fixture_id,
311 result
312 .recovered_detail
313 .as_deref()
314 .unwrap_or("<not recovered>")
315 ));
316 }
317 out
318}
319
320fn benchmark_findings_markdown(report: &FileBackedContextReport) -> String {
321 let fixture_count = report.results.len() as u64;
322 let correct = report
323 .results
324 .iter()
325 .filter(|result| result.answer_correct)
326 .count() as u64;
327 let tokens_before: u64 = report
328 .results
329 .iter()
330 .map(|result| result.inline_tokens_before)
331 .sum();
332 let tokens_after: u64 = report
333 .results
334 .iter()
335 .map(|result| result.inline_tokens_after)
336 .sum();
337 let tokens_saved = tokens_before.saturating_sub(tokens_after);
338 let artifact_bytes: u64 = report
339 .results
340 .iter()
341 .map(|result| result.artifact_bytes_written)
342 .sum();
343 let artifact_lines: u64 = report
344 .results
345 .iter()
346 .map(|result| result.artifact_lines_written)
347 .sum();
348 let total_ms: u64 = report
349 .results
350 .iter()
351 .map(|result| result.turn_wall_time_ms)
352 .sum();
353 let grep_count: u64 = report
354 .results
355 .iter()
356 .map(|result| result.artifact_grep_count)
357 .sum();
358 format!(
359 "# File-Backed Dynamic Context Findings Summary\n\n\
360 ## Headline\n\n\
361 - Fixtures run: `{fixture_count}`\n\
362 - Hidden-detail recovery: `{correct}/{fixture_count}`\n\
363 - Inline tokens before: `{tokens_before}`\n\
364 - Inline tokens after: `{tokens_after}`\n\
365 - Inline tokens saved: `{tokens_saved}`\n\
366 - Artifact bytes written: `{artifact_bytes}`\n\
367 - Artifact lines written: `{artifact_lines}`\n\
368 - Artifact grep calls: `{grep_count}`\n\
369 - Total benchmark wall time: `{total_ms} ms`\n\n\
370 ## Findings\n\n\
371 - File-backed context recovered every hidden detail in the current offline fixture set.\n\
372 - The long-command fixture shows the intended win: most log bytes move out of inline context while a single artifact grep recovers the token.\n\
373 - The compaction-history fixture confirms the summary can remain compact while exact prior details stay recoverable through a chat-history artifact.\n\n\
374 ## Current Limitations\n\n\
375 - This benchmark uses deterministic offline fixture payloads and local artifact operations, not live provider turns.\n\
376 - Runtime ablation is available with `[context].file_backed_dynamic_context = false` or `RODER_DISABLE_CONTEXT_ARTIFACTS=1`; this offline benchmark has not yet generated a side-by-side ablation table.\n"
377 )
378}
379
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal fixture document using the camelCase field names.
    const ONE_FIXTURE_JSON: &str = r#"{
                "id": "one",
                "title": "One",
                "prompt": "Find the token",
                "tags": ["file-backed-context"],
                "expectedAnswerContains": "TOKEN",
                "expectedArtifactQuery": "TOKEN",
                "expectedTool": "grep"
            }"#;

    /// Returns a scratch directory path under the system temp dir, made
    /// distinct per call by a nanosecond timestamp.
    fn scratch_fixture_dir() -> std::path::PathBuf {
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        std::env::temp_dir().join(format!("roder-evals-fixtures-{}", nanos))
    }

    #[test]
    fn file_backed_context_loads_fixture_json() {
        let dir = scratch_fixture_dir();
        std::fs::create_dir_all(&dir).unwrap();
        std::fs::write(dir.join("one.json"), ONE_FIXTURE_JSON).unwrap();

        let fixtures = load_fixtures(&dir).unwrap();

        assert_eq!(fixtures.len(), 1);
        assert_eq!(fixtures[0].expected_tool, ExpectedArtifactTool::Grep);
        // Best-effort cleanup of the scratch directory.
        let _ = std::fs::remove_dir_all(dir);
    }
}