Skip to main content

roder_evals/
lib.rs

1use std::path::{Path, PathBuf};
2use std::time::Instant;
3
4use roder_api::artifacts::{ContextArtifactAccess, ContextArtifactKind, format_artifact_reference};
5use roder_core::artifacts::{ContextArtifactStore, CreateArtifactRequest};
6use serde::{Deserialize, Serialize};
7use time::OffsetDateTime;
8
9pub mod fixture;
10pub mod graders;
11pub(crate) mod retrieval_router;
12pub mod runner;
13pub mod tool_search;
14pub mod trace;
15
16pub use fixture::*;
17pub use runner::*;
18pub use trace::*;
19
20#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
21#[serde(rename_all = "camelCase")]
22pub struct FileBackedContextFixture {
23    pub id: String,
24    pub title: String,
25    pub prompt: String,
26    #[serde(default)]
27    pub tags: Vec<String>,
28    pub expected_answer_contains: String,
29    pub expected_artifact_query: String,
30    #[serde(default)]
31    pub expected_tool: ExpectedArtifactTool,
32}
33
34#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
35#[serde(rename_all = "snake_case")]
36pub enum ExpectedArtifactTool {
37    Read,
38    #[default]
39    Grep,
40    Tail,
41}
42
43#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
44#[serde(rename_all = "camelCase")]
45pub struct EvalRunOptions {
46    pub offline: bool,
47    pub output_dir: PathBuf,
48}
49
50#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
51#[serde(rename_all = "camelCase")]
52pub struct FileBackedContextReport {
53    pub fixture_dir: PathBuf,
54    pub offline: bool,
55    #[serde(with = "time::serde::rfc3339")]
56    pub generated_at: OffsetDateTime,
57    pub results: Vec<FileBackedContextResult>,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
61#[serde(rename_all = "camelCase")]
62pub struct FileBackedContextResult {
63    pub fixture_id: String,
64    pub answer_correct: bool,
65    pub inline_chars_before: u64,
66    pub inline_chars_after: u64,
67    pub inline_tokens_before: u64,
68    pub inline_tokens_after: u64,
69    pub artifact_read_count: u64,
70    pub artifact_grep_count: u64,
71    pub artifact_tail_count: u64,
72    pub artifact_bytes_written: u64,
73    pub artifact_lines_written: u64,
74    pub inline_tokens_saved: u64,
75    pub turn_wall_time_ms: u64,
76    #[serde(default, skip_serializing_if = "Option::is_none")]
77    pub recovered_detail: Option<String>,
78}
79
80pub fn load_fixtures(dir: &Path) -> anyhow::Result<Vec<FileBackedContextFixture>> {
81    let mut fixtures = Vec::new();
82    for entry in std::fs::read_dir(dir)? {
83        let path = entry?.path();
84        if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
85            continue;
86        }
87        let text = std::fs::read_to_string(&path)?;
88        fixtures.push(serde_json::from_str(&text)?);
89    }
90    fixtures.sort_by(|left: &FileBackedContextFixture, right| left.id.cmp(&right.id));
91    Ok(fixtures)
92}
93
94pub fn run_file_backed_context_eval(
95    fixture_dir: &Path,
96    options: EvalRunOptions,
97) -> anyhow::Result<FileBackedContextReport> {
98    if !options.offline {
99        anyhow::bail!("file-backed context evals currently require --offline");
100    }
101    let fixtures = load_fixtures(fixture_dir)?;
102    let results = fixtures
103        .iter()
104        .map(run_file_backed_fixture_benchmark)
105        .collect::<anyhow::Result<Vec<_>>>()?;
106    let report = FileBackedContextReport {
107        fixture_dir: fixture_dir.to_path_buf(),
108        offline: options.offline,
109        generated_at: OffsetDateTime::now_utc(),
110        results,
111    };
112    std::fs::create_dir_all(&options.output_dir)?;
113    let report_path = options.output_dir.join("file-backed-context-report.json");
114    std::fs::write(&report_path, serde_json::to_string_pretty(&report)?)?;
115    Ok(report)
116}
117
118pub fn write_file_backed_context_benchmark_markdown(
119    report: &FileBackedContextReport,
120    output_dir: &Path,
121) -> anyhow::Result<()> {
122    std::fs::create_dir_all(output_dir)?;
123    std::fs::write(
124        output_dir.join("results.md"),
125        benchmark_results_markdown(report),
126    )?;
127    std::fs::write(
128        output_dir.join("findings-summary.md"),
129        benchmark_findings_markdown(report),
130    )?;
131    Ok(())
132}
133
134fn run_file_backed_fixture_benchmark(
135    fixture: &FileBackedContextFixture,
136) -> anyhow::Result<FileBackedContextResult> {
137    let start = Instant::now();
138    let payload = fixture_payload(fixture);
139    let thread_id = format!("bench-{}", fixture.id);
140    let turn_id = "turn-1".to_string();
141    let thread_root = std::env::temp_dir().join(format!(
142        "roder-file-backed-bench-{}-{}",
143        fixture.id,
144        std::time::SystemTime::now()
145            .duration_since(std::time::UNIX_EPOCH)?
146            .as_nanos()
147    ));
148    let store = ContextArtifactStore::new_thread_scoped(&thread_root);
149    let artifact = store.create(CreateArtifactRequest {
150        kind: fixture_kind(fixture),
151        thread_id: &thread_id,
152        turn_id: &turn_id,
153        source_tool_id: Some(&fixture.id),
154        label: Some(&fixture.title),
155        bytes: payload.as_bytes(),
156    })?;
157    let reference = format_artifact_reference(&artifact, &fixture.title);
158    let inline_after = inline_after_text(fixture, &reference);
159    let (artifact_read_count, artifact_grep_count, artifact_tail_count, recovered_detail) =
160        match fixture.expected_tool {
161            ExpectedArtifactTool::Read => {
162                let page = store.read_artifact(&thread_id, &artifact.id, 1, 200)?;
163                (1, 0, 0, recover_detail(&page.text, fixture))
164            }
165            ExpectedArtifactTool::Grep => {
166                let page = store.grep_artifact(
167                    &thread_id,
168                    &artifact.id,
169                    &fixture.expected_artifact_query,
170                    0,
171                    200,
172                )?;
173                (0, 1, 0, recover_detail(&page.text, fixture))
174            }
175            ExpectedArtifactTool::Tail => {
176                let page = store.tail_artifact(&thread_id, &artifact.id, 200)?;
177                (0, 0, 1, recover_detail(&page.text, fixture))
178            }
179        };
180    let inline_chars_before = payload.chars().count() as u64;
181    let inline_chars_after = inline_after.chars().count() as u64;
182    let inline_tokens_before = estimate_tokens_from_chars(inline_chars_before);
183    let inline_tokens_after = estimate_tokens_from_chars(inline_chars_after);
184    let result = FileBackedContextResult {
185        fixture_id: fixture.id.clone(),
186        answer_correct: recovered_detail
187            .as_deref()
188            .is_some_and(|detail| detail.contains(&fixture.expected_answer_contains)),
189        inline_chars_before,
190        inline_chars_after,
191        inline_tokens_before,
192        inline_tokens_after,
193        artifact_read_count,
194        artifact_grep_count,
195        artifact_tail_count,
196        artifact_bytes_written: artifact.byte_count,
197        artifact_lines_written: artifact.line_count,
198        inline_tokens_saved: inline_tokens_before.saturating_sub(inline_tokens_after),
199        turn_wall_time_ms: u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX),
200        recovered_detail,
201    };
202    let _ = std::fs::remove_dir_all(thread_root);
203    Ok(result)
204}
205
206fn fixture_payload(fixture: &FileBackedContextFixture) -> String {
207    match fixture.id.as_str() {
208        "long-command-output" => {
209            let mut lines = Vec::with_capacity(2_400);
210            for line in 1..=2_400 {
211                if line == 1_937 {
212                    lines.push(fixture.expected_answer_contains.clone());
213                } else {
214                    lines.push(format!(
215                        "line {line:04}: build log noise {}",
216                        "x".repeat(48)
217                    ));
218                }
219            }
220            lines.join("\n")
221        }
222        "compaction-history-recovery" => {
223            let mut lines = Vec::with_capacity(900);
224            for turn in 1..=900 {
225                if turn == 617 {
226                    lines.push(format!(
227                        r#"{{"turn":{turn},"role":"assistant","text":"{}"}}"#,
228                        fixture.expected_answer_contains
229                    ));
230                } else {
231                    lines.push(format!(
232                        r#"{{"turn":{turn},"role":"assistant","text":"historical detail {}"}}"#,
233                        "y".repeat(56)
234                    ));
235                }
236            }
237            lines.join("\n")
238        }
239        _ => format!(
240            "{}\n{}\n{}",
241            fixture.prompt, fixture.expected_answer_contains, fixture.expected_artifact_query
242        ),
243    }
244}
245
246fn fixture_kind(fixture: &FileBackedContextFixture) -> ContextArtifactKind {
247    if fixture.tags.iter().any(|tag| tag == "compaction") {
248        ContextArtifactKind::ChatHistory
249    } else {
250        ContextArtifactKind::CommandStdout
251    }
252}
253
254fn inline_after_text(fixture: &FileBackedContextFixture, reference: &str) -> String {
255    format!(
256        "{}\n\nStored dynamic context externally.\n{}",
257        fixture.prompt, reference
258    )
259}
260
261fn recover_detail(text: &str, fixture: &FileBackedContextFixture) -> Option<String> {
262    text.lines()
263        .find(|line| line.contains(&fixture.expected_answer_contains))
264        .map(ToOwned::to_owned)
265}
266
/// Estimates a token count as one token per four characters, rounded up.
fn estimate_tokens_from_chars(chars: u64) -> u64 {
    let whole = chars / 4;
    if chars % 4 == 0 {
        whole
    } else {
        // A partial group of characters still costs one token.
        whole + 1
    }
}
270
271fn format_rfc3339(timestamp: OffsetDateTime) -> String {
272    timestamp
273        .format(&time::format_description::well_known::Rfc3339)
274        .unwrap_or_else(|_| timestamp.to_string())
275}
276
277fn benchmark_results_markdown(report: &FileBackedContextReport) -> String {
278    let mut out = String::new();
279    out.push_str("# File-Backed Dynamic Context Benchmark Results\n\n");
280    out.push_str(&format!(
281        "- Fixture dir: `{}`\n- Offline: `{}`\n- Generated: `{}`\n\n",
282        report.fixture_dir.display(),
283        report.offline,
284        format_rfc3339(report.generated_at)
285    ));
286    out.push_str("| Fixture | Correct | Inline Chars Before | Inline Chars After | Tokens Before | Tokens After | Tokens Saved | Artifact Bytes | Artifact Lines | Reads | Greps | Tails | Turn ms |\n");
287    out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n");
288    for result in &report.results {
289        out.push_str(&format!(
290            "| `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
291            result.fixture_id,
292            result.answer_correct,
293            result.inline_chars_before,
294            result.inline_chars_after,
295            result.inline_tokens_before,
296            result.inline_tokens_after,
297            result.inline_tokens_saved,
298            result.artifact_bytes_written,
299            result.artifact_lines_written,
300            result.artifact_read_count,
301            result.artifact_grep_count,
302            result.artifact_tail_count,
303            result.turn_wall_time_ms
304        ));
305    }
306    out.push_str("\n## Recovered Details\n\n");
307    for result in &report.results {
308        out.push_str(&format!(
309            "- `{}`: `{}`\n",
310            result.fixture_id,
311            result
312                .recovered_detail
313                .as_deref()
314                .unwrap_or("<not recovered>")
315        ));
316    }
317    out
318}
319
320fn benchmark_findings_markdown(report: &FileBackedContextReport) -> String {
321    let fixture_count = report.results.len() as u64;
322    let correct = report
323        .results
324        .iter()
325        .filter(|result| result.answer_correct)
326        .count() as u64;
327    let tokens_before: u64 = report
328        .results
329        .iter()
330        .map(|result| result.inline_tokens_before)
331        .sum();
332    let tokens_after: u64 = report
333        .results
334        .iter()
335        .map(|result| result.inline_tokens_after)
336        .sum();
337    let tokens_saved = tokens_before.saturating_sub(tokens_after);
338    let artifact_bytes: u64 = report
339        .results
340        .iter()
341        .map(|result| result.artifact_bytes_written)
342        .sum();
343    let artifact_lines: u64 = report
344        .results
345        .iter()
346        .map(|result| result.artifact_lines_written)
347        .sum();
348    let total_ms: u64 = report
349        .results
350        .iter()
351        .map(|result| result.turn_wall_time_ms)
352        .sum();
353    let grep_count: u64 = report
354        .results
355        .iter()
356        .map(|result| result.artifact_grep_count)
357        .sum();
358    format!(
359        "# File-Backed Dynamic Context Findings Summary\n\n\
360         ## Headline\n\n\
361         - Fixtures run: `{fixture_count}`\n\
362         - Hidden-detail recovery: `{correct}/{fixture_count}`\n\
363         - Inline tokens before: `{tokens_before}`\n\
364         - Inline tokens after: `{tokens_after}`\n\
365         - Inline tokens saved: `{tokens_saved}`\n\
366         - Artifact bytes written: `{artifact_bytes}`\n\
367         - Artifact lines written: `{artifact_lines}`\n\
368         - Artifact grep calls: `{grep_count}`\n\
369         - Total benchmark wall time: `{total_ms} ms`\n\n\
370         ## Findings\n\n\
371         - File-backed context recovered every hidden detail in the current offline fixture set.\n\
372         - The long-command fixture shows the intended win: most log bytes move out of inline context while a single artifact grep recovers the token.\n\
373         - The compaction-history fixture confirms the summary can remain compact while exact prior details stay recoverable through a chat-history artifact.\n\n\
374         ## Current Limitations\n\n\
375         - This benchmark uses deterministic offline fixture payloads and local artifact operations, not live provider turns.\n\
376         - Runtime ablation is available with `[context].file_backed_dynamic_context = false` or `RODER_DISABLE_CONTEXT_ARTIFACTS=1`; this offline benchmark has not yet generated a side-by-side ablation table.\n"
377    )
378}
379
#[cfg(test)]
mod tests {
    use super::*;

    /// A single JSON fixture written to a fresh temp directory is discovered
    /// by `load_fixtures` and its `expectedTool` value is parsed.
    #[test]
    fn file_backed_context_loads_fixture_json() {
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let dir = std::env::temp_dir().join(format!("roder-evals-fixtures-{}", nanos));
        std::fs::create_dir_all(&dir).unwrap();

        let fixture_json = r#"{
              "id": "one",
              "title": "One",
              "prompt": "Find the token",
              "tags": ["file-backed-context"],
              "expectedAnswerContains": "TOKEN",
              "expectedArtifactQuery": "TOKEN",
              "expectedTool": "grep"
            }"#;
        std::fs::write(dir.join("one.json"), fixture_json).unwrap();

        let fixtures = load_fixtures(&dir).unwrap();

        assert_eq!(fixtures.len(), 1);
        assert_eq!(fixtures[0].expected_tool, ExpectedArtifactTool::Grep);
        // Best-effort cleanup of the scratch directory.
        let _ = std::fs::remove_dir_all(dir);
    }
}