Skip to main content

visual_rubric/cli/
audit.rs

1use std::fs;
2use std::io::{BufRead as _, BufReader, Write as _};
3use std::net::TcpStream;
4use std::path::{Path, PathBuf};
5use std::process::Command as ProcessCommand;
6use std::thread;
7use std::time::{Duration, Instant};
8
9use anyhow::{Context as _, Result, anyhow, bail};
10use serde::{Deserialize, Serialize};
11
12use super::static_server::StaticServer;
13use super::{AuditArgs, ImageArgs, ViewportArg, evaluate_image};
14
15/// Aggregate status for an audit run.
16#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
17#[serde(rename_all = "snake_case")]
18pub enum AuditStatus {
19    /// Every evaluated rubric passed.
20    Pass,
21    /// At least one evaluated rubric failed.
22    Fail,
23    /// At least one screenshot produced an execution or model error.
24    Error,
25    /// All screenshots skipped AI evaluation.
26    Skipped,
27}
28
29/// JSON report produced by the `audit` command.
30#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
31pub struct AuditReport {
32    /// Report schema version.
33    pub schema_version: u32,
34    /// Aggregate status across all screenshots.
35    pub aggregate_status: AuditStatus,
36    /// Hosted URL captured during the audit.
37    pub url: String,
38    /// Total audit elapsed time in milliseconds.
39    pub elapsed_ms: u128,
40    /// Options recorded for reproducibility.
41    pub options: AuditOptionsReport,
42    /// Per-screenshot audit results.
43    pub screenshots: Vec<ScreenshotReport>,
44}
45
46/// Options recorded in an [`AuditReport`].
47#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
48pub struct AuditOptionsReport {
49    /// Rubric question used for each screenshot.
50    pub question: String,
51    /// Model override used for the run.
52    pub model: Option<String>,
53    /// Reasoning effort override used for the run.
54    pub effort: Option<String>,
55    /// Whether a system prompt override was provided.
56    pub system_prompt_provided: bool,
57    /// Whether AI evaluation was skipped.
58    pub skip_ai: bool,
59    /// Whether deterministic pass verdicts were generated.
60    pub fake_pass: bool,
61}
62
63/// One screenshot entry in an [`AuditReport`].
64#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
65pub struct ScreenshotReport {
66    /// Viewport name.
67    pub name: String,
68    /// Viewport width in CSS pixels.
69    pub width: u32,
70    /// Viewport height in CSS pixels.
71    pub height: u32,
72    /// Screenshot path.
73    pub path: PathBuf,
74    /// Rubric result for this screenshot.
75    pub rubric: RubricReport,
76}
77
78/// Per-screenshot rubric outcome.
79#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
80#[serde(tag = "status", rename_all = "snake_case")]
81pub enum RubricReport {
82    /// The rubric passed.
83    Pass {
84        /// Pass reason.
85        reason: String,
86        /// Reported anomalies.
87        anomalies: Vec<String>,
88    },
89    /// The rubric failed.
90    Fail {
91        /// Failure reason.
92        reason: String,
93        /// Reported anomalies.
94        anomalies: Vec<String>,
95    },
96    /// The rubric could not be evaluated.
97    Error {
98        /// Error message.
99        message: String,
100    },
101    /// AI evaluation was skipped.
102    Skipped {
103        /// Skip reason.
104        reason: String,
105    },
106}
107
108pub(super) fn run_audit(args: AuditArgs) -> Result<()> {
109    let started = Instant::now();
110    let question = args.questions.resolve().map_err(|e| anyhow!(e))?;
111    create_clean_dir(&args.screenshots)?;
112    let viewports = if args.viewports.is_empty() {
113        vec![
114            ViewportArg {
115                name: "desktop".into(),
116                width: 1440,
117                height: 1100,
118            },
119            ViewportArg {
120                name: "mobile".into(),
121                width: 390,
122                height: 1200,
123            },
124        ]
125    } else {
126        args.viewports.clone()
127    };
128    let server = StaticServer::start(args.root.clone(), 0)?;
129    let url = format!("{}{}", server.base_url(), args.path.trim_start_matches('/'));
130    ensure_hosted_path_ok(&url)?;
131    let mut screenshots = Vec::new();
132
133    for viewport in viewports {
134        let path = args.screenshots.join(format!("{}.png", viewport.name));
135        capture_screenshot(&args, &url, &viewport, &path)?;
136        let rubric = if args.fake_pass {
137            RubricReport::Pass {
138                reason: "fake pass requested".into(),
139                anomalies: Vec::new(),
140            }
141        } else if args.skip_ai {
142            RubricReport::Skipped {
143                reason: "AI rubric skipped by flag".into(),
144            }
145        } else {
146            evaluate_audit_image(&args, &path)
147        };
148        screenshots.push(ScreenshotReport {
149            name: viewport.name,
150            width: viewport.width,
151            height: viewport.height,
152            path,
153            rubric,
154        });
155    }
156
157    let aggregate_status = aggregate_status(&screenshots);
158    let report = AuditReport {
159        schema_version: 1,
160        aggregate_status,
161        url,
162        elapsed_ms: started.elapsed().as_millis(),
163        options: AuditOptionsReport {
164            question: question.clone(),
165            model: args.model.clone(),
166            effort: args.effort.clone(),
167            system_prompt_provided: args.system_prompt.is_some(),
168            skip_ai: args.skip_ai,
169            fake_pass: args.fake_pass,
170        },
171        screenshots,
172    };
173    write_report(&args.report, &report)?;
174    if args.fail_on_rubric && matches!(aggregate_status, AuditStatus::Fail | AuditStatus::Error) {
175        bail!("visual rubric audit finished with aggregate status {aggregate_status:?}");
176    }
177    Ok(())
178}
179
180fn evaluate_audit_image(args: &AuditArgs, image: &Path) -> RubricReport {
181    let image_args = ImageArgs {
182        image: image.to_path_buf(),
183        questions: args.questions.clone(),
184        system_prompt: args.system_prompt.clone(),
185        model: args.model.clone(),
186        effort: args.effort.clone(),
187        codex_acp: args.codex_acp.clone(),
188        name: image.display().to_string(),
189        json: false,
190    };
191    match evaluate_image(&image_args) {
192        Ok(verdict) if verdict.verdict.is_pass() => RubricReport::Pass {
193            reason: verdict.reason,
194            anomalies: verdict.anomalies,
195        },
196        Ok(verdict) => RubricReport::Fail {
197            reason: verdict.reason,
198            anomalies: verdict.anomalies,
199        },
200        Err(error) => RubricReport::Error {
201            message: error.to_string(),
202        },
203    }
204}
205
206fn create_clean_dir(path: &Path) -> Result<()> {
207    if path.exists() {
208        match fs::remove_dir_all(path) {
209            Ok(()) => {}
210            Err(error) if error.kind() == std::io::ErrorKind::NotFound => {}
211            Err(error) => return Err(error).with_context(|| format!("clean {}", path.display())),
212        }
213    }
214    fs::create_dir_all(path).with_context(|| format!("create {}", path.display()))
215}
216
217fn write_report(path: &Path, report: &AuditReport) -> Result<()> {
218    if let Some(parent) = path.parent() {
219        fs::create_dir_all(parent).with_context(|| format!("create {}", parent.display()))?;
220    }
221    let json = serde_json::to_string_pretty(report)?;
222    fs::write(path, json).with_context(|| format!("write {}", path.display()))
223}
224
225fn aggregate_status(screenshots: &[ScreenshotReport]) -> AuditStatus {
226    if screenshots
227        .iter()
228        .any(|screenshot| matches!(screenshot.rubric, RubricReport::Error { .. }))
229    {
230        AuditStatus::Error
231    } else if screenshots
232        .iter()
233        .any(|screenshot| matches!(screenshot.rubric, RubricReport::Fail { .. }))
234    {
235        AuditStatus::Fail
236    } else if screenshots
237        .iter()
238        .all(|screenshot| matches!(screenshot.rubric, RubricReport::Skipped { .. }))
239    {
240        AuditStatus::Skipped
241    } else {
242        AuditStatus::Pass
243    }
244}
245
246fn ensure_hosted_path_ok(url: &str) -> Result<()> {
247    let status = http_status(url).with_context(|| format!("check hosted path {url}"))?;
248    if status != 200 {
249        bail!("hosted path {url} returned HTTP {status}");
250    }
251    Ok(())
252}
253
254fn http_status(url: &str) -> Result<u16> {
255    let rest = url
256        .strip_prefix("http://127.0.0.1:")
257        .with_context(|| format!("audit URL {url} must start with http://127.0.0.1:"))?;
258    let (port, path) = rest
259        .split_once('/')
260        .with_context(|| format!("audit URL {url} is missing a path after the port"))?;
261    let port = port
262        .parse::<u16>()
263        .with_context(|| format!("audit URL {url} has invalid port {port:?}"))?;
264    let mut stream = TcpStream::connect(("127.0.0.1", port))
265        .with_context(|| format!("connect local audit server on 127.0.0.1:{port}"))?;
266    write!(
267        stream,
268        "GET /{path} HTTP/1.1\r\nHost: 127.0.0.1\r\nConnection: close\r\n\r\n"
269    )?;
270    let mut status_line = String::new();
271    BufReader::new(stream)
272        .read_line(&mut status_line)
273        .with_context(|| format!("read HTTP status from local audit URL {url}"))?;
274    status_line
275        .split_whitespace()
276        .nth(1)
277        .with_context(|| format!("local audit URL {url} returned no HTTP status code"))?
278        .parse()
279        .with_context(|| format!("parse HTTP status from {status_line:?}"))
280}
281
282fn capture_screenshot(
283    args: &AuditArgs,
284    url: &str,
285    viewport: &ViewportArg,
286    output: &Path,
287) -> Result<()> {
288    let mut last_error = None;
289    for attempt in 0..=args.capture_retries {
290        if args.wait_ms > 0 {
291            thread::sleep(Duration::from_millis(args.wait_ms));
292        }
293        match capture_screenshot_once(args, url, viewport, output) {
294            Ok(()) => return Ok(()),
295            Err(error) => last_error = Some(error),
296        }
297        if attempt < args.capture_retries {
298            thread::sleep(Duration::from_millis(100));
299        }
300    }
301    Err(last_error.unwrap_or_else(|| anyhow!("browser capture failed")))
302}
303
304fn capture_screenshot_once(
305    args: &AuditArgs,
306    url: &str,
307    viewport: &ViewportArg,
308    output: &Path,
309) -> Result<()> {
310    let mut command = ProcessCommand::new(&args.browser);
311    command
312        .arg("--headless")
313        .arg("--disable-gpu")
314        .arg("--hide-scrollbars")
315        .arg("--no-sandbox")
316        .arg(format!(
317            "--window-size={},{}",
318            viewport.width, viewport.height
319        ));
320    if let Some(scale) = args.device_scale_factor {
321        command.arg(format!("--force-device-scale-factor={scale}"));
322    }
323    command
324        .args(&args.browser_args)
325        .arg(format!("--screenshot={}", output.display()))
326        .arg(url);
327    let status = command
328        .status()
329        .with_context(|| format!("run browser {}", args.browser.display()))?;
330    if !status.success() {
331        bail!(
332            "browser {} failed for {} with status {status}",
333            args.browser.display(),
334            viewport.name
335        );
336    }
337    if !output.exists() {
338        bail!("browser did not write {}", output.display());
339    }
340    Ok(())
341}