Skip to main content

visual_rubric/cli/
audit.rs

1use std::fs;
2use std::io::{BufRead as _, BufReader, Write as _};
3use std::net::TcpStream;
4use std::path::{Path, PathBuf};
5use std::process::Command as ProcessCommand;
6use std::thread;
7use std::time::{Duration, Instant};
8
9use anyhow::{Context as _, Result, anyhow, bail};
10use serde::{Deserialize, Serialize};
11
12use super::static_server::StaticServer;
13use super::{AuditArgs, ImageArgs, ViewportArg, evaluate_image};
14
15/// Aggregate status for an audit run.
16#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
17#[serde(rename_all = "snake_case")]
18pub enum AuditStatus {
19    /// Every evaluated rubric passed.
20    Pass,
21    /// At least one evaluated rubric failed.
22    Fail,
23    /// At least one screenshot produced an execution or model error.
24    Error,
25    /// All screenshots skipped AI evaluation.
26    Skipped,
27}
28
29/// JSON report produced by the `audit` command.
30#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
31pub struct AuditReport {
32    /// Report schema version.
33    pub schema_version: u32,
34    /// Aggregate status across all screenshots.
35    pub aggregate_status: AuditStatus,
36    /// Hosted URL captured during the audit.
37    pub url: String,
38    /// Total audit elapsed time in milliseconds.
39    pub elapsed_ms: u128,
40    /// Options recorded for reproducibility.
41    pub options: AuditOptionsReport,
42    /// Per-screenshot audit results.
43    pub screenshots: Vec<ScreenshotReport>,
44}
45
46/// Options recorded in an [`AuditReport`].
47#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
48pub struct AuditOptionsReport {
49    /// Rubric question used for each screenshot.
50    pub question: String,
51    /// Model override used for the run.
52    pub model: Option<String>,
53    /// Reasoning effort override used for the run.
54    pub effort: Option<String>,
55    /// Whether a system prompt override was provided.
56    pub system_prompt_provided: bool,
57    /// Whether AI evaluation was skipped.
58    pub skip_ai: bool,
59    /// Whether deterministic pass verdicts were generated.
60    pub fake_pass: bool,
61}
62
63/// One screenshot entry in an [`AuditReport`].
64#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
65pub struct ScreenshotReport {
66    /// Viewport name.
67    pub name: String,
68    /// Viewport width in CSS pixels.
69    pub width: u32,
70    /// Viewport height in CSS pixels.
71    pub height: u32,
72    /// Screenshot path.
73    pub path: PathBuf,
74    /// Rubric result for this screenshot.
75    pub rubric: RubricReport,
76}
77
78/// Per-screenshot rubric outcome.
79#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
80#[serde(tag = "status", rename_all = "snake_case")]
81pub enum RubricReport {
82    /// The rubric passed.
83    Pass {
84        /// Pass reason.
85        reason: String,
86        /// Reported anomalies.
87        anomalies: Vec<String>,
88    },
89    /// The rubric failed.
90    Fail {
91        /// Failure reason.
92        reason: String,
93        /// Reported anomalies.
94        anomalies: Vec<String>,
95    },
96    /// The rubric could not be evaluated.
97    Error {
98        /// Error message.
99        message: String,
100    },
101    /// AI evaluation was skipped.
102    Skipped {
103        /// Skip reason.
104        reason: String,
105    },
106}
107
108pub(super) fn run_audit(args: AuditArgs) -> Result<()> {
109    let started = Instant::now();
110    create_clean_dir(&args.screenshots)?;
111    let viewports = if args.viewports.is_empty() {
112        vec![
113            ViewportArg {
114                name: "desktop".into(),
115                width: 1440,
116                height: 1100,
117            },
118            ViewportArg {
119                name: "mobile".into(),
120                width: 390,
121                height: 1200,
122            },
123        ]
124    } else {
125        args.viewports.clone()
126    };
127    let server = StaticServer::start(args.root.clone(), 0)?;
128    let url = format!("{}{}", server.base_url(), args.path.trim_start_matches('/'));
129    ensure_hosted_path_ok(&url)?;
130    let mut screenshots = Vec::new();
131
132    for viewport in viewports {
133        let path = args.screenshots.join(format!("{}.png", viewport.name));
134        capture_screenshot(&args, &url, &viewport, &path)?;
135        let rubric = if args.fake_pass {
136            RubricReport::Pass {
137                reason: "fake pass requested".into(),
138                anomalies: Vec::new(),
139            }
140        } else if args.skip_ai {
141            RubricReport::Skipped {
142                reason: "AI rubric skipped by flag".into(),
143            }
144        } else {
145            evaluate_audit_image(&args, &path)
146        };
147        screenshots.push(ScreenshotReport {
148            name: viewport.name,
149            width: viewport.width,
150            height: viewport.height,
151            path,
152            rubric,
153        });
154    }
155
156    let aggregate_status = aggregate_status(&screenshots);
157    let report = AuditReport {
158        schema_version: 1,
159        aggregate_status,
160        url,
161        elapsed_ms: started.elapsed().as_millis(),
162        options: AuditOptionsReport {
163            question: args.question.clone(),
164            model: args.model.clone(),
165            effort: args.effort.clone(),
166            system_prompt_provided: args.system_prompt.is_some(),
167            skip_ai: args.skip_ai,
168            fake_pass: args.fake_pass,
169        },
170        screenshots,
171    };
172    write_report(&args.report, &report)?;
173    if args.fail_on_rubric && matches!(aggregate_status, AuditStatus::Fail | AuditStatus::Error) {
174        bail!("visual rubric audit finished with aggregate status {aggregate_status:?}");
175    }
176    Ok(())
177}
178
179fn evaluate_audit_image(args: &AuditArgs, image: &Path) -> RubricReport {
180    let image_args = ImageArgs {
181        image: image.to_path_buf(),
182        question: args.question.clone(),
183        system_prompt: args.system_prompt.clone(),
184        model: args.model.clone(),
185        effort: args.effort.clone(),
186        codex_acp: args.codex_acp.clone(),
187        name: image.display().to_string(),
188        json: false,
189    };
190    match evaluate_image(&image_args) {
191        Ok(verdict) if verdict.verdict.is_pass() => RubricReport::Pass {
192            reason: verdict.reason,
193            anomalies: verdict.anomalies,
194        },
195        Ok(verdict) => RubricReport::Fail {
196            reason: verdict.reason,
197            anomalies: verdict.anomalies,
198        },
199        Err(error) => RubricReport::Error {
200            message: error.to_string(),
201        },
202    }
203}
204
205fn create_clean_dir(path: &Path) -> Result<()> {
206    if path.exists() {
207        match fs::remove_dir_all(path) {
208            Ok(()) => {}
209            Err(error) if error.kind() == std::io::ErrorKind::NotFound => {}
210            Err(error) => return Err(error).with_context(|| format!("clean {}", path.display())),
211        }
212    }
213    fs::create_dir_all(path).with_context(|| format!("create {}", path.display()))
214}
215
216fn write_report(path: &Path, report: &AuditReport) -> Result<()> {
217    if let Some(parent) = path.parent() {
218        fs::create_dir_all(parent).with_context(|| format!("create {}", parent.display()))?;
219    }
220    let json = serde_json::to_string_pretty(report)?;
221    fs::write(path, json).with_context(|| format!("write {}", path.display()))
222}
223
224fn aggregate_status(screenshots: &[ScreenshotReport]) -> AuditStatus {
225    if screenshots
226        .iter()
227        .any(|screenshot| matches!(screenshot.rubric, RubricReport::Error { .. }))
228    {
229        AuditStatus::Error
230    } else if screenshots
231        .iter()
232        .any(|screenshot| matches!(screenshot.rubric, RubricReport::Fail { .. }))
233    {
234        AuditStatus::Fail
235    } else if screenshots
236        .iter()
237        .all(|screenshot| matches!(screenshot.rubric, RubricReport::Skipped { .. }))
238    {
239        AuditStatus::Skipped
240    } else {
241        AuditStatus::Pass
242    }
243}
244
245fn ensure_hosted_path_ok(url: &str) -> Result<()> {
246    let status = http_status(url).with_context(|| format!("check hosted path {url}"))?;
247    if status != 200 {
248        bail!("hosted path {url} returned HTTP {status}");
249    }
250    Ok(())
251}
252
253fn http_status(url: &str) -> Result<u16> {
254    let rest = url
255        .strip_prefix("http://127.0.0.1:")
256        .context("only local audit URLs are supported")?;
257    let (port, path) = rest
258        .split_once('/')
259        .context("local audit URL missing path")?;
260    let port = port.parse::<u16>().context("local audit URL port")?;
261    let mut stream = TcpStream::connect(("127.0.0.1", port)).context("connect local server")?;
262    write!(
263        stream,
264        "GET /{path} HTTP/1.1\r\nHost: 127.0.0.1\r\nConnection: close\r\n\r\n"
265    )?;
266    let mut status_line = String::new();
267    BufReader::new(stream)
268        .read_line(&mut status_line)
269        .context("read local server status")?;
270    status_line
271        .split_whitespace()
272        .nth(1)
273        .context("missing HTTP status")?
274        .parse()
275        .context("parse HTTP status")
276}
277
278fn capture_screenshot(
279    args: &AuditArgs,
280    url: &str,
281    viewport: &ViewportArg,
282    output: &Path,
283) -> Result<()> {
284    let mut last_error = None;
285    for attempt in 0..=args.capture_retries {
286        if args.wait_ms > 0 {
287            thread::sleep(Duration::from_millis(args.wait_ms));
288        }
289        match capture_screenshot_once(args, url, viewport, output) {
290            Ok(()) => return Ok(()),
291            Err(error) => last_error = Some(error),
292        }
293        if attempt < args.capture_retries {
294            thread::sleep(Duration::from_millis(100));
295        }
296    }
297    Err(last_error.unwrap_or_else(|| anyhow!("browser capture failed")))
298}
299
300fn capture_screenshot_once(
301    args: &AuditArgs,
302    url: &str,
303    viewport: &ViewportArg,
304    output: &Path,
305) -> Result<()> {
306    let mut command = ProcessCommand::new(&args.browser);
307    command
308        .arg("--headless")
309        .arg("--disable-gpu")
310        .arg("--hide-scrollbars")
311        .arg("--no-sandbox")
312        .arg(format!(
313            "--window-size={},{}",
314            viewport.width, viewport.height
315        ));
316    if let Some(scale) = args.device_scale_factor {
317        command.arg(format!("--force-device-scale-factor={scale}"));
318    }
319    command
320        .args(&args.browser_args)
321        .arg(format!("--screenshot={}", output.display()))
322        .arg(url);
323    let status = command
324        .status()
325        .with_context(|| format!("run browser {}", args.browser.display()))?;
326    if !status.success() {
327        bail!(
328            "browser {} failed for {} with status {status}",
329            args.browser.display(),
330            viewport.name
331        );
332    }
333    if !output.exists() {
334        bail!("browser did not write {}", output.display());
335    }
336    Ok(())
337}