vor 0.2.1

Cross-platform performance instrumentation with an in-app egui panel and live system and GPU metrics.
Documentation
//! Headless capture: profile an ML-style loop with no panel, then read
//! the capture back. This is the whole client surface - annotations,
//! record_metric, frame_mark, and the VOR_RECORD env var (plus the
//! optional VOR_RECORD_FLAME shown below).
//!
//! Record a run:
//!
//!   VOR_RECORD=/tmp/run.vor cargo run --example headless
//!
//! Add flame frames (heavier; metrics-only is the default):
//!
//!   VOR_RECORD=/tmp/run.vor VOR_RECORD_FLAME=1 cargo run --example headless
//!
//! Summarize the capture (exercises vor::Reader):
//!
//!   cargo run --example headless -- /tmp/run.vor

use std::time::Instant;

/// Number of synthetic training steps `record_run` executes; one frame
/// is marked per step.
const STEPS: u32 = 200;
/// Synthetic batch size, so throughput = tokens / step time is a real
/// tok/s rather than a made-up number.
const TOKENS_PER_STEP: f64 = 4096.0;

/// One synthetic "training" step: spins through an amount of integer work
/// that grows with `step`, then returns a decaying loss value.
#[vor::profile]
fn train_step(step: u32) -> f64 {
    // Busy work so the flame chart (when one is captured) shows a named
    // scope and the measured step time is not zero.
    let rounds = 50_000 + u64::from(step) * 200;
    let checksum = (0..rounds).fold(0u64, |state, k| {
        state.wrapping_mul(2_654_435_761).wrapping_add(k)
    });
    // Keep the optimizer from discarding the loop as dead code.
    std::hint::black_box(checksum);
    // Loss = 1 / sqrt(step + 1): a decaying curve that is easy to spot in
    // the recorded scalar.
    (f64::from(step) + 1.0).sqrt().recip()
}

/// Runs `STEPS` synthetic training steps, recording `loss`, `lr` and
/// `throughput` for each and marking a frame after every step. Finishes by
/// calling `vor::flush_recording` and printing whether `VOR_RECORD` was set.
fn record_run() {
    vor::enable();
    // Declared a single time up front; loss and lr carry no unit.
    vor::record_metric_unit("throughput", "tok/s");
    for step in 0..STEPS {
        let clock = Instant::now();
        let loss = train_step(step);
        // Sample the clock right after the step so metric recording is
        // not counted in the step time.
        let step_secs = clock.elapsed().as_secs_f64();
        vor::record_metric("loss", loss);
        vor::record_metric("lr", 3e-4);
        vor::record_metric("throughput", TOKENS_PER_STEP / step_secs);
        vor::frame_mark();
    }
    vor::flush_recording();
    if let Some(path) = std::env::var_os("VOR_RECORD") {
        println!("recorded {STEPS} steps to {path:?}");
    } else {
        println!("ran {STEPS} steps (set VOR_RECORD=<path> to capture)");
    }
}

/// Prints a summary of the capture stored at `path`.
///
/// Output, in order: the reader's columns, whether flame capture was
/// enabled, the user metric columns, and finally the frame count, the
/// number of frames carrying flame data, the total top-level scope count
/// across those frames, and the last recorded `loss` value.
///
/// # Panics
///
/// Panics, naming `path` and the underlying error, if the capture cannot
/// be opened or a frame cannot be read. Also panics if a stored flame
/// frame fails to decode (see `scope_count`).
fn summarize(path: &str) {
    // `path` comes straight from the command line, so report which file
    // failed instead of a bare `unwrap` panic.
    let mut reader = vor::Reader::open(path)
        .unwrap_or_else(|err| panic!("failed to open capture {path:?}: {err:?}"));
    let columns: Vec<String> = reader
        .columns()
        .iter()
        .map(|c| format!("{} ({})", c.name, c.unit))
        .collect();
    println!("columns: {}", columns.join(", "));
    println!("flame_enabled: {}", reader.flame_enabled());

    let mut frames = 0u32;
    let mut flames = 0u32;
    let mut scopes = 0usize;
    // Stays NaN when no frame carries a "loss" metric.
    let mut last_loss = f64::NAN;
    while let Some(frame) = reader
        .next_frame()
        .unwrap_or_else(|err| panic!("failed to read a frame from {path:?}: {err:?}"))
    {
        // Exhaustive destructuring: this stops compiling if `vor::Frame`
        // gains a field, keeping the example in sync with the type.
        let vor::Frame {
            system: _,
            user,
            flame,
        } = frame;
        frames += 1;
        if let Some(bytes) = flame {
            flames += 1;
            scopes += scope_count(&bytes);
        }
        // No early exit: if "loss" appears more than once, the last wins.
        for (name, value) in &user {
            if name == "loss" {
                last_loss = *value;
            }
        }
    }
    // NOTE(review): user columns are queried only after every frame has
    // been read - presumably they are discovered while streaming; confirm
    // against vor::Reader before reordering.
    let user: Vec<String> = reader
        .user_columns()
        .iter()
        .map(|c| format!("{} ({})", c.name, c.unit))
        .collect();
    println!("user metrics: {}", user.join(", "));
    println!("frames: {frames}  flame_frames: {flames}  scopes: {scopes}  final loss: {last_loss:.4}");
}

/// Parses `bytes` as a single puffin frame and returns the number of
/// top-level scopes summed over all of its thread streams.
///
/// Panics if the bytes do not decode, which is the point: a successful
/// return confirms the capture stored a real puffin frame.
fn scope_count(bytes: &[u8]) -> usize {
    let mut input = std::io::Cursor::new(bytes);
    let decoded = puffin::FrameData::read_next(&mut input).unwrap().unwrap();
    let unpacked = decoded.unpacked().unwrap();

    let mut total = 0usize;
    for thread in unpacked.thread_streams.values() {
        let top_scopes = puffin::Reader::from_start(&thread.stream)
            .read_top_scopes()
            .unwrap();
        total += top_scopes.len();
    }
    total
}

/// Entry point: with a path as the first argument, summarize that
/// capture; with no arguments, record a run.
fn main() {
    if let Some(path) = std::env::args().nth(1) {
        summarize(&path);
    } else {
        record_run();
    }
}