//! gpu-trace-perf 1.3.1
//!
//! Plays a collection of GPU traces under different environments to evaluate
//! driver changes on performance.
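//!
//! A minimal invocation might look like this (hypothetical paths, shown only
//! to illustrate the CLI defined below):
//!
//! ```text
//! gpu-trace-perf run --traces /path/to/traces ./env-a.sh ./env-b.sh
//! ```
//!
//! where ./env-a.sh and ./env-b.sh are user-supplied wrapper scripts that set
//! up an environment and then exec the replay command passed as arguments.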
#[macro_use] extern crate lazy_static;
extern crate assert_approx_eq;
extern crate statrs;
extern crate stderrlog;
extern crate structopt;
extern crate walkdir;
use log::error;
use log_once::error_once;
use regex::Regex;
use statrs::statistics::Statistics;
use std::io::prelude::*;
use std::process::Command;
use structopt::StructOpt;
use walkdir::WalkDir;

#[derive(Debug, StructOpt)]
#[structopt(version = "1.3.1", author = "Eric Anholt <eric@anholt.net>", about = "Plays a collection of GPU traces under different environments to evaluate driver changes on performance")]
struct Opts {
    /// Verbosity level; may be given multiple times to increase it
    #[structopt(short = "v", long = "verbose", parse(from_occurrences))]
    verbose: i32,
    #[structopt(subcommand)]
    subcmd: SubCommand,
}

#[derive(Debug, StructOpt)]
enum SubCommand {
    #[structopt(name = "run")]
    Run(Run),
}

#[derive(Debug, StructOpt)]
struct Run {
    #[structopt(short = "t", long = "traces", multiple = true, number_of_values = 1, takes_value = true, help = "path to directory containing traces")]
    traces: Vec<String>,
    #[structopt(index = 1, required = true, help = "command to set the runtime environment for the 'a' case")]
    a: String,
    #[structopt(index = 2, required = true, help = "command to set the runtime environment for the 'b' case")]
    b: String,
}
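
// The 'a' and 'b' environment commands are expected to be wrapper scripts
// that set up an environment and then exec the replay command handed to
// them.  A sketch of such a script (the exported variable is hypothetical;
// substitute whatever your driver change needs):
//
//     #!/bin/sh
//     export MESA_LOADER_DRIVER_OVERRIDE=zink   # hypothetical driver switch
//     exec "$@"
//
// Quoting "$@" matters: trace paths may contain spaces (see the FileNotFound
// tip in run_renderdoc()).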

// Walk the directories specified in the command-line arguments and build a list of the actual traces to run.
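// Symlinks are followed; unreadable directory entries are silently skipped
// (the filter_map drops WalkDir errors).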
fn collect_traces(directories: Vec<String>) -> Vec<String> {
    let mut traces: Vec<String> = Vec::new();

    for dir in directories {
        let walk = WalkDir::new(dir)
            .follow_links(true);

        for entry in walk.into_iter().filter_map(|e| e.ok()) {
            if let Some(path) = entry.path().to_str() {
                if path.ends_with(".trace") || path.ends_with(".rdc") {
                   traces.push(path.to_string());
                }
            } else {
                error!("Path {:?} not UTF-8, skipping", entry.path());
            }
        }
    }

    traces
}

struct TraceResults {
    a: Vec<f64>,
    b: Vec<f64>,
    path: String,
}

// Returns the total GPU time from the last frame of an apitrace replay --pgpu output
fn parse_apitrace_pgpu_output(output: &str) -> Option<i64> {
    lazy_static! {
        static ref CALL_RE: Regex = Regex::new("^call [0-9]+ -?[0-9]+ ([0-9]+)").unwrap();
    }

    let mut total = 0;
    let mut start_of_frame = true;
    for line in output.lines() {
        if line == "frame_end" {
            start_of_frame = true;
        } else {
            let cap = CALL_RE.captures(line);
            if let Some(cap) = cap {
                if start_of_frame {
                    total = 0;
                    start_of_frame = false;
                }
                match cap[1].parse::<i64>() {
                    Ok(gpu) => {
                        if gpu >= 0 {
                            total += gpu;
                        } else {
                            error!("apitrace produced GL_TIME_ELAPSED < 0, skipping(gpu hang?)");
                            return None;
                        }
                    }
                    Err(_) => {
                        error!("failed to parse apitrace's GL_TIME_ELAPSED");
                        return None;
                    }
                }

            }
        }
    }

    Some(total)
}

// Returns the total GPU time output from gpu-trace-perf-renderdoc-wrapper.py
fn parse_renderdoc_wrapper_output(output: &str) -> Option<f32> {
    lazy_static! {
        static ref CALL_RE: Regex = Regex::new("EID [0-9]*: (.*)").unwrap();
    }

    let mut total = 0.0;
    for line in output.lines() {
        if let Some(cap) = CALL_RE.captures(line) {
            match cap[1].parse::<f32>() {
                Ok(time) => total += time,
                Err(_) => {
                    println!("Failed to parse renderdoc time event '{}'", line);
                    return None;
                },
            }
        }
    }

    Some(total)
}

fn run_apitrace(trace: &str, environment: &str) -> Option<i64> {
    let apitrace_command = [
        "eglretrace", // apitrace replay otherwise assumes x11 (glretrace) for glx traces instead of using waffle
        "--pgpu",
        "--headless",
        "--loop=1", // loop the last frame once so we know that caches are hot, shaders are compiled, etc.
        trace,
    ];

    let mut command = Command::new(environment);
    command.args(&apitrace_command);
    command.env("NIR_VALIDATE", "0");

    let output = command.output()
        .expect("failed to `trace");

    if !output.status.success() {
        let stderr = std::str::from_utf8(&output.stderr).unwrap();
        if stderr.contains("waffle_context_create failed") {
            error_once!("apitrace reported waffle_context_create() failed, likely due to trace requiring too new of a GL version");
            return None;
        }

        println!("Failed to start apitrace:");
        let stdout = std::str::from_utf8(&output.stdout).unwrap();
        println!("{}", if stderr.len() > 2 { stderr } else { stdout });
        println!("command: {:?}", command);
        return None;
    }
    parse_apitrace_pgpu_output(&String::from_utf8_lossy(&output.stdout))
}

fn run_renderdoc(trace: &str, environment: &str) -> Option<f64> {
    let renderdoc_command = [
        "python3",
        "-", // script from stdin
        trace,
    ];

    let mut command = Command::new(environment);
    command.args(&renderdoc_command);
    command.env("NIR_VALIDATE", "0");
    command.stdin(std::process::Stdio::piped());
    command.stdout(std::process::Stdio::piped());
    command.stderr(std::process::Stdio::piped());

    let mut child = command.spawn().expect("failed to start python3");
    let stdin = child.stdin.as_mut().expect("Failed to open stdin");
    let renderdoc_wrapper_script = include_bytes!("gpu-trace-perf-renderdoc-wrapper.py");
    stdin.write_all(renderdoc_wrapper_script).expect("failed to write to python's stdin");
    let output = child.wait_with_output().expect("failed to read stdout");

    if !output.status.success() {
        let stderr = std::str::from_utf8(&output.stderr).unwrap();
        if stderr.contains("API is unsupported") {
            error_once!("renderdoc reported API (likely window system) unsupported, skipping trace");
            return None;
        }

        println!("Failed to start renderdoc:");
        println!("{}", stderr);
        println!("command: {:?}", command);

        if stderr.contains("FileNotFound") {
            println!("TIP: Failure to find a file with a space in its name probably means your wrapper script didn't quote the arguments")
        }
        return None;
    }
    parse_renderdoc_wrapper_output(&String::from_utf8_lossy(&output.stdout)).map(|t| t as f64)
}

fn run_trace(trace: &str, environment: &str) -> Option<f64> {
    if trace.ends_with(".trace") {
        run_apitrace(trace, environment).map(|t| t as f64)
    } else {
        assert!(trace.ends_with(".rdc"));
        run_renderdoc(trace, environment)
    }
}

fn run_each_trace(run_opts: &Run, results: &mut [TraceResults], a_first: bool) {
    for result in results.iter_mut() {
        println!("Running {}", result.path);
        let pair = if a_first {
            (run_trace(&result.path, &run_opts.a), run_trace(&result.path, &run_opts.b))
        } else {
            (run_trace(&result.path, &run_opts.b), run_trace(&result.path, &run_opts.a))
        };

        // Only push the pair if we got valid results from both; the stats expect len(a) == len(b).
        if let (Some(a), Some(b)) = pair {
            result.a.push(a);
            result.b.push(b);
        }
    }
}

struct StudentsTInverseCDFTable {
    df: u32,
    t: [f32; 7],
}

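// Looks up the critical value of Student's t distribution from a standard
// table.  The seven columns correspond to the one-sided CDF values 0.90,
// 0.95, 0.975, 0.99, 0.995, 0.999 and 0.9995 tested below; a df between
// tabulated rows falls through to the next larger tabulated df.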
fn students_t_inverse_cdf(cdf: f64, df: u32) -> f32 {
    #[allow(clippy::approx_constant)]
    const STUDENTS_T: &[StudentsTInverseCDFTable] = &[
        StudentsTInverseCDFTable { df: 1, t: [3.078, 6.314, 12.706, 31.821, 63.656, 318.289, 636.578] },
        StudentsTInverseCDFTable { df: 2, t: [1.886, 2.920, 4.303, 6.965, 9.925, 22.328, 31.600] },
        StudentsTInverseCDFTable { df: 3, t: [1.638, 2.353, 3.182, 4.541, 5.841, 10.214, 12.924] },
        StudentsTInverseCDFTable { df: 4, t: [1.533, 2.132, 2.776, 3.747, 4.604, 7.173, 8.610] },
        StudentsTInverseCDFTable { df: 5, t: [1.476, 2.015, 2.571, 3.365, 4.032, 5.894, 6.869] },
        StudentsTInverseCDFTable { df: 6, t: [1.440, 1.943, 2.447, 3.143, 3.707, 5.208, 5.959] },
        StudentsTInverseCDFTable { df: 7, t: [1.415, 1.895, 2.365, 2.998, 3.499, 4.785, 5.408] },
        StudentsTInverseCDFTable { df: 8, t: [1.397, 1.860, 2.306, 2.896, 3.355, 4.501, 5.041] },
        StudentsTInverseCDFTable { df: 9, t: [1.383, 1.833, 2.262, 2.821, 3.250, 4.297, 4.781] },
        StudentsTInverseCDFTable { df: 10, t: [1.372, 1.812, 2.228, 2.764, 3.169, 4.144, 4.587] },
        StudentsTInverseCDFTable { df: 11, t: [1.363, 1.796, 2.201, 2.718, 3.106, 4.025, 4.437] },
        StudentsTInverseCDFTable { df: 12, t: [1.356, 1.782, 2.179, 2.681, 3.055, 3.930, 4.318] },
        StudentsTInverseCDFTable { df: 13, t: [1.350, 1.771, 2.160, 2.650, 3.012, 3.852, 4.221] },
        StudentsTInverseCDFTable { df: 14, t: [1.345, 1.761, 2.145, 2.624, 2.977, 3.787, 4.140] },
        StudentsTInverseCDFTable { df: 15, t: [1.341, 1.753, 2.131, 2.602, 2.947, 3.733, 4.073] },
        StudentsTInverseCDFTable { df: 16, t: [1.337, 1.746, 2.120, 2.583, 2.921, 3.686, 4.015] },
        StudentsTInverseCDFTable { df: 17, t: [1.333, 1.740, 2.110, 2.567, 2.898, 3.646, 3.965] },
        StudentsTInverseCDFTable { df: 18, t: [1.330, 1.734, 2.101, 2.552, 2.878, 3.610, 3.922] },
        StudentsTInverseCDFTable { df: 19, t: [1.328, 1.729, 2.093, 2.539, 2.861, 3.579, 3.883] },
        StudentsTInverseCDFTable { df: 20, t: [1.325, 1.725, 2.086, 2.528, 2.845, 3.552, 3.850] },
        StudentsTInverseCDFTable { df: 21, t: [1.323, 1.721, 2.080, 2.518, 2.831, 3.527, 3.819] },
        StudentsTInverseCDFTable { df: 22, t: [1.321, 1.717, 2.074, 2.508, 2.819, 3.505, 3.792] },
        StudentsTInverseCDFTable { df: 23, t: [1.319, 1.714, 2.069, 2.500, 2.807, 3.485, 3.768] },
        StudentsTInverseCDFTable { df: 24, t: [1.318, 1.711, 2.064, 2.492, 2.797, 3.467, 3.745] },
        StudentsTInverseCDFTable { df: 25, t: [1.316, 1.708, 2.060, 2.485, 2.787, 3.450, 3.725] },
        StudentsTInverseCDFTable { df: 26, t: [1.315, 1.706, 2.056, 2.479, 2.779, 3.435, 3.707] },
        StudentsTInverseCDFTable { df: 27, t: [1.314, 1.703, 2.052, 2.473, 2.771, 3.421, 3.689] },
        StudentsTInverseCDFTable { df: 28, t: [1.313, 1.701, 2.048, 2.467, 2.763, 3.408, 3.674] },
        StudentsTInverseCDFTable { df: 29, t: [1.311, 1.699, 2.045, 2.462, 2.756, 3.396, 3.660] },
        StudentsTInverseCDFTable { df: 30, t: [1.310, 1.697, 2.042, 2.457, 2.750, 3.385, 3.646] },
        StudentsTInverseCDFTable { df: 60, t: [1.296, 1.671, 2.000, 2.390, 2.660, 3.232, 3.460] },
        StudentsTInverseCDFTable { df: 120, t: [1.289, 1.658, 1.980, 2.358, 2.617, 3.160, 3.373] },
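        // !0u32 (u32::MAX) stands in for df = infinity, i.e. the normal approximation.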
        StudentsTInverseCDFTable { df: !0u32, t: [1.282, 1.645, 1.960, 2.326, 2.576, 3.091, 3.291] },
    ];

    for entry in STUDENTS_T {
        if df <= entry.df {
            if cdf <= 0.90 {
                return entry.t[0];
            } else if cdf <= 0.95 {
                return entry.t[1];
            } else if cdf <= 0.975 {
                return entry.t[2];
            } else if cdf <= 0.99 {
                return entry.t[3];
            } else if cdf <= 0.995 {
                return entry.t[4];
            } else if cdf <= 0.999 {
                return entry.t[5];
            } else /* 0.9995 */ {
                return entry.t[6];
            }
        }
    }
    unreachable!("didn't find df")
}

struct ResultStats<'a> {
    change: f64,
    means: [f64; 2],
    results: &'a TraceResults,
    error: f64,
}

impl ResultStats<'_> {
    fn new(results: &TraceResults, alpha: f64) -> ResultStats {
        // statrs's Statistics methods consume their input, hence the clones.
        let means = [results.a.clone().mean(), results.b.clone().mean()];

        let n1 = results.a.len() as f64;
        let n2 = results.b.len() as f64;
        let s1 = results.a.clone().std_dev();
        let s2 = results.b.clone().std_dev();

        let df = n1 + n2 - 2.0;
        // Pooled standard deviation (Cohen's pooled estimate)
        let pooled_sd = (((n1 - 1.0) * s1 * s1 + (n2 - 1.0) * s2 * s2) / df).sqrt();

        let standard_error = pooled_sd * (1.0 / n1 + 1.0 / n2).sqrt();

        // We're doing two-tailed (you might increase or decrease perf), so 95%
        // CI uses the .975 value.
        let t = students_t_inverse_cdf(1.0 - alpha / 2.0, df as u32) as f64;
        let e = t * standard_error;
        ResultStats {
            results,
            means,
            change: (means[1] - means[0]) / means[0],
            error: e / means[0],
        }
    }

    fn has_time(&self) -> bool {
        self.means[0] != 0.0 && self.means[1] != 0.0
    }
}

fn print_stats(results: &[TraceResults]) {
    let mut stats = Vec::new();

    // Apply the Bonferroni correction for multiple hypothesis testing.
    let alpha = 0.05 / results.len() as f64;
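    // e.g. with 10 traces, each per-trace test runs at alpha = 0.05 / 10 = 0.005.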

    let mut pathlen = 0;
    for result in results {
        if !result.a.is_empty() && !result.b.is_empty() {
            stats.push(ResultStats::new(result, alpha));

            if result.path.len() > pathlen {
                pathlen = result.path.len();
            }
        }
    }

    stats.sort_by(|a, b| {
        if !a.has_time() {
            if !b.has_time() {
                std::cmp::Ordering::Equal
            } else {
                std::cmp::Ordering::Greater
            }
        } else if !b.has_time() {
            std::cmp::Ordering::Less
        } else {
            a.change.partial_cmp(&b.change).unwrap()
        }
    });

    for stat in stats {
        let prefix = format!("{path:pathlen$}: ",
            path = stat.results.path,
            pathlen = pathlen);

        let change = if stat.has_time() {
            format!("{:>7.2}%", stat.change * 100.0)
        } else {
            "no time detected".to_string()
        };

        let error = if stat.results.a.len() > 1 && stat.has_time() {
            format!(" (+/- {:5.1}%)", error = stat.error * 100.0)
        } else {
            "".to_string()
        };

        let count = if stat.results.a.len() == stat.results.b.len() {
            format!("{}", stat.results.a.len())
        } else {
            format!("{}/{}", stat.results.a.len(), stat.results.b.len())
        };

        println!("{prefix}{change}{error} (n={count})",
            prefix = prefix,
            change = change,
            error = error,
            count = count);
    }
}

fn run(run_opts: &Run, traces: Vec<String>) {
    let mut results: Vec<TraceResults> =
        traces
        .into_iter()
        .map(|path| TraceResults {
            a: Vec::new(),
            b: Vec::new(),
            path
        })
        .collect();

    let mut a_first = true;
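    // Sample forever (until the user interrupts), printing updated stats
    // after every pass.  The order of the two environments alternates each
    // round, presumably to average out ordering effects (caches warming up,
    // thermal/clock drift between the first and second run).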
    loop {
        run_each_trace(run_opts, &mut results, a_first);
        print_stats(&results);

        a_first = !a_first;
    }
}

// Test the environment scripts people hand us before trying to run an actual trace
fn test_run_environment(environment: &str) -> bool {
    let output = Command::new(environment)
        .arg("true")
        .output();
    match output {
        Ok(_) => true,
        Err(err) => {
            println!("Failed to spawn test invocation of '{} true': {:?}", environment, err);
            println!("TIP: Exec format error here probably means you need #!/bin/sh in your shell script.");
            false
        }
    }
}

fn main() {
    let opts = Opts::from_args();
    stderrlog::new()
        .module(module_path!())
        .verbosity(opts.verbose as usize)
        .init()
        .unwrap();

    match &opts.subcmd {
        SubCommand::Run(run_opts) => {
            let traces = collect_traces(run_opts.traces.clone());
            if traces.is_empty() {
                println!("No traces found in the given directories:");
                for t in &run_opts.traces {
                    println!("  {}", t);
                }
                std::process::exit(1);
            }

            if !test_run_environment(&run_opts.a) || !test_run_environment(&run_opts.b) {
                std::process::exit(1);
            }

            run(run_opts, traces);
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use assert_approx_eq::assert_approx_eq;

    #[test]
    fn test_apitrace_parsing() {
        // Actual apitrace output, trimmed down for the testcase.
        let apitrace_input = "
# call no gpu_start gpu_dura cpu_start cpu_dura vsize_start vsize_dura rss_start rss_dura pixels program name
call 44 0 0 0 0 0 0 0 0 0 0 glViewport
call 56 25082334 50166 0 0 0 0 0 0 0 0 glClear
call 81 41719667 0 0 0 0 0 0 0 0 0 glClear
call 176 42206667 472583 0 0 0 0 0 0 0 7 glDrawArrays
frame_end
call 222 0 0 0 0 0 0 0 0 0 4 glClearColor
call 224 45001334 21666 0 0 0 0 0 0 0 4 glClear
call 231 45023750 38000 0 0 0 0 0 0 0 7 glClear
call 239 45062584 519333 0 0 0 0 0 0 0 7 glDrawArrays
frame_end
call 222 0 0 0 0 0 0 0 0 0 4 glClearColor
call 224 47438000 13666 0 0 0 0 0 0 0 4 glClear
call 231 47452417 59583 0 0 0 0 0 0 0 7 glClear
call 239 47512917 579083 0 0 0 0 0 0 0 7 glDrawArrays
frame_end
Rendered 3 frames in 0.0539452 secs, average of 55.612 fps
";
        assert_eq!(parse_apitrace_pgpu_output(apitrace_input).unwrap(), 13_666 + 59_583 + 579_083)
    }

    #[test]
    fn test_apitrace_parsing_negative_start() {
        let apitrace_input = "call 318 -8883437858 156 0 0 0 0 0 0 0 0 glBlitFramebuffer";
        assert_eq!(parse_apitrace_pgpu_output(apitrace_input).unwrap(), 156)
    }

    #[test]
    fn test_renderdoc_parsing() {
        // Actual renderdoc wrapper output, trimmed down for the testcase.
        let renderdoc_input = "
Counter 1 (GPU Duration):
    Time taken for this event on the GPU, as measured by delta between two GPU timestamps.
    Returns 8 byte CompType.Double, representing CounterUnit.Seconds
Counter 2000000 (N vertices submitted):
    N vertices submitted
    Returns 8 byte CompType.UInt, representing CounterUnit.Absolute
EID 52: 0.000045
EID 370: 0.000004
EID 407: 0.000006
";

        assert_approx_eq!(parse_renderdoc_wrapper_output(renderdoc_input).unwrap(), 0.000_045 + 0.000_004 + 0.000_006, 0.000_001);
    }

    #[test]
    fn test_renderdoc_nan_parsing() {
        // Actual renderdoc wrapper output, trimmed down for the testcase.
        let renderdoc_input = "
EID 52: 0.000045
EID 370: nan
EID 407: 0.000006
";

        assert_eq!(parse_renderdoc_wrapper_output(renderdoc_input), None);
    }

    #[test]
    fn test_stats() {
        // Results from scipy
        assert_approx_eq!(students_t_inverse_cdf(0.95, 2), 2.919_985, 0.001);
        assert_approx_eq!(students_t_inverse_cdf(0.995, 5), 4.032_142, 0.001);

        // Result from ministat
        let results = TraceResults {
            a: vec!(1.0, 2.0, 3.0, 4.0, 5.0),
            b: vec!(5.0, 6.0, 7.0, 8.0, 9.0),
            path: "".to_string(),
        };
        let stats = ResultStats::new(&results, 0.05);
        assert_approx_eq!(stats.change, 1.333_333, 0.001);
        assert_approx_eq!(stats.error, 0.768_668, 0.001);
    }
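
    // Illustrative extra check (an assumption about intent, not from the
    // original suite): a df that isn't tabulated falls through to the next
    // tabulated row, so df = 45 uses the df = 60 entry.
    #[test]
    fn test_t_table_fallthrough() {
        assert_approx_eq!(students_t_inverse_cdf(0.975, 45), 2.000, 0.000_1);
    }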
}