diskann_benchmark_runner/app.rs
1/*
2 * Copyright (c) Microsoft Corporation.
3 * Licensed under the MIT license.
4 */
5
6//! The CLI frontend for benchmark applications built with this crate.
7//!
8//! [`App`] provides a [`clap`]-based command line interface that handles input parsing,
9//! benchmark dispatch, and regression checking. Consumers build a binary by registering
10//! [`Input`](crate::Input)s and [`Benchmark`](crate::Benchmark)s, then forwarding to
11//! [`App::parse`] and [`App::run`].
12//!
13//! # Subcommands
14//!
15//! ## Standard Workflow
16//!
17//! * `inputs [NAME]`: List available input kinds, or describe one by name.
18//! * `benchmarks`: List registered benchmarks and their descriptions.
19//! * `skeleton`: Print a skeleton input JSON file.
20//! * `run --input-file <FILE> --output-file <FILE> [--dry-run]`: Run benchmarks.
21//!
22//! ## Regression Checks
23//!
24//! These are accessed via `check <SUBCOMMAND>`:
25//!
26//! * `check skeleton`: Print a skeleton tolerance JSON file.
27//! * `check tolerances [NAME]`: List tolerance kinds, or describe one by name.
28//! * `check verify --tolerances <FILE> --input-file <FILE>`: Validate a tolerance file
29//! against an input file.
30//! * `check run --tolerances <FILE> --input-file <FILE> --before <FILE> --after <FILE> [--output-file <FILE>]`:
31//! Run regression checks.
32//!
33//! # Example
34//!
35//! A typical binary using this crate:
36//!
//! ```rust,no_run
//! use diskann_benchmark_runner::{App, registry};
//!
//! fn main() -> anyhow::Result<()> {
//!     let mut registry = registry::Registry::new();
//!     // registry.register("my-bench", MyBenchmark::default())?;
//!     // registry.register_regression("my-regression", MyRegressionBenchmark::default())?;
//!
//!     let app = App::parse();
//!     let mut output = diskann_benchmark_runner::output::default();
//!     app.run(&registry, &mut output)
//! }
//! ```
50//!
51//! # Regression Workflow
52//!
53//! 1. Run benchmarks twice (e.g. before and after a code change) with `run`, producing
54//! two output files.
55//! 2. Author a tolerance file describing acceptable variation (use `check skeleton` and
56//! `check tolerances` for guidance).
57//! 3. Validate the tolerance file with `check verify`.
58//! 4. Compare the two output files with `check run`.
59
60use std::{io::Write, path::PathBuf};
61
62use clap::{Parser, Subcommand};
63
64use crate::{
65 internal,
66 jobs::{self, Jobs},
67 output::Output,
68 registry,
69 result::Checkpoint,
70 utils::fmt::{Banner, Indent},
71};
72
73/// Check if we're running in debug mode and error if not allowed.
74fn check_debug_mode(allow_debug: bool) -> anyhow::Result<()> {
75 // Unit tests are treated as debug mode to ensure consistent behavior across builds.
76 if cfg!(any(test, debug_assertions)) && !allow_debug {
77 anyhow::bail!(
78 "Benchmarking in debug mode produces misleading performance results.\n\
79 Please compile in release mode or use the --allow-debug flag to bypass this check."
80 );
81 }
82 Ok(())
83}
84
85/// Parsed command line options.
86#[derive(Debug, Subcommand)]
87pub enum Commands {
88 /// List the kinds of input formats available for ingestion.
89 Inputs {
90 /// Describe the layout of the named input kind.
91 describe: Option<String>,
92 },
93 /// List the available benchmarks.
94 Benchmarks {},
95 /// Provide a skeleton JSON file for running a set of benchmarks.
96 Skeleton,
97 /// Run a list of benchmarks.
98 Run {
99 /// The input file to run.
100 #[arg(long = "input-file")]
101 input_file: PathBuf,
102 /// The path where the output file should reside.
103 #[arg(long = "output-file")]
104 output_file: PathBuf,
105 /// Parse an input file and perform all validation checks, but don't actually run any
106 /// benchmarks.
107 #[arg(long, action)]
108 dry_run: bool,
109 /// Allow running benchmarks in debug mode (not recommended).
110 #[arg(long, action)]
111 allow_debug: bool,
112 },
113 #[command(subcommand)]
114 Check(Check),
115}
116
117/// Subcommands for regression check operations.
118#[derive(Debug, Subcommand)]
119pub enum Check {
120 /// Provide a skeleton of the overall tolerance files.
121 Skeleton,
122 /// List all the tolerance inputs accepted by the benchmark executable.
123 Tolerances {
124 /// Describe the layout for the named tolerance kind.
125 describe: Option<String>,
126 },
127 /// Verify the tolerance file with the accompanying input file.
128 Verify {
129 /// The tolerance file to check.
130 #[arg(long = "tolerances")]
131 tolerances: PathBuf,
132 /// The benchmark input file used to generate the data that will be compared.
133 #[arg(long = "input-file")]
134 input_file: PathBuf,
135 },
136 /// Run regression checks against before/after output files.
137 Run {
138 /// The tolerance file to check.
139 #[arg(long = "tolerances")]
140 tolerances: PathBuf,
141 /// The benchmark input file used to generate the data that will be compared.
142 #[arg(long = "input-file")]
143 input_file: PathBuf,
144 /// The `--output-file` from a benchmark to use as a baseline.
145 #[arg(long = "before")]
146 before: PathBuf,
147 /// The `--output-file` that will be checked for regression against `before`.
148 #[arg(long = "after")]
149 after: PathBuf,
150 /// Optional path to write the JSON check results.
151 #[arg(long = "output-file")]
152 output_file: Option<PathBuf>,
153 },
154}
155
156/// The CLI used to drive a benchmark application.
157#[derive(Debug, Parser)]
158pub struct App {
159 #[command(subcommand)]
160 command: Commands,
161}
162
163impl App {
164 /// Construct [`Self`] by parsing commandline arguments from [`std::env::args`].
165 ///
166 /// This simply redirects to [`clap::Parser::parse`] and is provided to allow parsing
167 /// without the [`clap::Parser`] trait in scope.
168 pub fn parse() -> Self {
169 <Self as clap::Parser>::parse()
170 }
171
172 /// Construct [`Self`] by parsing command line arguments from the iterator.
173 ///
174 /// This simply redirects to [`clap::Parser::try_parse_from`] and is provided to allow
175 /// parsing without the [`clap::Parser`] trait in scope.
176 pub fn try_parse_from<I, T>(itr: I) -> anyhow::Result<Self>
177 where
178 I: IntoIterator<Item = T>,
179 T: Into<std::ffi::OsString> + Clone,
180 {
181 Ok(<Self as clap::Parser>::try_parse_from(itr)?)
182 }
183
184 /// Construct [`Self`] directly from a [`Commands`] enum.
185 pub fn from_commands(command: Commands) -> Self {
186 Self { command }
187 }
188
189 /// Run the application using the registered `inputs` and `benchmarks`.
190 pub fn run(
191 &self,
192 registry: ®istry::Registry,
193 mut output: &mut dyn Output,
194 ) -> anyhow::Result<()> {
195 match &self.command {
196 // If a named benchmark isn't given, then list the available benchmarks.
197 Commands::Inputs { describe } => {
198 if let Some(describe) = describe {
199 if let Some(input) = registry.input(describe) {
200 let repr = jobs::Unprocessed::format_input(input)?;
201 writeln!(
202 output,
203 "The example JSON representation for \"{}\" is:",
204 describe
205 )?;
206 writeln!(output, "{}", serde_json::to_string_pretty(&repr)?)?;
207 return Ok(());
208 } else {
209 writeln!(output, "No input found for \"{}\"", describe)?;
210 }
211
212 return Ok(());
213 }
214
215 writeln!(output, "Available input kinds are listed below:")?;
216 let mut tags: Vec<_> = registry.input_tags().collect();
217 tags.sort();
218 for i in tags.iter() {
219 writeln!(output, " {}", i)?;
220 }
221 }
222 // List the available benchmarks.
223 Commands::Benchmarks {} => {
224 writeln!(output, "Registered Benchmarks:")?;
225 for (name, description) in registry.names() {
226 write!(output, " {name}:")?;
227 if description.is_empty() {
228 writeln!(output)?;
229 } else {
230 writeln!(output)?;
231 write!(output, "{}", Indent::new(&description, 8))?;
232 }
233 }
234 }
235 Commands::Skeleton => {
236 writeln!(output, "Skeleton input file:")?;
237 writeln!(output, "{}", Jobs::example()?)?;
238 }
239 // Run the benchmarks
240 Commands::Run {
241 input_file,
242 output_file,
243 dry_run,
244 allow_debug,
245 } => {
246 // Parse and validate the input.
247 let run = Jobs::load(input_file, registry)?;
248 // Check if we have a match for each benchmark.
249 for job in run.jobs().iter() {
250 const MAX_METHODS: usize = 3;
251 if let Err(mismatches) = registry.debug(job, MAX_METHODS) {
252 let repr = serde_json::to_string_pretty(&job.serialize()?)?;
253
254 writeln!(
255 output,
256 "Could not find a match for the following input:\n\n{}\n",
257 repr
258 )?;
259 writeln!(output, "Closest matches:\n")?;
260 for (i, mismatch) in mismatches.into_iter().enumerate() {
261 writeln!(output, " {}. \"{}\":", i + 1, mismatch.method(),)?;
262 writeln!(output, "{}", Indent::new(mismatch.reason(), 8),)?;
263 }
264 writeln!(output)?;
265
266 return Err(anyhow::Error::msg(
267 "could not find a benchmark for all inputs",
268 ));
269 }
270 }
271
272 if *dry_run {
273 writeln!(
274 output,
275 "Success - skipping running benchmarks because \"--dry-run\" was used."
276 )?;
277 return Ok(());
278 }
279
280 // Check for debug mode before running benchmarks.
281 // This check is placed after the dry-run early return since dry-run doesn't
282 // actually execute benchmarks and thus won't produce misleading performance results.
283 check_debug_mode(*allow_debug)?;
284
285 // The collection of output results for each run.
286 let mut results = Vec::<serde_json::Value>::new();
287
288 // Now - we've verified the integrity of all the jobs we want to run and that
289 // each job can match an associated benchmark.
290 //
291 // All that's left is to actually run the benchmarks.
292 let jobs = run.jobs();
293 let serialized = jobs
294 .iter()
295 .map(|job| {
296 Ok(serde_json::to_value(jobs::Unprocessed::new(
297 job.tag().into(),
298 job.serialize()?,
299 ))?)
300 })
301 .collect::<anyhow::Result<Vec<_>>>()?;
302 for (i, job) in jobs.iter().enumerate() {
303 let prefix: &str = if i != 0 { "\n\n" } else { "" };
304 writeln!(
305 output,
306 "{}{}",
307 prefix,
308 Banner::new(&format!("Running Job {} of {}", i + 1, jobs.len()))
309 )?;
310
311 // Run the specified job.
312 let checkpoint = Checkpoint::new(&serialized, &results, output_file)?;
313 let r = registry.call(job, checkpoint, output)?;
314
315 // Collect the results
316 results.push(r);
317
318 // Save everything.
319 Checkpoint::new(&serialized, &results, output_file)?.save()?;
320 }
321 }
322 // Extensions
323 Commands::Check(check) => return self.check(check, registry, output),
324 };
325 Ok(())
326 }
327
328 // Extensions
329 fn check(
330 &self,
331 check: &Check,
332 registry: ®istry::Registry,
333 mut output: &mut dyn Output,
334 ) -> anyhow::Result<()> {
335 match check {
336 Check::Skeleton => {
337 let message = "Skeleton tolerance file.\n\n\
338 Each tolerance is paired with an input that is structurally\n\
339 matched with an entry in the corresponding `--input-file`.\n\n\
340 This allow a single tolerance entry to be applied to multiple\n\
341 benchmark runs as long as this structural mapping is unambiguous.\n";
342
343 writeln!(output, "{}", message)?;
344 writeln!(output, "{}", internal::regression::Raw::example())?;
345 Ok(())
346 }
347 Check::Tolerances { describe } => {
348 let tolerances = registry.tolerances();
349
350 match describe {
351 Some(name) => match tolerances.get(&**name) {
352 Some(registered) => {
353 let repr = internal::regression::RawInner::new(
354 jobs::Unprocessed::new(
355 "".to_string(),
356 serde_json::Value::Object(Default::default()),
357 ),
358 jobs::Unprocessed::format_input(registered.tolerance)?,
359 );
360
361 write!(
362 output,
363 "The example JSON representation for \"{}\" is shown below.\n\
364 Populate the \"input\" field with a compatible benchmark input.\n\
365 Matching will be performed by partial structural map on the input.\n\n",
366 name
367 )?;
368 writeln!(output, "{}", serde_json::to_string_pretty(&repr)?)?;
369 Ok(())
370 }
371 None => {
372 writeln!(output, "No tolerance input found for \"{}\"", name)?;
373 Ok(())
374 }
375 },
376 None => {
377 writeln!(output, "Available tolerance kinds are listed below.")?;
378
379 // Print the registered tolerance files in alphabetical order.
380 let mut keys: Vec<_> = tolerances.keys().collect();
381 keys.sort();
382 for k in keys {
383 // This access should not panic - we just obtained all the keys.
384 let registered = &tolerances[k];
385 writeln!(output, " {}", registered.tolerance.tag())?;
386 for pair in registered.regressions.iter() {
387 writeln!(
388 output,
389 " - \"{}\" => \"{}\"",
390 pair.input_tag(),
391 pair.name(),
392 )?;
393 }
394 }
395 Ok(())
396 }
397 }
398 }
399 Check::Verify {
400 tolerances,
401 input_file,
402 } => {
403 let _ = internal::regression::Checks::new(tolerances, input_file, registry)?;
404 Ok(())
405 }
406 Check::Run {
407 tolerances,
408 input_file,
409 before,
410 after,
411 output_file,
412 } => {
413 let checks = internal::regression::Checks::new(tolerances, input_file, registry)?;
414 let jobs = checks.jobs(before, after)?;
415 jobs.run(output, output_file.as_deref())?;
416 Ok(())
417 }
418 }
419 }
420}
421
422///////////
423// Tests //
424///////////
425
/// The integration tests below look inside the `tests` directory for folders.
///
/// ## Input Files
///
/// Each folder should have at least a `stdin.txt` file specifying the command line to give
/// to the `App` parser. The file may contain several command lines (one per line); blank
/// lines and lines starting with `#` are skipped. Only the output of the last command is
/// compared, and only the last command is allowed to fail.
///
/// Within the `stdin.txt` command line, there are several special symbols:
///
/// * $INPUT - Resolves to `input.json` in the same directory as the `stdin.txt` file.
/// * $OUTPUT - Resolves to `output.json` in a temporary directory.
/// * $TOLERANCES - Resolves to `tolerances.json` in the test directory.
/// * $REGRESSION_INPUT - Resolves to `regression_input.json` in the test directory.
/// * $CHECK_OUTPUT - Resolves to `checks.json` in a temporary directory.
///
/// As mentioned - an input JSON file can be included and must be named "input.json" to be
/// discoverable.
///
/// ## Output Files
///
/// Tests should have at least a `stdout.txt` file with the expected outputs for running the
/// command in `stdin.txt`. If an output JSON file is expected, it should be named
/// `output.json` (or `checks.json` for regression check results).
///
/// ## Test Discovery and Running
///
/// The unit test will visit each folder in `tests` and run the outlined scenario. The
/// `stdout.txt` expected output is compared to the actual output and if they do not match,
/// the test fails.
///
/// Additionally, if `output.json` is present, the unit test will verify that (1) the command
/// did in fact produce an output JSON file and (2) the generated file matches the expected
/// file. The same applies to `checks.json`.
///
/// ## Regenerating Expected Results
///
/// The benchmark output will naturally change over time. Running the unit tests with the
/// environment variable
/// ```text
/// POCKETBENCH_TEST=overwrite
/// ```
/// will replace the `stdout.txt` (and `output.json` if one was generated) for each test
/// scenario. Developers should then consult `git diff` to ensure that major regressions
/// to the output did not occur.
#[cfg(test)]
mod tests {
    use super::*;

    use std::{
        ffi::OsString,
        path::{Path, PathBuf},
    };

    use crate::{registry, ux};

    const ENV: &str = "POCKETBENCH_TEST";

    // Expected I/O files.
    const STDIN: &str = "stdin.txt";
    const STDOUT: &str = "stdout.txt";
    const INPUT_FILE: &str = "input.json";
    const OUTPUT_FILE: &str = "output.json";

    // Regression Extension
    const TOLERANCES_FILE: &str = "tolerances.json";
    const REGRESSION_INPUT_FILE: &str = "regression_input.json";
    const CHECK_OUTPUT_FILE: &str = "checks.json";

    const ALL_GENERATED_OUTPUTS: [&str; 2] = [OUTPUT_FILE, CHECK_OUTPUT_FILE];

    // Read the entire contents of a file to a normalized string, panicking with `ctx` in
    // the message if the file cannot be read.
    fn read_to_string<P: AsRef<Path>>(path: P, ctx: &str) -> String {
        let path = path.as_ref();
        let contents = std::fs::read_to_string(path)
            .unwrap_or_else(|err| panic!("failed to read {} {:?} with error: {}", ctx, path, err));
        ux::normalize(contents)
    }

    // Check if `POCKETBENCH_TEST=overwrite` is configured. Return `true` if so - otherwise
    // return `false`.
    //
    // If `POCKETBENCH_TEST` is set but its value is not `overwrite` - panic.
    fn overwrite() -> bool {
        match std::env::var(ENV) {
            Ok(v) if v == "overwrite" => true,
            Ok(v) => panic!(
                "Unknown value for {}: \"{}\". Expected \"overwrite\"",
                ENV, v
            ),
            Err(std::env::VarError::NotPresent) => false,
            Err(std::env::VarError::NotUnicode(_)) => {
                panic!("Value for {} is not unicode", ENV)
            }
        }
    }

    // Test Runner
    struct Test {
        // Directory holding the scenario's `stdin.txt`, expected outputs and inputs.
        dir: PathBuf,
        // Whether expected outputs should be regenerated instead of compared.
        overwrite: bool,
    }

    impl Test {
        fn new(dir: &Path) -> Self {
            Self {
                dir: dir.into(),
                overwrite: overwrite(),
            }
        }

        // Parse every command line in `stdin.txt` into an `App`.
        //
        // Blank lines and `#` comment lines are skipped. Lines are trimmed first so that
        // whitespace-only lines or indented comments are not mistaken for commands.
        fn parse_stdin(&self, tempdir: &Path) -> Vec<App> {
            let path = self.dir.join(STDIN);

            // Read the standard input file to a string.
            let stdin = read_to_string(&path, "standard input");

            let apps: Vec<App> = stdin
                .lines()
                .map(str::trim)
                .filter(|line| !line.is_empty() && !line.starts_with('#'))
                .map(|line| self.parse_line(line, tempdir))
                .collect();

            if apps.is_empty() {
                panic!("File \"{}/stdin.txt\" has no command!", self.dir.display());
            }

            apps
        }

        // Parse a single command line, panicking with the offending line on failure.
        fn parse_line(&self, line: &str, tempdir: &Path) -> App {
            // Split and resolve special symbols
            let args = line
                .split_whitespace()
                .map(|token| -> OsString { self.resolve(token, tempdir).into() });

            // The parser expects the binary name as the first argument.
            App::try_parse_from(std::iter::once(OsString::from("test-app")).chain(args))
                .unwrap_or_else(|err| panic!("failed to parse command line {:?}: {:?}", line, err))
        }

        // Substitute the special `$...` symbols; every other token passes through as-is.
        fn resolve(&self, s: &str, tempdir: &Path) -> PathBuf {
            match s {
                // Standard workflow
                "$INPUT" => self.dir.join(INPUT_FILE),
                "$OUTPUT" => tempdir.join(OUTPUT_FILE),
                // Regression extension
                "$TOLERANCES" => self.dir.join(TOLERANCES_FILE),
                "$REGRESSION_INPUT" => self.dir.join(REGRESSION_INPUT_FILE),
                "$CHECK_OUTPUT" => tempdir.join(CHECK_OUTPUT_FILE),

                // Catch-all: no interpolation
                _ => s.into(),
            }
        }

        fn run(&self, tempdir: &Path) {
            let apps = self.parse_stdin(tempdir);

            // Register outputs
            let mut registry = registry::Registry::new();
            crate::test::register_benchmarks(&mut registry).unwrap();

            // Run each app invocation - collecting the last output into a buffer.
            //
            // Only the last run is allowed to return an error - if it does, format the
            // error to the output buffer as well using the debug formatting option.
            let mut buffer = crate::output::Memory::new();
            let total = apps.len();
            for (i, app) in apps.iter().enumerate() {
                let is_last = i + 1 == total;

                // Select where to route the test output.
                //
                // Only the last run gets saved. Setup output is discarded - if a setup
                // command fails, the panic message includes the error.
                let mut b: &mut dyn crate::Output = if is_last {
                    &mut buffer
                } else {
                    &mut crate::output::Sink::new()
                };

                if let Err(err) = app.run(&registry, b) {
                    if is_last {
                        write!(b, "{:?}", err).unwrap();
                    } else {
                        panic!(
                            "App {} of {} failed with error: {:?}",
                            i + 1,
                            total,
                            err
                        );
                    }
                }
            }

            // Check that `stdout` matches
            let stdout: String =
                ux::normalize(ux::strip_backtrace(buffer.into_inner().try_into().unwrap()));
            // Temporary directory names differ per run, so replace them with a fixed token.
            let stdout = ux::scrub_path(stdout, tempdir, "$TEMPDIR");
            let output = self.dir.join(STDOUT);
            if self.overwrite {
                std::fs::write(output, stdout).unwrap();
            } else {
                let expected = read_to_string(&output, "expected standard output");
                if stdout != expected {
                    panic!("Got:\n--\n{}\n--\nExpected:\n--\n{}\n--", stdout, expected);
                }
            }

            // Check that the output files match.
            for file in ALL_GENERATED_OUTPUTS {
                self.check_output_file(tempdir, file);
            }
        }

        // Compare (or, in overwrite mode, regenerate) one generated output file.
        fn check_output_file(&self, tempdir: &Path, filename: &str) {
            let generated_path = tempdir.join(filename);
            let was_generated = generated_path.is_file();

            let expected_path = self.dir.join(filename);
            let is_expected = expected_path.is_file();

            if self.overwrite {
                if was_generated {
                    // Copy the output file to the destination.
                    //
                    // A copy is used rather than a rename because the temporary directory
                    // and the source tree may be on different mount points, where a rename
                    // fails. The temporary file is cleaned up with its directory.
                    println!(
                        "Copying generated file {:?} to {:?}",
                        generated_path, expected_path
                    );

                    if let Err(err) = std::fs::copy(&generated_path, &expected_path) {
                        panic!(
                            "Copying generated file {:?} to expected location {:?} failed: {}",
                            generated_path, expected_path, err
                        );
                    }
                } else if is_expected {
                    println!("Removing outdated file {:?}", expected_path);
                    if let Err(err) = std::fs::remove_file(&expected_path) {
                        panic!("Failed removing outdated file {:?}: {}", expected_path, err);
                    }
                }
                return;
            }

            match (was_generated, is_expected) {
                (true, true) => {
                    let output_contents = read_to_string(generated_path, "generated");
                    let expected_contents = read_to_string(expected_path, "expected");

                    if output_contents != expected_contents {
                        panic!(
                            "{}: Got:\n\n{}\n\nExpected:\n\n{}\n",
                            filename, output_contents, expected_contents
                        );
                    }
                }
                (true, false) => {
                    let output_contents = read_to_string(generated_path, "generated");

                    panic!(
                        "{} was generated when none was expected. Contents:\n\n{}",
                        filename, output_contents
                    );
                }
                (false, true) => {
                    panic!("{} was not generated when it was expected", filename);
                }
                (false, false) => { /* this is okay */ }
            }
        }
    }

    // Run one scenario directory with a fresh temporary directory for generated files.
    fn run_specific_test(test_dir: &Path) {
        println!("running test in {:?}", test_dir);
        let temp_dir = tempfile::tempdir().unwrap();
        Test::new(test_dir).run(temp_dir.path());
    }

    // Run every scenario (sub-directory) found under `tests/<dir>` in the crate root.
    fn run_all_tests_in(dir: &str) {
        let root = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests").join(dir);
        let entries = std::fs::read_dir(&root)
            .unwrap_or_else(|err| panic!("couldn't read test directory {:?}: {}", root, err));

        for entry in entries {
            let entry = entry.unwrap();
            match entry.file_type() {
                Ok(file_type) if file_type.is_dir() => run_specific_test(&entry.path()),
                // Plain files next to the scenario folders are ignored.
                Ok(_) => {}
                Err(err) => panic!("couldn't get file type for {:?}: {}", entry.path(), err),
            }
        }
    }

    #[test]
    fn benchmark_tests() {
        run_all_tests_in("benchmark");
    }

    #[test]
    fn regression_tests() {
        run_all_tests_in("regression");
    }
}