Skip to main content

fluxbench_cli/
lib.rs

1#![warn(missing_docs)]
2//! FluxBench CLI Library
3//!
4//! This module provides the CLI infrastructure for benchmark binaries.
5//! Use `fluxbench::run()` (or `fluxbench_cli::run()`) in your main function to get the full
6//! fluxbench CLI experience with your registered benchmarks.
7//!
8//! # Example
9//!
10//! ```ignore
11//! use fluxbench::prelude::*;
12//!
13//! #[bench]
14//! fn my_benchmark(b: &mut Bencher) {
15//!     b.iter(|| expensive_operation());
16//! }
17//!
18//! fn main() {
19//!     fluxbench_cli::run();
20//! }
21//! ```
22
23mod config;
24mod executor;
25mod planner;
26mod supervisor;
27
28pub use config::*;
29pub use executor::{
30    ExecutionConfig, Executor, IsolatedExecutor, build_report, compute_statistics,
31    execute_verifications, format_human_output,
32};
33pub use supervisor::*;
34
35use clap::{Parser, Subcommand};
36use fluxbench_core::{BenchmarkDef, WorkerMain};
37use fluxbench_logic::aggregate_verifications;
38use fluxbench_report::{
39    OutputFormat, format_duration, generate_csv_report, generate_github_action_benchmark,
40    generate_github_summary, generate_html_report, generate_json_report,
41};
42use rayon::ThreadPoolBuilder;
43use regex::Regex;
44use std::io::Write;
45use std::path::PathBuf;
46use std::time::Instant;
47
/// FluxBench CLI arguments
//
// NOTE: the `///` doc comments on the fields below double as clap's `--help`
// text — editing them changes the user-visible CLI help output.
#[derive(Parser, Debug)]
#[command(name = "fluxbench")]
#[command(author, version, about = "FluxBench - benchmarking framework for Rust")]
pub struct Cli {
    /// Optional subcommand (List, Run, Compare); defaults to Run
    #[command(subcommand)]
    pub command: Option<Commands>,

    /// Filter benchmarks by regex pattern
    // Positional argument; ".*" matches everything by default.
    #[arg(default_value = ".*")]
    pub filter: String,

    /// Output format: json, github-summary, github-action, csv, html, human
    // Parsed into `OutputFormat` in run_with_cli; unknown values fall back to human.
    #[arg(long, default_value = "human")]
    pub format: String,

    /// Output file (stdout if not specified)
    #[arg(short, long)]
    pub output: Option<PathBuf>,

    /// Load baseline for comparison
    /// Optionally specify a path; defaults to config or target/fluxbench/baseline.json
    // Nested Option: outer Some = flag was passed, inner Some = explicit path.
    // See resolve_baseline_path for the resolution rules.
    #[arg(long)]
    pub baseline: Option<Option<PathBuf>>,

    /// Dry run - list benchmarks without executing
    #[arg(long)]
    pub dry_run: bool,

    /// Regression threshold percentage
    // Falls back to config.ci.regression_threshold when not given.
    #[arg(long)]
    pub threshold: Option<f64>,

    /// Run benchmarks for this group only
    #[arg(long)]
    pub group: Option<String>,

    /// Filter by tag
    #[arg(long)]
    pub tag: Option<String>,

    /// Skip benchmarks with this tag
    #[arg(long)]
    pub skip_tag: Option<String>,

    /// Warmup time in seconds
    // NOTE: the default (3) is compared against literally in
    // build_execution_config to detect an explicit override — keep in sync.
    #[arg(long, default_value = "3")]
    pub warmup: u64,

    /// Measurement time in seconds
    // NOTE: the default (5) is compared against literally in
    // build_execution_config to detect an explicit override — keep in sync.
    #[arg(long, default_value = "5")]
    pub measurement: u64,

    /// Fixed sample count mode: skip warmup, run exactly N iterations
    /// Each iteration becomes one sample. Overrides warmup/measurement/min/max.
    #[arg(long, short = 'n')]
    pub samples: Option<u64>,

    /// Minimum number of iterations
    #[arg(long)]
    pub min_iterations: Option<u64>,

    /// Maximum number of iterations
    #[arg(long)]
    pub max_iterations: Option<u64>,

    /// Verbose output
    #[arg(short, long)]
    pub verbose: bool,

    /// Run benchmarks in isolated worker processes (default: true)
    /// Use --isolated=false to disable and run in-process
    // ArgAction::Set makes this take an explicit value (--isolated=false)
    // rather than acting as a presence-only flag.
    #[arg(long, default_value = "true", action = clap::ArgAction::Set)]
    pub isolated: bool,

    /// Use fresh worker process for each benchmark (One-Shot mode)
    /// Default is Persistent mode: reuse worker for safe Rust code
    #[arg(long)]
    pub one_shot: bool,

    /// Worker timeout in seconds
    #[arg(long, default_value = "60")]
    pub worker_timeout: u64,

    /// Number of parallel isolated workers
    // Only effective in isolated mode (see run_benchmarks warning otherwise).
    #[arg(long, default_value = "1")]
    pub jobs: usize,

    /// Number of threads for parallel statistics computation
    /// 0 = use all available cores (default), 1 = single-threaded
    #[arg(long, short = 'j', default_value = "0")]
    pub threads: usize,

    /// Internal: Run as worker process (used by supervisor)
    // Hidden from --help; checked first in run_with_cli.
    #[arg(long, hide = true)]
    pub flux_worker: bool,

    /// Save benchmark results as baseline JSON
    /// Optionally specify a path; defaults to config or target/fluxbench/baseline.json
    // Same nested-Option convention as `baseline` above.
    #[arg(long)]
    pub save_baseline: Option<Option<PathBuf>>,

    /// Internal: Absorb cargo bench's --bench flag
    // `cargo bench` passes --bench to the binary; accept and ignore it.
    #[arg(long, hide = true)]
    pub bench: bool,
}
155
/// CLI subcommands
//
// When no subcommand is given, run_with_cli defaults to running benchmarks
// (honoring --dry-run).
#[derive(Subcommand, Debug)]
pub enum Commands {
    /// List all discovered benchmarks
    List,
    /// Run benchmarks (default)
    Run,
    /// Compare against a git ref
    Compare {
        /// Git ref to compare against (e.g., origin/main)
        // Positional value; resolved to a commit hash via `git rev-parse`.
        #[arg(name = "REF")]
        git_ref: String,
    },
}
170
171/// Run the FluxBench CLI with the given arguments.
172/// This is the main entry point for benchmark binaries.
173///
174/// # Returns
175/// Returns `Ok(())` on success, or an error if something goes wrong.
176pub fn run() -> anyhow::Result<()> {
177    let cli = Cli::parse();
178    run_with_cli(cli)
179}
180
181/// Run the FluxBench CLI with pre-parsed arguments.
182pub fn run_with_cli(cli: Cli) -> anyhow::Result<()> {
183    // Handle worker mode first (before any other initialization)
184    if cli.flux_worker {
185        return run_worker_mode();
186    }
187
188    // Initialize logging
189    if cli.verbose {
190        tracing_subscriber::fmt()
191            .with_env_filter("fluxbench=debug")
192            .init();
193    } else {
194        tracing_subscriber::fmt()
195            .with_env_filter("fluxbench=info")
196            .init();
197    }
198
199    // Discover flux.toml configuration (CLI flags override)
200    let config = FluxConfig::discover().unwrap_or_default();
201
202    // Parse output format
203    let format: OutputFormat = cli.format.parse().unwrap_or(OutputFormat::Human);
204
205    // Resolve jobs: CLI wins if explicitly set (not default 1), else flux.toml, else 1
206    let jobs = if cli.jobs != 1 {
207        cli.jobs
208    } else {
209        config.runner.jobs.unwrap_or(1)
210    };
211
212    match cli.command {
213        Some(Commands::List) => {
214            list_benchmarks(&cli)?;
215        }
216        Some(Commands::Run) => {
217            run_benchmarks(&cli, &config, format, jobs)?;
218        }
219        Some(Commands::Compare { ref git_ref }) => {
220            compare_benchmarks(&cli, &config, git_ref, format)?;
221        }
222        None => {
223            // Default: run benchmarks
224            if cli.dry_run {
225                list_benchmarks(&cli)?;
226            } else {
227                run_benchmarks(&cli, &config, format, jobs)?;
228            }
229        }
230    }
231
232    Ok(())
233}
234
235/// Run as a worker process (IPC mode)
236fn run_worker_mode() -> anyhow::Result<()> {
237    let mut worker = WorkerMain::new();
238    worker
239        .run()
240        .map_err(|e| anyhow::anyhow!("Worker error: {}", e))
241}
242
243/// Filter benchmarks based on CLI options using the planner module.
244///
245/// Returns benchmarks sorted alphabetically by ID for deterministic execution.
246fn filter_benchmarks(
247    cli: &Cli,
248    benchmarks: &[&'static BenchmarkDef],
249) -> Vec<&'static BenchmarkDef> {
250    let filter_re = Regex::new(&cli.filter).ok();
251
252    let plan = planner::build_plan(
253        benchmarks.iter().copied(),
254        filter_re.as_ref(),
255        cli.group.as_deref(),
256        cli.tag.as_deref(),
257        cli.skip_tag.as_deref(),
258    );
259
260    plan.benchmarks
261}
262
263fn list_benchmarks(cli: &Cli) -> anyhow::Result<()> {
264    println!("FluxBench Plan:");
265
266    let all_benchmarks: Vec<_> = inventory::iter::<BenchmarkDef>.into_iter().collect();
267    let benchmarks = filter_benchmarks(cli, &all_benchmarks);
268
269    let mut groups: std::collections::BTreeMap<&str, Vec<&BenchmarkDef>> =
270        std::collections::BTreeMap::new();
271
272    for bench in &benchmarks {
273        groups.entry(bench.group).or_default().push(bench);
274    }
275
276    let mut total = 0;
277    for (group, benches) in &groups {
278        println!("├── group: {}", group);
279        for bench in benches {
280            let tags = if bench.tags.is_empty() {
281                String::new()
282            } else {
283                format!(" [{}]", bench.tags.join(", "))
284            };
285            println!(
286                "│   ├── {}{} ({}:{})",
287                bench.id, tags, bench.file, bench.line
288            );
289            total += 1;
290        }
291    }
292
293    println!("{} benchmarks found.", total);
294
295    // Show all available tags across the entire suite (not just filtered results)
296    // so users can discover what tags they can filter by.
297    let mut tag_counts: std::collections::BTreeMap<&str, usize> = std::collections::BTreeMap::new();
298    for bench in &all_benchmarks {
299        for tag in bench.tags {
300            *tag_counts.entry(tag).or_default() += 1;
301        }
302    }
303    if !tag_counts.is_empty() {
304        let tags_display: Vec<String> = tag_counts
305            .iter()
306            .map(|(tag, count)| format!("{} ({})", tag, count))
307            .collect();
308        println!("Tags: {}", tags_display.join(", "));
309    }
310
311    Ok(())
312}
313
314/// Build an ExecutionConfig by layering: flux.toml defaults → CLI overrides.
315fn build_execution_config(cli: &Cli, config: &FluxConfig) -> ExecutionConfig {
316    // Start from flux.toml values (parsed durations fall back to defaults on error)
317    let warmup_ns = FluxConfig::parse_duration(&config.runner.warmup_time).unwrap_or(3_000_000_000);
318    let measurement_ns =
319        FluxConfig::parse_duration(&config.runner.measurement_time).unwrap_or(5_000_000_000);
320
321    // CLI flags override config file values.
322    // clap defaults are warmup=3, measurement=5, so we check if the user explicitly
323    // passed different values by comparing against clap defaults. If the CLI value
324    // differs from clap's default, the user explicitly set it and it wins.
325    let warmup_time_ns = if cli.warmup != 3 {
326        cli.warmup * 1_000_000_000
327    } else {
328        warmup_ns
329    };
330    let measurement_time_ns = if cli.measurement != 5 {
331        cli.measurement * 1_000_000_000
332    } else {
333        measurement_ns
334    };
335
336    // --samples N: fixed-count mode, no warmup, each iteration = one sample
337    // CLI wins, then flux.toml
338    if let Some(n) = cli.samples.or(config.runner.samples) {
339        return ExecutionConfig {
340            warmup_time_ns: 0,
341            measurement_time_ns: 0,
342            min_iterations: Some(n),
343            max_iterations: Some(n),
344            track_allocations: config.allocator.track,
345            bootstrap_iterations: config.runner.bootstrap_iterations,
346            confidence_level: config.runner.confidence_level,
347        };
348    }
349
350    // min/max iterations: CLI wins if set, else config, else default
351    let min_iterations = cli.min_iterations.or(config.runner.min_iterations);
352    let max_iterations = cli.max_iterations.or(config.runner.max_iterations);
353
354    ExecutionConfig {
355        warmup_time_ns,
356        measurement_time_ns,
357        min_iterations,
358        max_iterations,
359        track_allocations: config.allocator.track,
360        bootstrap_iterations: config.runner.bootstrap_iterations,
361        confidence_level: config.runner.confidence_level,
362    }
363}
364
/// Execute the filtered benchmark suite and emit a report.
///
/// Pipeline: configure rayon → discover & filter benchmarks → execute
/// (isolated workers or in-process) → compute statistics → optional baseline
/// comparison → verifications → render output → optionally save baseline.
///
/// Exits the process with code 1 if any benchmark crashed or a critical
/// verification failed; otherwise returns `Ok(())`.
fn run_benchmarks(
    cli: &Cli,
    config: &FluxConfig,
    format: OutputFormat,
    jobs: usize,
) -> anyhow::Result<()> {
    // Guard against jobs == 0 (e.g. from a zero in flux.toml).
    let jobs = jobs.max(1);

    // Configure Rayon thread pool for statistics computation.
    // .ok(): ignore failure — presumably the global pool can already be
    // initialized by the host binary; TODO confirm that is the intent.
    if cli.threads > 0 {
        ThreadPoolBuilder::new()
            .num_threads(cli.threads)
            .build_global()
            .ok();
    }

    // Discover benchmarks
    let all_benchmarks: Vec<_> = inventory::iter::<BenchmarkDef>.into_iter().collect();
    let benchmarks = filter_benchmarks(cli, &all_benchmarks);

    if benchmarks.is_empty() {
        // If filtering by tag and no matches, check if the tag exists at all
        // so the user gets a hint instead of just "No benchmarks found."
        if let Some(ref tag) = cli.tag {
            let all_tags: std::collections::BTreeSet<&str> = all_benchmarks
                .iter()
                .flat_map(|b| b.tags.iter().copied())
                .collect();
            if !all_tags.contains(tag.as_str()) {
                let available: Vec<&str> = all_tags.into_iter().collect();
                eprintln!(
                    "Warning: tag '{}' not found. Available tags: {}",
                    tag,
                    available.join(", ")
                );
            }
        }
        println!("No benchmarks found.");
        return Ok(());
    }

    // Determine isolation mode: flux.toml can override CLI default.
    // Isolation requires both the config to allow it and --isolated (default true).
    let isolated = if config.runner.isolation.is_isolated() {
        cli.isolated
    } else {
        false
    };

    // Human-readable banner describing the run configuration.
    let threads_str = if cli.threads == 0 {
        "all".to_string()
    } else {
        cli.threads.to_string()
    };
    let mode_str = if isolated {
        if cli.one_shot {
            " (isolated, one-shot)"
        } else {
            " (isolated, persistent)"
        }
    } else {
        " (in-process)"
    };
    println!(
        "Running {} benchmarks{}, {} threads, {} worker(s)...\n",
        benchmarks.len(),
        mode_str,
        threads_str,
        jobs
    );

    let start_time = Instant::now();

    // Build execution config from flux.toml + CLI overrides
    let exec_config = build_execution_config(cli, config);

    // Sanity-check bootstrap iteration count before spending time running.
    if exec_config.bootstrap_iterations > 0 && exec_config.bootstrap_iterations < 100 {
        eprintln!(
            "Warning: bootstrap_iterations={} is very low; confidence intervals will be unreliable. \
             Use >= 1000 for meaningful results, or 0 to skip bootstrap.",
            exec_config.bootstrap_iterations
        );
    }

    // Execute benchmarks (isolated by default per TDD)
    let results = if isolated {
        let timeout = std::time::Duration::from_secs(cli.worker_timeout);
        // Persistent mode reuses workers; --one-shot forces a fresh process per benchmark.
        let reuse_workers = !cli.one_shot;
        let isolated_executor =
            IsolatedExecutor::new(exec_config.clone(), timeout, reuse_workers, jobs);
        isolated_executor.execute(&benchmarks)
    } else {
        if jobs > 1 {
            eprintln!(
                "Warning: --jobs currently applies only to isolated mode; running in-process serially."
            );
        }
        let mut executor = Executor::new(exec_config.clone());
        executor.execute(&benchmarks)
    };

    // Compute statistics
    let stats = compute_statistics(&results, &exec_config);

    // Warn if allocation tracking is enabled but nothing was recorded
    // (likely a missing #[global_allocator] in the benchmark binary).
    if exec_config.track_allocations
        && !results.is_empty()
        && results
            .iter()
            .all(|r| r.alloc_bytes == 0 && r.alloc_count == 0)
    {
        eprintln!(
            "Warning: allocation tracking enabled but all benchmarks reported 0 bytes allocated.\n\
             Ensure TrackingAllocator is set as #[global_allocator] in your benchmark binary."
        );
    }

    // Build report
    let total_duration_ms = start_time.elapsed().as_secs_f64() * 1000.0;
    let mut report = build_report(&results, &stats, &exec_config, total_duration_ms);

    // Load and apply baseline comparison if --baseline was passed.
    // A missing or unreadable baseline is a warning, not a hard error.
    if let Some(baseline_path) = resolve_baseline_path(&cli.baseline, config) {
        if baseline_path.exists() {
            match std::fs::read_to_string(&baseline_path).and_then(|json| {
                // Bridge serde_json's error into io::Error so both read and
                // parse failures flow through one match arm.
                serde_json::from_str::<fluxbench_report::Report>(&json)
                    .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
            }) {
                Ok(baseline) => {
                    let threshold = cli.threshold.unwrap_or(config.ci.regression_threshold);
                    apply_baseline_comparison(&mut report, &baseline, threshold);
                }
                Err(e) => {
                    eprintln!(
                        "Warning: failed to load baseline {}: {}",
                        baseline_path.display(),
                        e
                    );
                }
            }
        } else {
            eprintln!(
                "Warning: baseline file not found: {}",
                baseline_path.display()
            );
        }
    }

    // Run comparisons, synthetics, and verifications
    let (comparison_results, comparison_series, synthetic_results, verification_results) =
        execute_verifications(&results, &stats);
    let verification_summary = aggregate_verifications(&verification_results);
    report.comparisons = comparison_results;
    report.comparison_series = comparison_series;
    report.synthetics = synthetic_results;
    report.verifications = verification_results;

    // Update summary with verification info
    // (warnings = non-critical failures).
    report.summary.critical_failures = verification_summary.critical_failures;
    report.summary.warnings = verification_summary.failed - verification_summary.critical_failures;

    // Emit GitHub Actions annotations if enabled
    if config.ci.github_annotations {
        emit_github_annotations(&report);
    }

    // Generate output
    let output = match format {
        OutputFormat::Json => generate_json_report(&report)?,
        OutputFormat::GithubSummary => generate_github_summary(&report),
        OutputFormat::GithubActionBenchmark => generate_github_action_benchmark(&report),
        OutputFormat::Html => generate_html_report(&report),
        OutputFormat::Csv => generate_csv_report(&report),
        OutputFormat::Human => format_human_output(&report),
    };

    // Write output to --output file, or stdout otherwise.
    if let Some(ref path) = cli.output {
        let mut file = std::fs::File::create(path)?;
        file.write_all(output.as_bytes())?;
        println!("Report written to: {}", path.display());
    } else {
        print!("{}", output);
    }

    // Save baseline if requested
    save_baseline_if_needed(cli, config, &report)?;

    // Exit with appropriate code: crashes or critical verification failures
    // fail the run (and CI) with exit code 1.
    let has_crashes = report
        .results
        .iter()
        .any(|r| matches!(r.status, fluxbench_report::BenchmarkStatus::Crashed));

    if verification_summary.should_fail_ci() || has_crashes {
        if has_crashes {
            eprintln!("\nBenchmark(s) crashed during execution");
        }
        if verification_summary.should_fail_ci() {
            eprintln!(
                "\n{} critical verification failure(s)",
                verification_summary.critical_failures + verification_summary.critical_errors
            );
        }
        std::process::exit(1);
    }

    Ok(())
}
572
/// Compare current benchmark results against a stored baseline for `git_ref`.
///
/// Loads the baseline report, verifies its recorded commit matches the
/// resolved git ref, re-runs the benchmarks (in-process), applies the
/// comparison, and renders the report. Exits the process with code 1 on
/// regressions above threshold or critical verification failures.
fn compare_benchmarks(
    cli: &Cli,
    config: &FluxConfig,
    git_ref: &str,
    format: OutputFormat,
) -> anyhow::Result<()> {
    // Load baseline — resolve path from CLI, config, or default
    let baseline_path = resolve_baseline_path(&cli.baseline, config).ok_or_else(|| {
        anyhow::anyhow!(
            "--baseline required for comparison, or use 'compare' command with a git ref"
        )
    })?;

    if !baseline_path.exists() {
        return Err(anyhow::anyhow!(
            "Baseline file not found: {}",
            baseline_path.display()
        ));
    }

    let baseline_json = std::fs::read_to_string(&baseline_path)?;
    let baseline: fluxbench_report::Report = serde_json::from_str(&baseline_json)?;
    let resolved_git_ref = resolve_git_ref(git_ref)?;

    // Cross-check the baseline's recorded commit against the requested ref.
    // Prefix matching in both directions accepts short vs. full hashes.
    if let Some(baseline_commit) = baseline.meta.git_commit.as_deref() {
        let matches_ref = baseline_commit == resolved_git_ref
            || baseline_commit.starts_with(&resolved_git_ref)
            || resolved_git_ref.starts_with(baseline_commit);
        if !matches_ref {
            return Err(anyhow::anyhow!(
                "Baseline commit {} does not match git ref {} ({})",
                baseline_commit,
                git_ref,
                resolved_git_ref
            ));
        }
    } else {
        // Older baselines may lack commit metadata; warn but proceed.
        eprintln!(
            "Warning: baseline report has no commit metadata; git ref consistency cannot be verified."
        );
    }

    println!("Comparing against baseline: {}", baseline_path.display());
    println!("Git ref: {} ({})\n", git_ref, resolved_git_ref);

    // Run current benchmarks
    let all_benchmarks: Vec<_> = inventory::iter::<BenchmarkDef>.into_iter().collect();
    let benchmarks = filter_benchmarks(cli, &all_benchmarks);

    if benchmarks.is_empty() {
        println!("No benchmarks found.");
        return Ok(());
    }

    let start_time = Instant::now();

    let exec_config = build_execution_config(cli, config);

    // NOTE: comparison always runs in-process (no IsolatedExecutor here),
    // unlike run_benchmarks which defaults to isolated workers.
    let mut executor = Executor::new(exec_config.clone());
    let results = executor.execute(&benchmarks);
    let stats = compute_statistics(&results, &exec_config);

    let total_duration_ms = start_time.elapsed().as_secs_f64() * 1000.0;
    let mut report = build_report(&results, &stats, &exec_config, total_duration_ms);

    // Apply baseline comparison data (CLI --threshold overrides config).
    let regression_threshold = cli.threshold.unwrap_or(config.ci.regression_threshold);
    apply_baseline_comparison(&mut report, &baseline, regression_threshold);

    // Run comparisons, synthetics, and verifications
    let (comparison_results, comparison_series, synthetic_results, verification_results) =
        execute_verifications(&results, &stats);
    let verification_summary = aggregate_verifications(&verification_results);
    report.comparisons = comparison_results;
    report.comparison_series = comparison_series;
    report.synthetics = synthetic_results;
    report.verifications = verification_results;
    report.summary.critical_failures = verification_summary.critical_failures;
    report.summary.warnings = verification_summary.failed - verification_summary.critical_failures;

    // Emit GitHub Actions annotations if enabled
    if config.ci.github_annotations {
        emit_github_annotations(&report);
    }

    // Generate output (human format uses the comparison-specific renderer).
    let output = match format {
        OutputFormat::Json => generate_json_report(&report)?,
        OutputFormat::GithubSummary => generate_github_summary(&report),
        OutputFormat::GithubActionBenchmark => generate_github_action_benchmark(&report),
        OutputFormat::Html => generate_html_report(&report),
        OutputFormat::Csv => generate_csv_report(&report),
        OutputFormat::Human => format_comparison_output(&report, &baseline),
    };

    // Write to --output file, or stdout otherwise.
    if let Some(ref path) = cli.output {
        let mut file = std::fs::File::create(path)?;
        file.write_all(output.as_bytes())?;
        println!("Report written to: {}", path.display());
    } else {
        print!("{}", output);
    }

    // Save baseline if requested
    save_baseline_if_needed(cli, config, &report)?;

    // Exit with error if regressions exceed threshold or verifications fail
    let should_fail = report.summary.regressions > 0 || verification_summary.should_fail_ci();
    if should_fail {
        if report.summary.regressions > 0 {
            eprintln!(
                "\n{} regression(s) detected above {}% threshold",
                report.summary.regressions, regression_threshold
            );
        }
        if verification_summary.should_fail_ci() {
            eprintln!(
                "\n{} critical verification failure(s)",
                verification_summary.critical_failures + verification_summary.critical_errors
            );
        }
        std::process::exit(1);
    }

    Ok(())
}
699
700/// Save the report as a baseline JSON file if configured.
701fn save_baseline_if_needed(
702    cli: &Cli,
703    config: &FluxConfig,
704    report: &fluxbench_report::Report,
705) -> anyhow::Result<()> {
706    // Determine if we should save: CLI --save-baseline flag or config.output.save_baseline
707    let should_save = cli.save_baseline.is_some() || config.output.save_baseline;
708    if !should_save {
709        return Ok(());
710    }
711
712    // Resolve path: CLI value > config value > default
713    let path = cli
714        .save_baseline
715        .as_ref()
716        .and_then(|opt| opt.clone())
717        .or_else(|| config.output.baseline_path.as_ref().map(PathBuf::from))
718        .unwrap_or_else(|| PathBuf::from("target/fluxbench/baseline.json"));
719
720    if let Some(parent) = path.parent() {
721        std::fs::create_dir_all(parent)?;
722    }
723
724    let json = generate_json_report(report)?;
725    std::fs::write(&path, json)?;
726    eprintln!("Baseline saved to: {}", path.display());
727
728    Ok(())
729}
730
/// Apply baseline comparison data to the report.
///
/// Computes per-benchmark regression/improvement metrics by comparing current
/// results against baseline means, CI overlap, and effect size.
///
/// Side effects: sets `report.baseline_meta`, attaches a `Comparison` to each
/// result that has metrics in both runs, and bumps the summary's
/// regression/improvement counters.
fn apply_baseline_comparison(
    report: &mut fluxbench_report::Report,
    baseline: &fluxbench_report::Report,
    regression_threshold: f64,
) {
    report.baseline_meta = Some(baseline.meta.clone());

    // Index baseline metrics by benchmark id; results without metrics
    // (e.g. ones that did not complete) are skipped.
    let baseline_map: std::collections::HashMap<_, _> = baseline
        .results
        .iter()
        .filter_map(|r| r.metrics.as_ref().map(|m| (r.id.clone(), m.clone())))
        .collect();

    for result in &mut report.results {
        if let (Some(metrics), Some(baseline_metrics)) =
            (&result.metrics, baseline_map.get(&result.id))
        {
            // Use per-benchmark threshold if set (> 0.0), otherwise global
            let effective_threshold = if result.threshold > 0.0 {
                result.threshold
            } else {
                regression_threshold
            };

            // Relative change in percent; guard against a zero baseline mean.
            let baseline_mean = baseline_metrics.mean_ns;
            let absolute_change = metrics.mean_ns - baseline_mean;
            let relative_change = if baseline_mean > 0.0 {
                (absolute_change / baseline_mean) * 100.0
            } else {
                0.0
            };

            // Significance requires the change to exceed the threshold AND the
            // confidence intervals to not overlap.
            let ci_non_overlap = metrics.ci_upper_ns < baseline_metrics.ci_lower_ns
                || metrics.ci_lower_ns > baseline_metrics.ci_upper_ns;
            let is_significant = relative_change.abs() > effective_threshold && ci_non_overlap;

            // NOTE: the summary counters use the threshold alone (no CI
            // requirement) — a looser criterion than `is_significant`.
            if relative_change > effective_threshold {
                report.summary.regressions += 1;
            } else if relative_change < -effective_threshold {
                report.summary.improvements += 1;
            }

            // Effect size = change in units of the current run's std dev
            // (0.0 when the std dev is ~zero or the division misbehaves).
            let mut effect_size = if metrics.std_dev_ns > f64::EPSILON {
                absolute_change / metrics.std_dev_ns
            } else {
                0.0
            };
            if !effect_size.is_finite() {
                effect_size = 0.0;
            }

            // Heuristic probability, not a computed posterior: CI separation
            // pins it near 0/1; otherwise lean 60/40 by change direction.
            let probability_regression = if ci_non_overlap {
                if relative_change > 0.0 { 0.99 } else { 0.01 }
            } else if relative_change > 0.0 {
                0.60
            } else {
                0.40
            };

            result.comparison = Some(fluxbench_report::Comparison {
                baseline_mean_ns: baseline_mean,
                absolute_change_ns: absolute_change,
                relative_change,
                probability_regression,
                is_significant,
                effect_size,
            });
        }
    }
}
805
806/// Resolve baseline path from CLI flag, config, or default.
807///
808/// - `Some(Some(path))` — explicit path from `--baseline /path/to/file`
809/// - `Some(None)` — `--baseline` with no value, use config or default
810/// - `None` — flag not passed at all
811fn resolve_baseline_path(
812    cli_baseline: &Option<Option<PathBuf>>,
813    config: &FluxConfig,
814) -> Option<PathBuf> {
815    match cli_baseline {
816        Some(Some(path)) => Some(path.clone()),
817        Some(None) => {
818            // --baseline passed without path: use config or default
819            Some(
820                config
821                    .output
822                    .baseline_path
823                    .as_ref()
824                    .map(PathBuf::from)
825                    .unwrap_or_else(|| PathBuf::from("target/fluxbench/baseline.json")),
826            )
827        }
828        None => None,
829    }
830}
831
832/// Emit `::error::` and `::warning::` annotations for GitHub Actions.
833///
834/// These appear inline on PR diffs when running in GitHub Actions CI.
835fn emit_github_annotations(report: &fluxbench_report::Report) {
836    // Annotate crashed/failed benchmarks
837    for result in &report.results {
838        match result.status {
839            fluxbench_report::BenchmarkStatus::Crashed => {
840                let msg = result
841                    .failure
842                    .as_ref()
843                    .map(|f| f.message.as_str())
844                    .unwrap_or("benchmark crashed");
845                println!(
846                    "::error file={},line={}::{}: {}",
847                    result.file, result.line, result.id, msg
848                );
849            }
850            fluxbench_report::BenchmarkStatus::Failed => {
851                let msg = result
852                    .failure
853                    .as_ref()
854                    .map(|f| f.message.as_str())
855                    .unwrap_or("benchmark failed");
856                println!(
857                    "::error file={},line={}::{}: {}",
858                    result.file, result.line, result.id, msg
859                );
860            }
861            _ => {}
862        }
863
864        // Annotate significant regressions
865        if let Some(cmp) = &result.comparison {
866            if cmp.is_significant && cmp.relative_change > 0.0 {
867                println!(
868                    "::error file={},line={}::{}: regression {:+.1}% ({} → {})",
869                    result.file,
870                    result.line,
871                    result.id,
872                    cmp.relative_change,
873                    format_duration(cmp.baseline_mean_ns),
874                    result
875                        .metrics
876                        .as_ref()
877                        .map(|m| format_duration(m.mean_ns))
878                        .unwrap_or_default(),
879                );
880            }
881        }
882    }
883
884    // Annotate verification failures
885    for v in &report.verifications {
886        match &v.status {
887            fluxbench_logic::VerificationStatus::Failed => {
888                let level = match v.severity {
889                    fluxbench_core::Severity::Critical => "error",
890                    _ => "warning",
891                };
892                println!("::{}::{}: {}", level, v.id, v.message);
893            }
894            fluxbench_logic::VerificationStatus::Error { message } => {
895                println!("::error::{}: evaluation error: {}", v.id, message);
896            }
897            _ => {}
898        }
899    }
900}
901
902fn resolve_git_ref(git_ref: &str) -> anyhow::Result<String> {
903    let output = std::process::Command::new("git")
904        .args(["rev-parse", "--verify", git_ref])
905        .output()
906        .map_err(|e| anyhow::anyhow!("Failed to resolve git ref '{}': {}", git_ref, e))?;
907
908    if !output.status.success() {
909        let stderr = String::from_utf8_lossy(&output.stderr);
910        return Err(anyhow::anyhow!(
911            "Invalid git ref '{}': {}",
912            git_ref,
913            stderr.trim()
914        ));
915    }
916
917    let resolved = String::from_utf8(output.stdout)?.trim().to_string();
918    if resolved.is_empty() {
919        return Err(anyhow::anyhow!(
920            "Git ref '{}' resolved to an empty commit hash",
921            git_ref
922        ));
923    }
924
925    Ok(resolved)
926}
927
928/// Format comparison output for human display
929fn format_comparison_output(
930    report: &fluxbench_report::Report,
931    baseline: &fluxbench_report::Report,
932) -> String {
933    let mut output = String::new();
934
935    output.push('\n');
936    output.push_str("FluxBench Comparison Results\n");
937    output.push_str(&"=".repeat(60));
938    output.push_str("\n\n");
939
940    output.push_str(&format!(
941        "Baseline: {} ({})\n",
942        baseline.meta.git_commit.as_deref().unwrap_or("unknown"),
943        baseline.meta.timestamp.format("%Y-%m-%d %H:%M:%S")
944    ));
945    output.push_str(&format!(
946        "Current:  {} ({})\n\n",
947        report.meta.git_commit.as_deref().unwrap_or("unknown"),
948        report.meta.timestamp.format("%Y-%m-%d %H:%M:%S")
949    ));
950
951    for result in &report.results {
952        let status_icon = match result.status {
953            fluxbench_report::BenchmarkStatus::Passed => "✓",
954            fluxbench_report::BenchmarkStatus::Failed => "✗",
955            fluxbench_report::BenchmarkStatus::Crashed => "💥",
956            fluxbench_report::BenchmarkStatus::Skipped => "⊘",
957        };
958
959        output.push_str(&format!("{} {}\n", status_icon, result.id));
960
961        if let (Some(metrics), Some(comparison)) = (&result.metrics, &result.comparison) {
962            let change_icon = if comparison.relative_change > 5.0 {
963                "📈 REGRESSION"
964            } else if comparison.relative_change < -5.0 {
965                "📉 improvement"
966            } else {
967                "≈ no change"
968            };
969
970            output.push_str(&format!(
971                "    baseline: {} → current: {}\n",
972                format_duration(comparison.baseline_mean_ns),
973                format_duration(metrics.mean_ns),
974            ));
975            output.push_str(&format!(
976                "    change: {:+.2}% ({}) {}\n",
977                comparison.relative_change,
978                format_duration(comparison.absolute_change_ns.abs()),
979                change_icon,
980            ));
981        }
982
983        output.push('\n');
984    }
985
986    // Summary
987    output.push_str("Summary\n");
988    output.push_str(&"-".repeat(60));
989    output.push('\n');
990    output.push_str(&format!(
991        "  Regressions: {}  Improvements: {}  No Change: {}\n",
992        report.summary.regressions,
993        report.summary.improvements,
994        report.summary.total_benchmarks - report.summary.regressions - report.summary.improvements
995    ));
996
997    output
998}
999
#[cfg(test)]
mod tests {
    use super::*;
    use fluxbench_report::{
        BenchmarkMetrics, BenchmarkReportResult, BenchmarkStatus, Report, ReportConfig, ReportMeta,
        ReportSummary, SystemInfo,
    };

    /// Minimal report metadata; values are placeholders irrelevant to comparison logic.
    fn dummy_meta() -> ReportMeta {
        let system = SystemInfo {
            os: "linux".to_string(),
            os_version: "6.0".to_string(),
            cpu: "test".to_string(),
            cpu_cores: 1,
            memory_gb: 1.0,
        };
        let config = ReportConfig {
            warmup_time_ns: 0,
            measurement_time_ns: 0,
            min_iterations: None,
            max_iterations: None,
            bootstrap_iterations: 0,
            confidence_level: 0.95,
            track_allocations: false,
        };
        ReportMeta {
            schema_version: 1,
            version: "0.1.0".to_string(),
            timestamp: chrono::Utc::now(),
            git_commit: None,
            git_branch: None,
            system,
            config,
        }
    }

    /// Synthetic metrics centred on `m` nanoseconds with a tight spread.
    fn dummy_metrics(m: f64) -> BenchmarkMetrics {
        BenchmarkMetrics {
            samples: 100,
            mean_ns: m,
            median_ns: m,
            p50_ns: m,
            std_dev_ns: m * 0.01,
            min_ns: m * 0.9,
            max_ns: m * 1.1,
            p90_ns: m * 1.05,
            p95_ns: m * 1.07,
            p99_ns: m * 1.09,
            p999_ns: m * 1.1,
            skewness: 0.0,
            kurtosis: 3.0,
            ci_lower_ns: m * 0.98,
            ci_upper_ns: m * 1.02,
            ci_level: 0.95,
            throughput_ops_sec: None,
            alloc_bytes: 0,
            alloc_count: 0,
            mean_cycles: 0.0,
            median_cycles: 0.0,
            min_cycles: 0,
            max_cycles: 0,
            cycles_per_ns: 0.0,
        }
    }

    /// One passed benchmark result with the given mean and per-bench threshold
    /// (0.0 means "fall back to the global threshold").
    fn dummy_result(id: &str, mean_ns: f64, threshold_pct: f64) -> BenchmarkReportResult {
        BenchmarkReportResult {
            id: id.to_string(),
            name: id.to_string(),
            group: "test".to_string(),
            status: BenchmarkStatus::Passed,
            severity: fluxbench_core::Severity::Warning,
            file: "test.rs".to_string(),
            line: 1,
            metrics: Some(dummy_metrics(mean_ns)),
            threshold: threshold_pct,
            comparison: None,
            failure: None,
        }
    }

    /// Wrap results in a report whose summary counts them all as passed.
    fn dummy_report(results: Vec<BenchmarkReportResult>) -> Report {
        let n = results.len();
        let summary = ReportSummary {
            total_benchmarks: n,
            passed: n,
            ..Default::default()
        };
        Report {
            meta: dummy_meta(),
            results,
            comparisons: vec![],
            comparison_series: vec![],
            synthetics: vec![],
            verifications: vec![],
            summary,
            baseline_meta: None,
        }
    }

    #[test]
    fn per_bench_threshold_overrides_global() {
        // 100ns → 108ns is an 8% regression. The per-bench threshold (5%)
        // must win over the looser global threshold (25%).
        let baseline = dummy_report(vec![dummy_result("fast_bench", 100.0, 5.0)]);
        let mut current = dummy_report(vec![dummy_result("fast_bench", 108.0, 5.0)]);

        apply_baseline_comparison(&mut current, &baseline, 25.0);

        assert_eq!(
            current.summary.regressions, 1,
            "per-bench 5% should catch 8% regression"
        );
        assert!(current.results[0].comparison.as_ref().unwrap().is_significant);
    }

    #[test]
    fn zero_threshold_falls_back_to_global() {
        // 100ns → 108ns is an 8% regression, but with a per-bench threshold
        // of 0.0 the global 25% applies, so nothing should be flagged.
        let baseline = dummy_report(vec![dummy_result("normal_bench", 100.0, 0.0)]);
        let mut current = dummy_report(vec![dummy_result("normal_bench", 108.0, 0.0)]);

        apply_baseline_comparison(&mut current, &baseline, 25.0);

        assert_eq!(
            current.summary.regressions, 0,
            "8% under 25% global should not regress"
        );
        assert!(!current.results[0].comparison.as_ref().unwrap().is_significant);
    }

    #[test]
    fn mixed_thresholds_independent() {
        // Both benchmarks regress by 8%; only the one with the tight
        // per-bench threshold (5%) should be flagged, the other uses the
        // global 25% and passes.
        let baseline = dummy_report(vec![
            dummy_result("tight", 100.0, 5.0),
            dummy_result("loose", 100.0, 0.0),
        ]);
        let mut current = dummy_report(vec![
            dummy_result("tight", 108.0, 5.0),
            dummy_result("loose", 108.0, 0.0),
        ]);

        apply_baseline_comparison(&mut current, &baseline, 25.0);

        assert_eq!(current.summary.regressions, 1);
        let tight_cmp = current.results[0].comparison.as_ref().unwrap();
        let loose_cmp = current.results[1].comparison.as_ref().unwrap();
        assert!(tight_cmp.is_significant);
        assert!(!loose_cmp.is_significant);
    }

    #[test]
    fn per_bench_threshold_detects_improvement() {
        // 100ns → 90ns is a -10% change; with a 5% per-bench threshold it
        // should be counted as an improvement, not a regression.
        let baseline = dummy_report(vec![dummy_result("improving", 100.0, 5.0)]);
        let mut current = dummy_report(vec![dummy_result("improving", 90.0, 5.0)]);

        apply_baseline_comparison(&mut current, &baseline, 25.0);

        assert_eq!(current.summary.improvements, 1);
        assert_eq!(current.summary.regressions, 0);
    }
}