// fluxbench_cli/lib.rs

#![warn(missing_docs)]
//! FluxBench CLI Library
//!
//! This module provides the CLI infrastructure for benchmark binaries.
//! Use `fluxbench::run()` (or `fluxbench_cli::run()`) in your main function to get the full
//! fluxbench CLI experience with your registered benchmarks.
//!
//! # Example
//!
//! ```ignore
//! use fluxbench::prelude::*;
//!
//! #[bench]
//! fn my_benchmark(b: &mut Bencher) {
//!     b.iter(|| expensive_operation());
//! }
//!
//! fn main() {
//!     fluxbench_cli::run();
//! }
//! ```

mod config;
mod executor;
mod planner;
mod supervisor;

pub use config::*;
pub use executor::{
    ExecutionConfig, Executor, IsolatedExecutor, build_report, compute_statistics,
    execute_verifications, format_human_output,
};
pub use supervisor::*;

use clap::{Parser, Subcommand};
use fluxbench_core::{BenchmarkDef, WorkerMain};
use fluxbench_logic::aggregate_verifications;
use fluxbench_report::{
    OutputFormat, format_duration, generate_csv_report, generate_github_summary,
    generate_html_report, generate_json_report,
};
use rayon::ThreadPoolBuilder;
use regex::Regex;
use std::io::Write;
use std::path::PathBuf;
use std::time::Instant;

/// FluxBench CLI arguments
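///
/// # Example
///
/// A minimal sketch (illustrative values) of building a `Cli` from explicit
/// arguments via clap's `Parser::parse_from`; the first element stands in for
/// the binary name:
///
/// ```ignore
/// use clap::Parser;
///
/// // Equivalent to: `my-bench-binary --format json -n 100 'parse_.*'`
/// let cli = fluxbench_cli::Cli::parse_from(["bench", "--format", "json", "-n", "100", "parse_.*"]);
/// assert_eq!(cli.format, "json");
/// assert_eq!(cli.samples, Some(100));
/// assert_eq!(cli.filter, "parse_.*");
/// ```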
#[derive(Parser, Debug)]
#[command(name = "fluxbench")]
#[command(author, version, about = "FluxBench - benchmarking framework for Rust")]
pub struct Cli {
    /// Optional subcommand (List, Run, Compare); defaults to Run
    #[command(subcommand)]
    pub command: Option<Commands>,

    /// Filter benchmarks by regex pattern
    #[arg(default_value = ".*")]
    pub filter: String,

    /// Output format: json, github-summary, csv, html, human
    #[arg(long, default_value = "human")]
    pub format: String,

    /// Output file (stdout if not specified)
    #[arg(short, long)]
    pub output: Option<PathBuf>,

    /// Load baseline for comparison
    /// Optionally specify a path; defaults to config or target/fluxbench/baseline.json
    #[arg(long)]
    pub baseline: Option<Option<PathBuf>>,

    /// Dry run - list benchmarks without executing
    #[arg(long)]
    pub dry_run: bool,

    /// Regression threshold percentage
    #[arg(long)]
    pub threshold: Option<f64>,

    /// Run benchmarks for this group only
    #[arg(long)]
    pub group: Option<String>,

    /// Filter by tag
    #[arg(long)]
    pub tag: Option<String>,

    /// Skip benchmarks with this tag
    #[arg(long)]
    pub skip_tag: Option<String>,

    /// Warmup time in seconds
    #[arg(long, default_value = "3")]
    pub warmup: u64,

    /// Measurement time in seconds
    #[arg(long, default_value = "5")]
    pub measurement: u64,

    /// Fixed sample count mode: skip warmup, run exactly N iterations
    /// Each iteration becomes one sample. Overrides warmup/measurement/min/max.
    #[arg(long, short = 'n')]
    pub samples: Option<u64>,

    /// Minimum number of iterations
    #[arg(long)]
    pub min_iterations: Option<u64>,

    /// Maximum number of iterations
    #[arg(long)]
    pub max_iterations: Option<u64>,

    /// Verbose output
    #[arg(short, long)]
    pub verbose: bool,

    /// Run benchmarks in isolated worker processes (default: true)
    /// Use --isolated=false to disable and run in-process
    #[arg(long, default_value = "true", action = clap::ArgAction::Set)]
    pub isolated: bool,

    /// Use fresh worker process for each benchmark (One-Shot mode)
    /// Default is Persistent mode: reuse worker for safe Rust code
    #[arg(long)]
    pub one_shot: bool,

    /// Worker timeout in seconds
    #[arg(long, default_value = "60")]
    pub worker_timeout: u64,

    /// Number of parallel isolated workers
    #[arg(long, default_value = "1")]
    pub jobs: usize,

    /// Number of threads for parallel statistics computation
    /// 0 = use all available cores (default), 1 = single-threaded
    #[arg(long, short = 'j', default_value = "0")]
    pub threads: usize,

    /// Internal: Run as worker process (used by supervisor)
    #[arg(long, hide = true)]
    pub flux_worker: bool,

    /// Save benchmark results as baseline JSON
    /// Optionally specify a path; defaults to config or target/fluxbench/baseline.json
    #[arg(long)]
    pub save_baseline: Option<Option<PathBuf>>,

    /// Internal: Absorb cargo bench's --bench flag
    #[arg(long, hide = true)]
    pub bench: bool,
}

/// CLI subcommands
#[derive(Subcommand, Debug)]
pub enum Commands {
    /// List all discovered benchmarks
    List,
    /// Run benchmarks (default)
    Run,
    /// Compare against a git ref
    Compare {
        /// Git ref to compare against (e.g., origin/main)
        #[arg(name = "REF")]
        git_ref: String,
    },
}

/// Run the FluxBench CLI, parsing arguments from the process command line.
/// This is the main entry point for benchmark binaries.
///
/// # Returns
/// Returns `Ok(())` on success, or an error if argument handling, execution,
/// or report generation fails.
pub fn run() -> anyhow::Result<()> {
    let cli = Cli::parse();
    run_with_cli(cli)
}

/// Run the FluxBench CLI with pre-parsed arguments.
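///
/// A hedged sketch of a wrapper `main` that tweaks the parsed flags before
/// delegating (forcing JSON output here is purely illustrative):
///
/// ```ignore
/// use clap::Parser;
///
/// fn main() -> anyhow::Result<()> {
///     let mut cli = fluxbench_cli::Cli::parse_from(std::env::args());
///     // Always emit machine-readable output in this binary.
///     cli.format = "json".to_string();
///     fluxbench_cli::run_with_cli(cli)
/// }
/// ```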
pub fn run_with_cli(cli: Cli) -> anyhow::Result<()> {
    // Handle worker mode first (before any other initialization)
    if cli.flux_worker {
        return run_worker_mode();
    }

    // Initialize logging
    if cli.verbose {
        tracing_subscriber::fmt()
            .with_env_filter("fluxbench=debug")
            .init();
    } else {
        tracing_subscriber::fmt()
            .with_env_filter("fluxbench=info")
            .init();
    }

    // Discover flux.toml configuration (CLI flags override)
    let config = FluxConfig::discover().unwrap_or_default();

    // Parse output format
    let format: OutputFormat = cli.format.parse().unwrap_or(OutputFormat::Human);

    // Resolve jobs: CLI wins if explicitly set (not default 1), else flux.toml, else 1
    let jobs = if cli.jobs != 1 {
        cli.jobs
    } else {
        config.runner.jobs.unwrap_or(1)
    };

    match cli.command {
        Some(Commands::List) => {
            list_benchmarks(&cli)?;
        }
        Some(Commands::Run) => {
            run_benchmarks(&cli, &config, format, jobs)?;
        }
        Some(Commands::Compare { ref git_ref }) => {
            compare_benchmarks(&cli, &config, git_ref, format)?;
        }
        None => {
            // Default: run benchmarks
            if cli.dry_run {
                list_benchmarks(&cli)?;
            } else {
                run_benchmarks(&cli, &config, format, jobs)?;
            }
        }
    }

    Ok(())
}

/// Run as a worker process (IPC mode)
fn run_worker_mode() -> anyhow::Result<()> {
    let mut worker = WorkerMain::new();
    worker
        .run()
        .map_err(|e| anyhow::anyhow!("Worker error: {}", e))
}

/// Filter benchmarks based on CLI options using the planner module.
///
/// Returns benchmarks sorted alphabetically by ID for deterministic execution.
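///
/// ```ignore
/// // Illustrative: with `--filter 'parse_.*' --skip-tag slow`, only benchmarks
/// // whose IDs match the regex and that are not tagged `slow` survive.
/// let selected = filter_benchmarks(&cli, &all_benchmarks);
/// assert!(selected.iter().all(|b| b.id.starts_with("parse_")));
/// ```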
fn filter_benchmarks(
    cli: &Cli,
    benchmarks: &[&'static BenchmarkDef],
) -> Vec<&'static BenchmarkDef> {
    let filter_re = Regex::new(&cli.filter).ok();

    let plan = planner::build_plan(
        benchmarks.iter().copied(),
        filter_re.as_ref(),
        cli.group.as_deref(),
        cli.tag.as_deref(),
        cli.skip_tag.as_deref(),
    );

    plan.benchmarks
}

fn list_benchmarks(cli: &Cli) -> anyhow::Result<()> {
    println!("FluxBench Plan:");

    let all_benchmarks: Vec<_> = inventory::iter::<BenchmarkDef>.into_iter().collect();
    let benchmarks = filter_benchmarks(cli, &all_benchmarks);

    let mut groups: std::collections::BTreeMap<&str, Vec<&BenchmarkDef>> =
        std::collections::BTreeMap::new();

    for bench in &benchmarks {
        groups.entry(bench.group).or_default().push(bench);
    }

    let mut total = 0;
    for (group, benches) in &groups {
        println!("├── group: {}", group);
        for bench in benches {
            let tags = if bench.tags.is_empty() {
                String::new()
            } else {
                format!(" [{}]", bench.tags.join(", "))
            };
            println!(
                "│   ├── {}{} ({}:{})",
                bench.id, tags, bench.file, bench.line
            );
            total += 1;
        }
    }

    println!("{} benchmarks found.", total);

    // Show all available tags across the entire suite (not just filtered results)
    // so users can discover what tags they can filter by.
    let mut tag_counts: std::collections::BTreeMap<&str, usize> = std::collections::BTreeMap::new();
    for bench in &all_benchmarks {
        for tag in bench.tags {
            *tag_counts.entry(tag).or_default() += 1;
        }
    }
    if !tag_counts.is_empty() {
        let tags_display: Vec<String> = tag_counts
            .iter()
            .map(|(tag, count)| format!("{} ({})", tag, count))
            .collect();
        println!("Tags: {}", tags_display.join(", "));
    }

    Ok(())
}

/// Build an ExecutionConfig by layering: flux.toml defaults → CLI overrides.
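///
/// A small sketch of the intended precedence (values are assumptions for
/// illustration): `--samples` wins outright; otherwise an explicit CLI
/// `--warmup`/`--measurement` beats `flux.toml`, which beats the built-in
/// 3 s / 5 s defaults.
///
/// ```ignore
/// // With `--samples 50` on the command line:
/// let cfg = build_execution_config(&cli, &config);
/// assert_eq!(cfg.min_iterations, Some(50));
/// assert_eq!(cfg.max_iterations, Some(50));
/// assert_eq!(cfg.warmup_time_ns, 0);
/// ```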
fn build_execution_config(cli: &Cli, config: &FluxConfig) -> ExecutionConfig {
    // Start from flux.toml values (parsed durations fall back to defaults on error)
    let warmup_ns = FluxConfig::parse_duration(&config.runner.warmup_time).unwrap_or(3_000_000_000);
    let measurement_ns =
        FluxConfig::parse_duration(&config.runner.measurement_time).unwrap_or(5_000_000_000);

    // CLI flags override config file values. We approximate "explicitly set" by
    // comparing against clap's defaults (warmup=3, measurement=5): a CLI value
    // that differs from its default is treated as user-provided and wins.
    let warmup_time_ns = if cli.warmup != 3 {
        cli.warmup * 1_000_000_000
    } else {
        warmup_ns
    };
    let measurement_time_ns = if cli.measurement != 5 {
        cli.measurement * 1_000_000_000
    } else {
        measurement_ns
    };

    // --samples N: fixed-count mode, no warmup, each iteration = one sample
    // CLI wins, then flux.toml
    if let Some(n) = cli.samples.or(config.runner.samples) {
        return ExecutionConfig {
            warmup_time_ns: 0,
            measurement_time_ns: 0,
            min_iterations: Some(n),
            max_iterations: Some(n),
            track_allocations: config.allocator.track,
            bootstrap_iterations: config.runner.bootstrap_iterations,
            confidence_level: config.runner.confidence_level,
        };
    }

    // min/max iterations: CLI wins if set, else config, else default
    let min_iterations = cli.min_iterations.or(config.runner.min_iterations);
    let max_iterations = cli.max_iterations.or(config.runner.max_iterations);

    ExecutionConfig {
        warmup_time_ns,
        measurement_time_ns,
        min_iterations,
        max_iterations,
        track_allocations: config.allocator.track,
        bootstrap_iterations: config.runner.bootstrap_iterations,
        confidence_level: config.runner.confidence_level,
    }
}

fn run_benchmarks(
    cli: &Cli,
    config: &FluxConfig,
    format: OutputFormat,
    jobs: usize,
) -> anyhow::Result<()> {
    let jobs = jobs.max(1);

    // Configure Rayon thread pool for statistics computation
    if cli.threads > 0 {
        ThreadPoolBuilder::new()
            .num_threads(cli.threads)
            .build_global()
            .ok();
    }

    // Discover benchmarks
    let all_benchmarks: Vec<_> = inventory::iter::<BenchmarkDef>.into_iter().collect();
    let benchmarks = filter_benchmarks(cli, &all_benchmarks);

    if benchmarks.is_empty() {
        // If filtering by tag and no matches, check if the tag exists at all
        if let Some(ref tag) = cli.tag {
            let all_tags: std::collections::BTreeSet<&str> = all_benchmarks
                .iter()
                .flat_map(|b| b.tags.iter().copied())
                .collect();
            if !all_tags.contains(tag.as_str()) {
                let available: Vec<&str> = all_tags.into_iter().collect();
                eprintln!(
                    "Warning: tag '{}' not found. Available tags: {}",
                    tag,
                    available.join(", ")
                );
            }
        }
        println!("No benchmarks found.");
        return Ok(());
    }

    // Determine isolation mode: if flux.toml disables isolation, run in-process
    // regardless of the flag; otherwise the CLI flag (default: true) decides.
    let isolated = if config.runner.isolation.is_isolated() {
        cli.isolated
    } else {
        false
    };
411
412    let threads_str = if cli.threads == 0 {
413        "all".to_string()
414    } else {
415        cli.threads.to_string()
416    };
417    let mode_str = if isolated {
418        if cli.one_shot {
419            " (isolated, one-shot)"
420        } else {
421            " (isolated, persistent)"
422        }
423    } else {
424        " (in-process)"
425    };
426    println!(
427        "Running {} benchmarks{}, {} threads, {} worker(s)...\n",
428        benchmarks.len(),
429        mode_str,
430        threads_str,
431        jobs
432    );
433
434    let start_time = Instant::now();
435
436    // Build execution config from flux.toml + CLI overrides
437    let exec_config = build_execution_config(cli, config);
438
439    if exec_config.bootstrap_iterations > 0 && exec_config.bootstrap_iterations < 100 {
440        eprintln!(
441            "Warning: bootstrap_iterations={} is very low; confidence intervals will be unreliable. \
442             Use >= 1000 for meaningful results, or 0 to skip bootstrap.",
443            exec_config.bootstrap_iterations
444        );
445    }
446
447    // Execute benchmarks (isolated by default per TDD)
448    let results = if isolated {
449        let timeout = std::time::Duration::from_secs(cli.worker_timeout);
450        let reuse_workers = !cli.one_shot;
451        let isolated_executor =
452            IsolatedExecutor::new(exec_config.clone(), timeout, reuse_workers, jobs);
453        isolated_executor.execute(&benchmarks)
454    } else {
455        if jobs > 1 {
456            eprintln!(
457                "Warning: --jobs currently applies only to isolated mode; running in-process serially."
458            );
459        }
460        let mut executor = Executor::new(exec_config.clone());
461        executor.execute(&benchmarks)
462    };
463
464    // Compute statistics
465    let stats = compute_statistics(&results, &exec_config);
466
467    // Warn if allocation tracking is enabled but nothing was recorded
468    if exec_config.track_allocations
469        && !results.is_empty()
470        && results
471            .iter()
472            .all(|r| r.alloc_bytes == 0 && r.alloc_count == 0)
473    {
474        eprintln!(
475            "Warning: allocation tracking enabled but all benchmarks reported 0 bytes allocated.\n\
476             Ensure TrackingAllocator is set as #[global_allocator] in your benchmark binary."
477        );
478    }
479
480    // Build report
481    let total_duration_ms = start_time.elapsed().as_secs_f64() * 1000.0;
482    let mut report = build_report(&results, &stats, &exec_config, total_duration_ms);
483
484    // Load and apply baseline comparison if --baseline was passed
485    if let Some(baseline_path) = resolve_baseline_path(&cli.baseline, config) {
486        if baseline_path.exists() {
487            match std::fs::read_to_string(&baseline_path).and_then(|json| {
488                serde_json::from_str::<fluxbench_report::Report>(&json)
489                    .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
490            }) {
491                Ok(baseline) => {
492                    let threshold = cli.threshold.unwrap_or(config.ci.regression_threshold);
493                    apply_baseline_comparison(&mut report, &baseline, threshold);
494                }
495                Err(e) => {
496                    eprintln!(
497                        "Warning: failed to load baseline {}: {}",
498                        baseline_path.display(),
499                        e
500                    );
501                }
502            }
503        } else {
504            eprintln!(
505                "Warning: baseline file not found: {}",
506                baseline_path.display()
507            );
508        }
509    }
510
511    // Run comparisons, synthetics, and verifications
512    let (comparison_results, comparison_series, synthetic_results, verification_results) =
513        execute_verifications(&results, &stats);
514    let verification_summary = aggregate_verifications(&verification_results);
515    report.comparisons = comparison_results;
516    report.comparison_series = comparison_series;
517    report.synthetics = synthetic_results;
518    report.verifications = verification_results;
519
520    // Update summary with verification info
521    report.summary.critical_failures = verification_summary.critical_failures;
522    report.summary.warnings = verification_summary.failed - verification_summary.critical_failures;
523
524    // Emit GitHub Actions annotations if enabled
525    if config.ci.github_annotations {
526        emit_github_annotations(&report);
527    }
528
529    // Generate output
530    let output = match format {
531        OutputFormat::Json => generate_json_report(&report)?,
532        OutputFormat::GithubSummary => generate_github_summary(&report),
533        OutputFormat::Html => generate_html_report(&report),
534        OutputFormat::Csv => generate_csv_report(&report),
535        OutputFormat::Human => format_human_output(&report),
536    };
537
538    // Write output
539    if let Some(ref path) = cli.output {
540        let mut file = std::fs::File::create(path)?;
541        file.write_all(output.as_bytes())?;
542        println!("Report written to: {}", path.display());
543    } else {
544        print!("{}", output);
545    }
546
547    // Save baseline if requested
548    save_baseline_if_needed(cli, config, &report)?;
549
550    // Exit with appropriate code
551    let has_crashes = report
552        .results
553        .iter()
554        .any(|r| matches!(r.status, fluxbench_report::BenchmarkStatus::Crashed));
555
556    if verification_summary.should_fail_ci() || has_crashes {
557        if has_crashes {
558            eprintln!("\nBenchmark(s) crashed during execution");
559        }
560        if verification_summary.should_fail_ci() {
561            eprintln!(
562                "\n{} critical verification failure(s)",
563                verification_summary.critical_failures + verification_summary.critical_errors
564            );
565        }
566        std::process::exit(1);
567    }
568
569    Ok(())
570}
571
572fn compare_benchmarks(
573    cli: &Cli,
574    config: &FluxConfig,
575    git_ref: &str,
576    format: OutputFormat,
577) -> anyhow::Result<()> {
578    // Load baseline — resolve path from CLI, config, or default
579    let baseline_path = resolve_baseline_path(&cli.baseline, config).ok_or_else(|| {
580        anyhow::anyhow!(
581            "--baseline required for comparison, or use 'compare' command with a git ref"
582        )
583    })?;
584
585    if !baseline_path.exists() {
586        return Err(anyhow::anyhow!(
587            "Baseline file not found: {}",
588            baseline_path.display()
589        ));
590    }
591
592    let baseline_json = std::fs::read_to_string(&baseline_path)?;
593    let baseline: fluxbench_report::Report = serde_json::from_str(&baseline_json)?;
594    let resolved_git_ref = resolve_git_ref(git_ref)?;
595
596    if let Some(baseline_commit) = baseline.meta.git_commit.as_deref() {
597        let matches_ref = baseline_commit == resolved_git_ref
598            || baseline_commit.starts_with(&resolved_git_ref)
599            || resolved_git_ref.starts_with(baseline_commit);
600        if !matches_ref {
601            return Err(anyhow::anyhow!(
602                "Baseline commit {} does not match git ref {} ({})",
603                baseline_commit,
604                git_ref,
605                resolved_git_ref
606            ));
607        }
608    } else {
609        eprintln!(
610            "Warning: baseline report has no commit metadata; git ref consistency cannot be verified."
611        );
612    }
613
614    println!("Comparing against baseline: {}", baseline_path.display());
615    println!("Git ref: {} ({})\n", git_ref, resolved_git_ref);
616
617    // Run current benchmarks
618    let all_benchmarks: Vec<_> = inventory::iter::<BenchmarkDef>.into_iter().collect();
619    let benchmarks = filter_benchmarks(cli, &all_benchmarks);
620
621    if benchmarks.is_empty() {
622        println!("No benchmarks found.");
623        return Ok(());
624    }
625
626    let start_time = Instant::now();
627
628    let exec_config = build_execution_config(cli, config);
629
630    let mut executor = Executor::new(exec_config.clone());
631    let results = executor.execute(&benchmarks);
632    let stats = compute_statistics(&results, &exec_config);
633
634    let total_duration_ms = start_time.elapsed().as_secs_f64() * 1000.0;
635    let mut report = build_report(&results, &stats, &exec_config, total_duration_ms);
636
637    // Apply baseline comparison data
638    let regression_threshold = cli.threshold.unwrap_or(config.ci.regression_threshold);
639    apply_baseline_comparison(&mut report, &baseline, regression_threshold);
640
641    // Run comparisons, synthetics, and verifications
642    let (comparison_results, comparison_series, synthetic_results, verification_results) =
643        execute_verifications(&results, &stats);
644    let verification_summary = aggregate_verifications(&verification_results);
645    report.comparisons = comparison_results;
646    report.comparison_series = comparison_series;
647    report.synthetics = synthetic_results;
648    report.verifications = verification_results;
649    report.summary.critical_failures = verification_summary.critical_failures;
650    report.summary.warnings = verification_summary.failed - verification_summary.critical_failures;
651
652    // Emit GitHub Actions annotations if enabled
653    if config.ci.github_annotations {
654        emit_github_annotations(&report);
655    }
656
657    // Generate output
658    let output = match format {
659        OutputFormat::Json => generate_json_report(&report)?,
660        OutputFormat::GithubSummary => generate_github_summary(&report),
661        OutputFormat::Html => generate_html_report(&report),
662        OutputFormat::Csv => generate_csv_report(&report),
663        OutputFormat::Human => format_comparison_output(&report, &baseline),
664    };
665
666    if let Some(ref path) = cli.output {
667        let mut file = std::fs::File::create(path)?;
668        file.write_all(output.as_bytes())?;
669        println!("Report written to: {}", path.display());
670    } else {
671        print!("{}", output);
672    }
673
674    // Save baseline if requested
675    save_baseline_if_needed(cli, config, &report)?;
676
677    // Exit with error if regressions exceed threshold or verifications fail
678    let should_fail = report.summary.regressions > 0 || verification_summary.should_fail_ci();
679    if should_fail {
680        if report.summary.regressions > 0 {
681            eprintln!(
682                "\n{} regression(s) detected above {}% threshold",
683                report.summary.regressions, regression_threshold
684            );
685        }
686        if verification_summary.should_fail_ci() {
687            eprintln!(
688                "\n{} critical verification failure(s)",
689                verification_summary.critical_failures + verification_summary.critical_errors
690            );
691        }
692        std::process::exit(1);
693    }
694
695    Ok(())
696}
697
698/// Save the report as a baseline JSON file if configured.
699fn save_baseline_if_needed(
700    cli: &Cli,
701    config: &FluxConfig,
702    report: &fluxbench_report::Report,
703) -> anyhow::Result<()> {
704    // Determine if we should save: CLI --save-baseline flag or config.output.save_baseline
705    let should_save = cli.save_baseline.is_some() || config.output.save_baseline;
706    if !should_save {
707        return Ok(());
708    }
709
710    // Resolve path: CLI value > config value > default
711    let path = cli
712        .save_baseline
713        .as_ref()
714        .and_then(|opt| opt.clone())
715        .or_else(|| config.output.baseline_path.as_ref().map(PathBuf::from))
716        .unwrap_or_else(|| PathBuf::from("target/fluxbench/baseline.json"));
717
718    if let Some(parent) = path.parent() {
719        std::fs::create_dir_all(parent)?;
720    }
721
722    let json = generate_json_report(report)?;
723    std::fs::write(&path, json)?;
724    eprintln!("Baseline saved to: {}", path.display());
725
726    Ok(())
727}
728
729/// Apply baseline comparison data to the report.
730///
731/// Computes per-benchmark regression/improvement metrics by comparing current
732/// results against baseline means, CI overlap, and effect size.
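///
/// The relative change is computed as `(current_mean - baseline_mean) / baseline_mean * 100`,
/// and a result is only marked significant when that change exceeds the effective
/// threshold and the two confidence intervals do not overlap (see the tests at the
/// bottom of this file for worked examples).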
fn apply_baseline_comparison(
    report: &mut fluxbench_report::Report,
    baseline: &fluxbench_report::Report,
    regression_threshold: f64,
) {
    report.baseline_meta = Some(baseline.meta.clone());

    let baseline_map: std::collections::HashMap<_, _> = baseline
        .results
        .iter()
        .filter_map(|r| r.metrics.as_ref().map(|m| (r.id.clone(), m.clone())))
        .collect();

    for result in &mut report.results {
        if let (Some(metrics), Some(baseline_metrics)) =
            (&result.metrics, baseline_map.get(&result.id))
        {
            // Use per-benchmark threshold if set (> 0.0), otherwise global
            let effective_threshold = if result.threshold > 0.0 {
                result.threshold
            } else {
                regression_threshold
            };

            let baseline_mean = baseline_metrics.mean_ns;
            let absolute_change = metrics.mean_ns - baseline_mean;
            let relative_change = if baseline_mean > 0.0 {
                (absolute_change / baseline_mean) * 100.0
            } else {
                0.0
            };

            let ci_non_overlap = metrics.ci_upper_ns < baseline_metrics.ci_lower_ns
                || metrics.ci_lower_ns > baseline_metrics.ci_upper_ns;
            let is_significant = relative_change.abs() > effective_threshold && ci_non_overlap;

            if relative_change > effective_threshold {
                report.summary.regressions += 1;
            } else if relative_change < -effective_threshold {
                report.summary.improvements += 1;
            }

            let mut effect_size = if metrics.std_dev_ns > f64::EPSILON {
                absolute_change / metrics.std_dev_ns
            } else {
                0.0
            };
            if !effect_size.is_finite() {
                effect_size = 0.0;
            }

            let probability_regression = if ci_non_overlap {
                if relative_change > 0.0 { 0.99 } else { 0.01 }
            } else if relative_change > 0.0 {
                0.60
            } else {
                0.40
            };

            result.comparison = Some(fluxbench_report::Comparison {
                baseline_mean_ns: baseline_mean,
                absolute_change_ns: absolute_change,
                relative_change,
                probability_regression,
                is_significant,
                effect_size,
            });
        }
    }
}

/// Resolve baseline path from CLI flag, config, or default.
///
/// - `Some(Some(path))` — explicit path from `--baseline /path/to/file`
/// - `Some(None)` — `--baseline` with no value, use config or default
/// - `None` — flag not passed at all
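///
/// ```ignore
/// // Sketch, assuming the default config leaves `output.baseline_path` unset:
/// let path = resolve_baseline_path(&Some(None), &FluxConfig::default());
/// assert_eq!(path, Some(PathBuf::from("target/fluxbench/baseline.json")));
/// assert_eq!(resolve_baseline_path(&None, &FluxConfig::default()), None);
/// ```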
fn resolve_baseline_path(
    cli_baseline: &Option<Option<PathBuf>>,
    config: &FluxConfig,
) -> Option<PathBuf> {
    match cli_baseline {
        Some(Some(path)) => Some(path.clone()),
        Some(None) => {
            // --baseline passed without path: use config or default
            Some(
                config
                    .output
                    .baseline_path
                    .as_ref()
                    .map(PathBuf::from)
                    .unwrap_or_else(|| PathBuf::from("target/fluxbench/baseline.json")),
            )
        }
        None => None,
    }
}

/// Emit `::error::` and `::warning::` annotations for GitHub Actions.
///
/// These appear inline on PR diffs when running in GitHub Actions CI.
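///
/// For example, a crashed benchmark produces a workflow command shaped like
/// (values here are illustrative):
///
/// ```text
/// ::error file=benches/parse.rs,line=42::parse_json: benchmark crashed
/// ```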
fn emit_github_annotations(report: &fluxbench_report::Report) {
    // Annotate crashed/failed benchmarks
    for result in &report.results {
        match result.status {
            fluxbench_report::BenchmarkStatus::Crashed => {
                let msg = result
                    .failure
                    .as_ref()
                    .map(|f| f.message.as_str())
                    .unwrap_or("benchmark crashed");
                println!(
                    "::error file={},line={}::{}: {}",
                    result.file, result.line, result.id, msg
                );
            }
            fluxbench_report::BenchmarkStatus::Failed => {
                let msg = result
                    .failure
                    .as_ref()
                    .map(|f| f.message.as_str())
                    .unwrap_or("benchmark failed");
                println!(
                    "::error file={},line={}::{}: {}",
                    result.file, result.line, result.id, msg
                );
            }
            _ => {}
        }

        // Annotate significant regressions
        if let Some(cmp) = &result.comparison {
            if cmp.is_significant && cmp.relative_change > 0.0 {
                println!(
                    "::error file={},line={}::{}: regression {:+.1}% ({} → {})",
                    result.file,
                    result.line,
                    result.id,
                    cmp.relative_change,
                    format_duration(cmp.baseline_mean_ns),
                    result
                        .metrics
                        .as_ref()
                        .map(|m| format_duration(m.mean_ns))
                        .unwrap_or_default(),
                );
            }
        }
    }

    // Annotate verification failures
    for v in &report.verifications {
        match &v.status {
            fluxbench_logic::VerificationStatus::Failed => {
                let level = match v.severity {
                    fluxbench_core::Severity::Critical => "error",
                    _ => "warning",
                };
                println!("::{}::{}: {}", level, v.id, v.message);
            }
            fluxbench_logic::VerificationStatus::Error { message } => {
                println!("::error::{}: evaluation error: {}", v.id, message);
            }
            _ => {}
        }
    }
}

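/// Resolve a git ref (branch, tag, or abbreviated commit) to a full commit hash
/// using `git rev-parse --verify`.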
fn resolve_git_ref(git_ref: &str) -> anyhow::Result<String> {
    let output = std::process::Command::new("git")
        .args(["rev-parse", "--verify", git_ref])
        .output()
        .map_err(|e| anyhow::anyhow!("Failed to resolve git ref '{}': {}", git_ref, e))?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(anyhow::anyhow!(
            "Invalid git ref '{}': {}",
            git_ref,
            stderr.trim()
        ));
    }

    let resolved = String::from_utf8(output.stdout)?.trim().to_string();
    if resolved.is_empty() {
        return Err(anyhow::anyhow!(
            "Git ref '{}' resolved to an empty commit hash",
            git_ref
        ));
    }

    Ok(resolved)
}

/// Format comparison output for human display
fn format_comparison_output(
    report: &fluxbench_report::Report,
    baseline: &fluxbench_report::Report,
) -> String {
    let mut output = String::new();

    output.push('\n');
    output.push_str("FluxBench Comparison Results\n");
    output.push_str(&"=".repeat(60));
    output.push_str("\n\n");

    output.push_str(&format!(
        "Baseline: {} ({})\n",
        baseline.meta.git_commit.as_deref().unwrap_or("unknown"),
        baseline.meta.timestamp.format("%Y-%m-%d %H:%M:%S")
    ));
    output.push_str(&format!(
        "Current:  {} ({})\n\n",
        report.meta.git_commit.as_deref().unwrap_or("unknown"),
        report.meta.timestamp.format("%Y-%m-%d %H:%M:%S")
    ));

    for result in &report.results {
        let status_icon = match result.status {
            fluxbench_report::BenchmarkStatus::Passed => "✓",
            fluxbench_report::BenchmarkStatus::Failed => "✗",
            fluxbench_report::BenchmarkStatus::Crashed => "💥",
            fluxbench_report::BenchmarkStatus::Skipped => "⊘",
        };

        output.push_str(&format!("{} {}\n", status_icon, result.id));

        if let (Some(metrics), Some(comparison)) = (&result.metrics, &result.comparison) {
            let change_icon = if comparison.relative_change > 5.0 {
                "📈 REGRESSION"
            } else if comparison.relative_change < -5.0 {
                "📉 improvement"
            } else {
                "≈ no change"
            };

            output.push_str(&format!(
                "    baseline: {} → current: {}\n",
                format_duration(comparison.baseline_mean_ns),
                format_duration(metrics.mean_ns),
            ));
            output.push_str(&format!(
                "    change: {:+.2}% ({}) {}\n",
                comparison.relative_change,
                format_duration(comparison.absolute_change_ns.abs()),
                change_icon,
            ));
        }

        output.push('\n');
    }

    // Summary
    output.push_str("Summary\n");
    output.push_str(&"-".repeat(60));
    output.push('\n');
    output.push_str(&format!(
        "  Regressions: {}  Improvements: {}  No Change: {}\n",
        report.summary.regressions,
        report.summary.improvements,
        report.summary.total_benchmarks - report.summary.regressions - report.summary.improvements
    ));

    output
}

#[cfg(test)]
mod tests {
    use super::*;
    use fluxbench_report::{
        BenchmarkMetrics, BenchmarkReportResult, BenchmarkStatus, Report, ReportConfig, ReportMeta,
        ReportSummary, SystemInfo,
    };

    fn dummy_meta() -> ReportMeta {
        ReportMeta {
            schema_version: 1,
            version: "0.1.0".to_string(),
            timestamp: chrono::Utc::now(),
            git_commit: None,
            git_branch: None,
            system: SystemInfo {
                os: "linux".to_string(),
                os_version: "6.0".to_string(),
                cpu: "test".to_string(),
                cpu_cores: 1,
                memory_gb: 1.0,
            },
            config: ReportConfig {
                warmup_time_ns: 0,
                measurement_time_ns: 0,
                min_iterations: None,
                max_iterations: None,
                bootstrap_iterations: 0,
                confidence_level: 0.95,
                track_allocations: false,
            },
        }
    }

    fn dummy_metrics(mean: f64) -> BenchmarkMetrics {
        BenchmarkMetrics {
            samples: 100,
            mean_ns: mean,
            median_ns: mean,
            std_dev_ns: mean * 0.01,
            min_ns: mean * 0.9,
            max_ns: mean * 1.1,
            p50_ns: mean,
            p90_ns: mean * 1.05,
            p95_ns: mean * 1.07,
            p99_ns: mean * 1.09,
            p999_ns: mean * 1.1,
            skewness: 0.0,
            kurtosis: 3.0,
            ci_lower_ns: mean * 0.98,
            ci_upper_ns: mean * 1.02,
            ci_level: 0.95,
            throughput_ops_sec: None,
            alloc_bytes: 0,
            alloc_count: 0,
            mean_cycles: 0.0,
            median_cycles: 0.0,
            min_cycles: 0,
            max_cycles: 0,
            cycles_per_ns: 0.0,
        }
    }

    fn dummy_result(id: &str, mean: f64, threshold: f64) -> BenchmarkReportResult {
        BenchmarkReportResult {
            id: id.to_string(),
            name: id.to_string(),
            group: "test".to_string(),
            status: BenchmarkStatus::Passed,
            severity: fluxbench_core::Severity::Warning,
            file: "test.rs".to_string(),
            line: 1,
            metrics: Some(dummy_metrics(mean)),
            threshold,
            comparison: None,
            failure: None,
        }
    }

    fn dummy_report(results: Vec<BenchmarkReportResult>) -> Report {
        let total = results.len();
        Report {
            meta: dummy_meta(),
            results,
            comparisons: vec![],
            comparison_series: vec![],
            synthetics: vec![],
            verifications: vec![],
            summary: ReportSummary {
                total_benchmarks: total,
                passed: total,
                ..Default::default()
            },
            baseline_meta: None,
        }
    }

    #[test]
    fn per_bench_threshold_overrides_global() {
        // Baseline: 100ns. Current: 108ns → 8% regression.
        // Global threshold: 25%. Per-bench threshold: 5%.
        // Should detect regression via per-bench threshold but not global.
        let mut report = dummy_report(vec![dummy_result("fast_bench", 108.0, 5.0)]);
        let baseline = dummy_report(vec![dummy_result("fast_bench", 100.0, 5.0)]);

        apply_baseline_comparison(&mut report, &baseline, 25.0);

        assert_eq!(
            report.summary.regressions, 1,
            "per-bench 5% should catch 8% regression"
        );
        let cmp = report.results[0].comparison.as_ref().unwrap();
        assert!(cmp.is_significant);
    }

    #[test]
    fn zero_threshold_falls_back_to_global() {
        // Baseline: 100ns. Current: 108ns → 8% regression.
        // Global threshold: 25%. Per-bench threshold: 0.0 (use global).
        // 8% < 25%, so no regression.
        let mut report = dummy_report(vec![dummy_result("normal_bench", 108.0, 0.0)]);
        let baseline = dummy_report(vec![dummy_result("normal_bench", 100.0, 0.0)]);

        apply_baseline_comparison(&mut report, &baseline, 25.0);

        assert_eq!(
            report.summary.regressions, 0,
            "8% under 25% global should not regress"
        );
        let cmp = report.results[0].comparison.as_ref().unwrap();
        assert!(!cmp.is_significant);
    }

    #[test]
    fn mixed_thresholds_independent() {
        // Two benchmarks: one with tight per-bench threshold, one using global.
        // Both regress by 8%.
        let mut report = dummy_report(vec![
            dummy_result("tight", 108.0, 5.0), // per-bench 5% → should regress
            dummy_result("loose", 108.0, 0.0), // global 25% → should not
        ]);
        let baseline = dummy_report(vec![
            dummy_result("tight", 100.0, 5.0),
            dummy_result("loose", 100.0, 0.0),
        ]);

        apply_baseline_comparison(&mut report, &baseline, 25.0);

        assert_eq!(report.summary.regressions, 1);
        assert!(
            report.results[0]
                .comparison
                .as_ref()
                .unwrap()
                .is_significant
        );
        assert!(
            !report.results[1]
                .comparison
                .as_ref()
                .unwrap()
                .is_significant
        );
    }

    #[test]
    fn per_bench_threshold_detects_improvement() {
        // Baseline: 100ns. Current: 90ns → -10% improvement.
        // Per-bench threshold: 5%.
        let mut report = dummy_report(vec![dummy_result("improving", 90.0, 5.0)]);
        let baseline = dummy_report(vec![dummy_result("improving", 100.0, 5.0)]);

        apply_baseline_comparison(&mut report, &baseline, 25.0);

        assert_eq!(report.summary.improvements, 1);
        assert_eq!(report.summary.regressions, 0);
    }
}
1175}