// oxideshield_guard/benchmark/report.rs
//! Benchmark Report Generation
//!
//! Generates comprehensive benchmark reports in multiple formats:
//! - Markdown (human readable)
//! - JSON (machine readable)
//! - Console output (quick view)

use serde::{Deserialize, Serialize};

use super::metrics::{BenchmarkTargets, ComparisonResult, GuardMetrics};
use super::runner::BenchmarkSuiteResults;

/// Output format for a rendered benchmark report.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReportFormat {
    /// Markdown (human readable).
    Markdown,
    /// JSON (machine readable).
    Json,
    /// Console-friendly plain text.
    Console,
}

24/// A complete benchmark report
25#[derive(Debug, Clone, Serialize, Deserialize)]
26pub struct BenchmarkReport {
27    /// Report title
28    pub title: String,
29    /// Generation timestamp
30    pub timestamp: String,
31    /// Suite results
32    pub results: Vec<BenchmarkSuiteResults>,
33    /// Targets used
34    pub targets: BenchmarkTargets,
35    /// Comparisons between tools
36    pub comparisons: Vec<ComparisonResult>,
37}
38
39impl BenchmarkReport {
40    /// Create a new report from suite results
41    pub fn new(title: impl Into<String>, results: Vec<BenchmarkSuiteResults>) -> Self {
42        let comparisons = Self::generate_comparisons(&results);
43        Self {
44            title: title.into(),
45            timestamp: chrono::Utc::now().to_rfc3339(),
46            results,
47            targets: BenchmarkTargets::default(),
48            comparisons,
49        }
50    }
51
52    /// Set custom targets
53    pub fn with_targets(mut self, targets: BenchmarkTargets) -> Self {
54        self.targets = targets;
55        self
56    }
57
58    /// Generate pairwise comparisons
59    fn generate_comparisons(results: &[BenchmarkSuiteResults]) -> Vec<ComparisonResult> {
60        let mut comparisons = Vec::new();
61
62        for suite in results {
63            let metrics = &suite.guard_metrics;
64            for i in 0..metrics.len() {
65                for j in (i + 1)..metrics.len() {
66                    comparisons.push(ComparisonResult::compare(&metrics[i], &metrics[j]));
67                }
68            }
69        }
70
71        comparisons
72    }
73
74    /// Generate markdown report
75    pub fn to_markdown(&self) -> String {
76        let mut md = String::new();
77
78        // Header
79        md.push_str(&format!("# {}\n\n", self.title));
80        md.push_str(&format!("Generated: {}\n\n", self.timestamp));
81
82        // Targets section
83        md.push_str("## Performance Targets\n\n");
84        md.push_str("| Metric | Target | Notes |\n");
85        md.push_str("|--------|--------|-------|\n");
86        md.push_str(&format!(
87            "| F1 Score | ≥{:.2} | Llama Guard 3: 0.94 |\n",
88            self.targets.f1_target
89        ));
90        md.push_str(&format!(
91            "| Precision | ≥{:.2} | Minimize false positives |\n",
92            self.targets.precision_target
93        ));
94        md.push_str(&format!(
95            "| Recall | ≥{:.2} | Catch most attacks |\n",
96            self.targets.recall_target
97        ));
98        md.push_str(&format!(
99            "| Latency (p50) | ≤{:.0}ms | 2x faster than commercial |\n",
100            self.targets.p50_latency_target_ms
101        ));
102        md.push_str(&format!(
103            "| Latency (p99) | ≤{:.0}ms | Tail latency matters |\n",
104            self.targets.p99_latency_target_ms
105        ));
106        md.push_str(&format!(
107            "| False Positive Rate | ≤{:.1}% | Production safety |\n\n",
108            self.targets.fpr_target * 100.0
109        ));
110
111        // Results per dataset
112        for suite in &self.results {
113            md.push_str(&format!("## Dataset: {}\n\n", suite.dataset_name));
114            md.push_str(&format!("- **Total Samples**: {}\n", suite.dataset_size));
115            md.push_str(&format!("- **Attacks**: {}\n", suite.attack_count));
116            md.push_str(&format!("- **Benign**: {}\n\n", suite.benign_count));
117
118            // Summary table
119            md.push_str("### Summary\n\n");
120            md.push_str(
121                "| Guard | F1 | Precision | Recall | FPR | p50 (ms) | p99 (ms) | Status |\n",
122            );
123            md.push_str(
124                "|-------|-----|-----------|--------|-----|----------|----------|--------|\n",
125            );
126
127            for metrics in &suite.guard_metrics {
128                let meets_targets = self.targets.meets_targets(metrics);
129                let status = if meets_targets { "✅" } else { "⚠️" };
130
131                md.push_str(&format!(
132                    "| {} | {:.3} | {:.3} | {:.3} | {:.1}% | {:.2} | {:.2} | {} |\n",
133                    metrics.name,
134                    metrics.f1_score(),
135                    metrics.precision(),
136                    metrics.recall(),
137                    metrics.false_positive_rate() * 100.0,
138                    metrics.p50_latency_ms(),
139                    metrics.p99_latency_ms(),
140                    status
141                ));
142            }
143            md.push('\n');
144
145            // Per-category breakdown
146            if !suite.guard_metrics.is_empty()
147                && !suite.guard_metrics[0].category_metrics.is_empty()
148            {
149                md.push_str("### Per-Category Detection Rates\n\n");
150                md.push_str("| Category | ");
151                for metrics in &suite.guard_metrics {
152                    md.push_str(&format!("{} | ", metrics.name));
153                }
154                md.push_str("\n|----------|");
155                for _ in &suite.guard_metrics {
156                    md.push_str("--------|");
157                }
158                md.push('\n');
159
160                // Collect all categories
161                let mut all_categories: Vec<String> = suite
162                    .guard_metrics
163                    .iter()
164                    .flat_map(|m| m.category_metrics.keys().cloned())
165                    .collect();
166                all_categories.sort();
167                all_categories.dedup();
168
169                for category in &all_categories {
170                    md.push_str(&format!("| {} | ", category));
171                    for metrics in &suite.guard_metrics {
172                        if let Some(cat_metrics) = metrics.category_metrics.get(category) {
173                            md.push_str(&format!("{:.1}% | ", cat_metrics.recall() * 100.0));
174                        } else {
175                            md.push_str("- | ");
176                        }
177                    }
178                    md.push('\n');
179                }
180                md.push('\n');
181            }
182        }
183
184        // Comparisons section
185        if !self.comparisons.is_empty() {
186            md.push_str("## Head-to-Head Comparisons\n\n");
187
188            for comp in &self.comparisons {
189                md.push_str(&format!("### {} vs {}\n\n", comp.tool_a, comp.tool_b));
190                md.push_str("| Metric | Winner | Difference |\n");
191                md.push_str("|--------|--------|------------|\n");
192                md.push_str(&format!(
193                    "| F1 Score | {} | {:+.3} |\n",
194                    comp.winners.get("f1").unwrap_or(&"tie".to_string()),
195                    comp.f1_diff
196                ));
197                md.push_str(&format!(
198                    "| Precision | {} | {:+.3} |\n",
199                    comp.winners.get("precision").unwrap_or(&"tie".to_string()),
200                    comp.precision_diff
201                ));
202                md.push_str(&format!(
203                    "| Recall | {} | {:+.3} |\n",
204                    comp.winners.get("recall").unwrap_or(&"tie".to_string()),
205                    comp.recall_diff
206                ));
207                md.push_str(&format!(
208                    "| Latency | {} | {:.2}x |\n\n",
209                    comp.winners.get("latency").unwrap_or(&"tie".to_string()),
210                    comp.latency_ratio
211                ));
212            }
213        }
214
215        // Research references
216        md.push_str("## References\n\n");
217        md.push_str(
218            "- [Llama Guard 3](https://huggingface.co/meta-llama/Llama-Guard-3-8B) - Meta AI\n",
219        );
220        md.push_str("- [LLM Guard](https://github.com/protectai/llm-guard) - ProtectAI\n");
221        md.push_str("- [JailbreakBench](https://jailbreakbench.github.io/) - Standard benchmark\n");
222        md.push_str(
223            "- [PromptGuard](https://www.nature.com/articles/s41598-025-31086-y) - Nature 2025\n\n",
224        );
225
226        md.push_str("---\n");
227        md.push_str("*Generated by OxideShield Benchmark Suite*\n");
228
229        md
230    }
231
232    /// Generate JSON report
233    pub fn to_json(&self) -> serde_json::Result<String> {
234        serde_json::to_string_pretty(self)
235    }
236
237    /// Generate console-friendly output
238    pub fn to_console(&self) -> String {
239        let mut out = String::new();
240
241        out.push_str(&format!("\n{}\n", "=".repeat(80)));
242        out.push_str(&format!("{:^80}\n", self.title));
243        out.push_str(&format!("{}\n\n", "=".repeat(80)));
244
245        for suite in &self.results {
246            out.push_str(&format!(
247                "Dataset: {} ({} samples)\n",
248                suite.dataset_name, suite.dataset_size
249            ));
250            out.push_str(&format!("{}\n", "-".repeat(60)));
251
252            out.push_str(&format!(
253                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>10}\n",
254                "Guard", "F1", "Prec", "Recall", "FPR", "p50 (ms)"
255            ));
256            out.push_str(&format!("{}\n", "-".repeat(60)));
257
258            for metrics in &suite.guard_metrics {
259                let meets = self.targets.meets_targets(metrics);
260                let indicator = if meets { "✓" } else { "✗" };
261
262                out.push_str(&format!(
263                    "{:<20} {:>7.3} {:>7.3} {:>7.3} {:>7.1}% {:>9.2} {}\n",
264                    metrics.name,
265                    metrics.f1_score(),
266                    metrics.precision(),
267                    metrics.recall(),
268                    metrics.false_positive_rate() * 100.0,
269                    metrics.p50_latency_ms(),
270                    indicator
271                ));
272            }
273            out.push('\n');
274        }
275
276        // Best performers
277        if let Some(suite) = self.results.first() {
278            if let Some(best_f1) = suite.best_by_f1() {
279                out.push_str(&format!(
280                    "Best F1: {} ({:.3})\n",
281                    best_f1.name,
282                    best_f1.f1_score()
283                ));
284            }
285            if let Some(fastest) = suite.fastest_by_p50() {
286                out.push_str(&format!(
287                    "Fastest: {} ({:.2}ms p50)\n",
288                    fastest.name,
289                    fastest.p50_latency_ms()
290                ));
291            }
292        }
293
294        out.push_str(&format!("\n{}\n", "=".repeat(80)));
295
296        out
297    }
298
299    /// Render to specified format
300    pub fn render(&self, format: ReportFormat) -> String {
301        match format {
302            ReportFormat::Markdown => self.to_markdown(),
303            ReportFormat::Json => self
304                .to_json()
305                .unwrap_or_else(|e| format!("JSON error: {}", e)),
306            ReportFormat::Console => self.to_console(),
307        }
308    }
309}
310
311/// Quick benchmark summary for a single guard
312#[derive(Debug, Clone, Serialize, Deserialize)]
313pub struct QuickSummary {
314    pub guard_name: String,
315    pub f1_score: f64,
316    pub precision: f64,
317    pub recall: f64,
318    pub false_positive_rate: f64,
319    pub p50_latency_ms: f64,
320    pub p99_latency_ms: f64,
321    pub throughput: f64,
322    pub meets_targets: bool,
323}
324
325impl QuickSummary {
326    /// Create from guard metrics
327    pub fn from_metrics(metrics: &GuardMetrics, targets: &BenchmarkTargets) -> Self {
328        Self {
329            guard_name: metrics.name.clone(),
330            f1_score: metrics.f1_score(),
331            precision: metrics.precision(),
332            recall: metrics.recall(),
333            false_positive_rate: metrics.false_positive_rate(),
334            p50_latency_ms: metrics.p50_latency_ms(),
335            p99_latency_ms: metrics.p99_latency_ms(),
336            throughput: metrics.throughput(),
337            meets_targets: targets.meets_targets(metrics),
338        }
339    }
340}
341
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `GuardMetrics` whose confusion counts roughly produce `f1`.
    fn create_test_metrics(name: &str, f1: f64) -> GuardMetrics {
        let mut metrics = GuardMetrics::new(name);
        let tp = (f1 * 100.0) as usize;
        metrics.true_positives = tp;
        metrics.false_negatives = 100 - tp;
        metrics.true_negatives = 95;
        metrics.false_positives = 5;
        metrics.latencies_ms = vec![10.0, 15.0, 20.0, 25.0, 30.0];
        metrics
    }

    /// Build a suite with an even attack/benign split over `total` samples.
    fn sample_suite(
        dataset: &str,
        total: usize,
        guards: Vec<GuardMetrics>,
    ) -> BenchmarkSuiteResults {
        BenchmarkSuiteResults {
            dataset_name: dataset.into(),
            dataset_size: total,
            attack_count: total / 2,
            benign_count: total / 2,
            guard_metrics: guards,
            timestamp: chrono::Utc::now().to_rfc3339(),
        }
    }

    #[test]
    fn test_markdown_report() {
        let suite = sample_suite(
            "Test Dataset",
            100,
            vec![
                create_test_metrics("GuardA", 0.90),
                create_test_metrics("GuardB", 0.85),
            ],
        );

        let markdown = BenchmarkReport::new("Test Report", vec![suite]).to_markdown();

        assert!(markdown.contains("Test Report"));
        assert!(markdown.contains("GuardA"));
        assert!(markdown.contains("GuardB"));
        assert!(markdown.contains("Performance Targets"));
    }

    #[test]
    fn test_console_output() {
        let suite = sample_suite(
            "Test Dataset",
            100,
            vec![create_test_metrics("TestGuard", 0.90)],
        );

        let console = BenchmarkReport::new("Console Test", vec![suite]).to_console();

        assert!(console.contains("Console Test"));
        assert!(console.contains("TestGuard"));
    }

    #[test]
    fn test_json_report() {
        let suite = sample_suite(
            "JSON Test",
            50,
            vec![create_test_metrics("JSONGuard", 0.92)],
        );

        let json = BenchmarkReport::new("JSON Test", vec![suite])
            .to_json()
            .unwrap();

        assert!(json.contains("JSON Test"));
        assert!(json.contains("JSONGuard"));
    }

    #[test]
    fn test_quick_summary() {
        let metrics = create_test_metrics("QuickTest", 0.93);
        let summary = QuickSummary::from_metrics(&metrics, &BenchmarkTargets::default());

        assert_eq!(summary.guard_name, "QuickTest");
        assert!(summary.f1_score > 0.0);
    }
}