//! Benchmark report generation for OxideShield guard evaluation
//! (`oxideshield_guard/benchmark/report.rs`): renders suite results as
//! Markdown, JSON, or console text.

use serde::{Deserialize, Serialize};

use super::metrics::{BenchmarkTargets, ComparisonResult, GuardMetrics};
use super::runner::BenchmarkSuiteResults;
/// Output format for rendering a `BenchmarkReport`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReportFormat {
    /// Human-readable Markdown document with summary tables.
    Markdown,
    /// Pretty-printed JSON for machine consumption.
    Json,
    /// Plain-text tables suitable for terminal output.
    Console,
}
23
/// Aggregated benchmark results across one or more datasets, renderable
/// in any `ReportFormat`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkReport {
    /// Title shown at the top of the rendered report.
    pub title: String,
    /// Report creation time, RFC 3339 formatted.
    pub timestamp: String,
    /// Per-dataset benchmark suite results.
    pub results: Vec<BenchmarkSuiteResults>,
    /// Performance targets used to flag each guard's pass/fail status.
    pub targets: BenchmarkTargets,
    /// Pairwise head-to-head comparisons between guards.
    pub comparisons: Vec<ComparisonResult>,
}
38
39impl BenchmarkReport {
40 pub fn new(title: impl Into<String>, results: Vec<BenchmarkSuiteResults>) -> Self {
42 let comparisons = Self::generate_comparisons(&results);
43 Self {
44 title: title.into(),
45 timestamp: chrono::Utc::now().to_rfc3339(),
46 results,
47 targets: BenchmarkTargets::default(),
48 comparisons,
49 }
50 }
51
52 pub fn with_targets(mut self, targets: BenchmarkTargets) -> Self {
54 self.targets = targets;
55 self
56 }
57
58 fn generate_comparisons(results: &[BenchmarkSuiteResults]) -> Vec<ComparisonResult> {
60 let mut comparisons = Vec::new();
61
62 for suite in results {
63 let metrics = &suite.guard_metrics;
64 for i in 0..metrics.len() {
65 for j in (i + 1)..metrics.len() {
66 comparisons.push(ComparisonResult::compare(&metrics[i], &metrics[j]));
67 }
68 }
69 }
70
71 comparisons
72 }
73
74 pub fn to_markdown(&self) -> String {
76 let mut md = String::new();
77
78 md.push_str(&format!("# {}\n\n", self.title));
80 md.push_str(&format!("Generated: {}\n\n", self.timestamp));
81
82 md.push_str("## Performance Targets\n\n");
84 md.push_str("| Metric | Target | Notes |\n");
85 md.push_str("|--------|--------|-------|\n");
86 md.push_str(&format!(
87 "| F1 Score | ≥{:.2} | Llama Guard 3: 0.94 |\n",
88 self.targets.f1_target
89 ));
90 md.push_str(&format!(
91 "| Precision | ≥{:.2} | Minimize false positives |\n",
92 self.targets.precision_target
93 ));
94 md.push_str(&format!(
95 "| Recall | ≥{:.2} | Catch most attacks |\n",
96 self.targets.recall_target
97 ));
98 md.push_str(&format!(
99 "| Latency (p50) | ≤{:.0}ms | 2x faster than commercial |\n",
100 self.targets.p50_latency_target_ms
101 ));
102 md.push_str(&format!(
103 "| Latency (p99) | ≤{:.0}ms | Tail latency matters |\n",
104 self.targets.p99_latency_target_ms
105 ));
106 md.push_str(&format!(
107 "| False Positive Rate | ≤{:.1}% | Production safety |\n\n",
108 self.targets.fpr_target * 100.0
109 ));
110
111 for suite in &self.results {
113 md.push_str(&format!("## Dataset: {}\n\n", suite.dataset_name));
114 md.push_str(&format!("- **Total Samples**: {}\n", suite.dataset_size));
115 md.push_str(&format!("- **Attacks**: {}\n", suite.attack_count));
116 md.push_str(&format!("- **Benign**: {}\n\n", suite.benign_count));
117
118 md.push_str("### Summary\n\n");
120 md.push_str(
121 "| Guard | F1 | Precision | Recall | FPR | p50 (ms) | p99 (ms) | Status |\n",
122 );
123 md.push_str(
124 "|-------|-----|-----------|--------|-----|----------|----------|--------|\n",
125 );
126
127 for metrics in &suite.guard_metrics {
128 let meets_targets = self.targets.meets_targets(metrics);
129 let status = if meets_targets { "✅" } else { "⚠️" };
130
131 md.push_str(&format!(
132 "| {} | {:.3} | {:.3} | {:.3} | {:.1}% | {:.2} | {:.2} | {} |\n",
133 metrics.name,
134 metrics.f1_score(),
135 metrics.precision(),
136 metrics.recall(),
137 metrics.false_positive_rate() * 100.0,
138 metrics.p50_latency_ms(),
139 metrics.p99_latency_ms(),
140 status
141 ));
142 }
143 md.push('\n');
144
145 if !suite.guard_metrics.is_empty()
147 && !suite.guard_metrics[0].category_metrics.is_empty()
148 {
149 md.push_str("### Per-Category Detection Rates\n\n");
150 md.push_str("| Category | ");
151 for metrics in &suite.guard_metrics {
152 md.push_str(&format!("{} | ", metrics.name));
153 }
154 md.push_str("\n|----------|");
155 for _ in &suite.guard_metrics {
156 md.push_str("--------|");
157 }
158 md.push('\n');
159
160 let mut all_categories: Vec<String> = suite
162 .guard_metrics
163 .iter()
164 .flat_map(|m| m.category_metrics.keys().cloned())
165 .collect();
166 all_categories.sort();
167 all_categories.dedup();
168
169 for category in &all_categories {
170 md.push_str(&format!("| {} | ", category));
171 for metrics in &suite.guard_metrics {
172 if let Some(cat_metrics) = metrics.category_metrics.get(category) {
173 md.push_str(&format!("{:.1}% | ", cat_metrics.recall() * 100.0));
174 } else {
175 md.push_str("- | ");
176 }
177 }
178 md.push('\n');
179 }
180 md.push('\n');
181 }
182 }
183
184 if !self.comparisons.is_empty() {
186 md.push_str("## Head-to-Head Comparisons\n\n");
187
188 for comp in &self.comparisons {
189 md.push_str(&format!("### {} vs {}\n\n", comp.tool_a, comp.tool_b));
190 md.push_str("| Metric | Winner | Difference |\n");
191 md.push_str("|--------|--------|------------|\n");
192 md.push_str(&format!(
193 "| F1 Score | {} | {:+.3} |\n",
194 comp.winners.get("f1").unwrap_or(&"tie".to_string()),
195 comp.f1_diff
196 ));
197 md.push_str(&format!(
198 "| Precision | {} | {:+.3} |\n",
199 comp.winners.get("precision").unwrap_or(&"tie".to_string()),
200 comp.precision_diff
201 ));
202 md.push_str(&format!(
203 "| Recall | {} | {:+.3} |\n",
204 comp.winners.get("recall").unwrap_or(&"tie".to_string()),
205 comp.recall_diff
206 ));
207 md.push_str(&format!(
208 "| Latency | {} | {:.2}x |\n\n",
209 comp.winners.get("latency").unwrap_or(&"tie".to_string()),
210 comp.latency_ratio
211 ));
212 }
213 }
214
215 md.push_str("## References\n\n");
217 md.push_str(
218 "- [Llama Guard 3](https://huggingface.co/meta-llama/Llama-Guard-3-8B) - Meta AI\n",
219 );
220 md.push_str("- [LLM Guard](https://github.com/protectai/llm-guard) - ProtectAI\n");
221 md.push_str("- [JailbreakBench](https://jailbreakbench.github.io/) - Standard benchmark\n");
222 md.push_str(
223 "- [PromptGuard](https://www.nature.com/articles/s41598-025-31086-y) - Nature 2025\n\n",
224 );
225
226 md.push_str("---\n");
227 md.push_str("*Generated by OxideShield Benchmark Suite*\n");
228
229 md
230 }
231
232 pub fn to_json(&self) -> serde_json::Result<String> {
234 serde_json::to_string_pretty(self)
235 }
236
237 pub fn to_console(&self) -> String {
239 let mut out = String::new();
240
241 out.push_str(&format!("\n{}\n", "=".repeat(80)));
242 out.push_str(&format!("{:^80}\n", self.title));
243 out.push_str(&format!("{}\n\n", "=".repeat(80)));
244
245 for suite in &self.results {
246 out.push_str(&format!(
247 "Dataset: {} ({} samples)\n",
248 suite.dataset_name, suite.dataset_size
249 ));
250 out.push_str(&format!("{}\n", "-".repeat(60)));
251
252 out.push_str(&format!(
253 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>10}\n",
254 "Guard", "F1", "Prec", "Recall", "FPR", "p50 (ms)"
255 ));
256 out.push_str(&format!("{}\n", "-".repeat(60)));
257
258 for metrics in &suite.guard_metrics {
259 let meets = self.targets.meets_targets(metrics);
260 let indicator = if meets { "✓" } else { "✗" };
261
262 out.push_str(&format!(
263 "{:<20} {:>7.3} {:>7.3} {:>7.3} {:>7.1}% {:>9.2} {}\n",
264 metrics.name,
265 metrics.f1_score(),
266 metrics.precision(),
267 metrics.recall(),
268 metrics.false_positive_rate() * 100.0,
269 metrics.p50_latency_ms(),
270 indicator
271 ));
272 }
273 out.push('\n');
274 }
275
276 if let Some(suite) = self.results.first() {
278 if let Some(best_f1) = suite.best_by_f1() {
279 out.push_str(&format!(
280 "Best F1: {} ({:.3})\n",
281 best_f1.name,
282 best_f1.f1_score()
283 ));
284 }
285 if let Some(fastest) = suite.fastest_by_p50() {
286 out.push_str(&format!(
287 "Fastest: {} ({:.2}ms p50)\n",
288 fastest.name,
289 fastest.p50_latency_ms()
290 ));
291 }
292 }
293
294 out.push_str(&format!("\n{}\n", "=".repeat(80)));
295
296 out
297 }
298
299 pub fn render(&self, format: ReportFormat) -> String {
301 match format {
302 ReportFormat::Markdown => self.to_markdown(),
303 ReportFormat::Json => self
304 .to_json()
305 .unwrap_or_else(|e| format!("JSON error: {}", e)),
306 ReportFormat::Console => self.to_console(),
307 }
308 }
309}
310
/// Flattened, serializable snapshot of one guard's headline numbers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuickSummary {
    /// Name of the guard being summarized.
    pub guard_name: String,
    /// Harmonic mean of precision and recall.
    pub f1_score: f64,
    /// TP / (TP + FP).
    pub precision: f64,
    /// TP / (TP + FN).
    pub recall: f64,
    /// FP / (FP + TN), as a fraction (not a percentage).
    pub false_positive_rate: f64,
    /// Median request latency in milliseconds.
    pub p50_latency_ms: f64,
    /// 99th-percentile request latency in milliseconds.
    pub p99_latency_ms: f64,
    /// Requests processed per unit time (units defined by `GuardMetrics`).
    pub throughput: f64,
    /// Whether the guard satisfies all configured benchmark targets.
    pub meets_targets: bool,
}
324
325impl QuickSummary {
326 pub fn from_metrics(metrics: &GuardMetrics, targets: &BenchmarkTargets) -> Self {
328 Self {
329 guard_name: metrics.name.clone(),
330 f1_score: metrics.f1_score(),
331 precision: metrics.precision(),
332 recall: metrics.recall(),
333 false_positive_rate: metrics.false_positive_rate(),
334 p50_latency_ms: metrics.p50_latency_ms(),
335 p99_latency_ms: metrics.p99_latency_ms(),
336 throughput: metrics.throughput(),
337 meets_targets: targets.meets_targets(metrics),
338 }
339 }
340}
341
#[cfg(test)]
mod tests {
    use super::*;

    /// Build fixture metrics over 100 attack and 100 benign samples.
    /// `detection_rate` is the fraction of attacks caught (the old `f1`
    /// parameter name was misleading — it sets TP count, not F1 directly).
    fn create_test_metrics(name: &str, detection_rate: f64) -> GuardMetrics {
        let mut metrics = GuardMetrics::new(name);
        metrics.true_positives = (detection_rate * 100.0) as usize;
        metrics.false_negatives = 100 - metrics.true_positives;
        metrics.true_negatives = 95;
        metrics.false_positives = 5;
        metrics.latencies_ms = vec![10.0, 15.0, 20.0, 25.0, 30.0];
        metrics
    }

    /// Build a suite fixture with an even attack/benign split of `total`
    /// samples (dedupes the literal previously repeated in every test).
    fn create_test_suite(
        name: &str,
        total: usize,
        guard_metrics: Vec<GuardMetrics>,
    ) -> BenchmarkSuiteResults {
        BenchmarkSuiteResults {
            dataset_name: name.into(),
            dataset_size: total,
            attack_count: total / 2,
            benign_count: total / 2,
            guard_metrics,
            timestamp: chrono::Utc::now().to_rfc3339(),
        }
    }

    #[test]
    fn test_markdown_report() {
        let suite = create_test_suite(
            "Test Dataset",
            100,
            vec![
                create_test_metrics("GuardA", 0.90),
                create_test_metrics("GuardB", 0.85),
            ],
        );

        let report = BenchmarkReport::new("Test Report", vec![suite]);
        let markdown = report.to_markdown();

        assert!(markdown.contains("Test Report"));
        assert!(markdown.contains("GuardA"));
        assert!(markdown.contains("GuardB"));
        assert!(markdown.contains("Performance Targets"));
    }

    #[test]
    fn test_console_output() {
        let suite = create_test_suite(
            "Test Dataset",
            100,
            vec![create_test_metrics("TestGuard", 0.90)],
        );

        let report = BenchmarkReport::new("Console Test", vec![suite]);
        let console = report.to_console();

        assert!(console.contains("Console Test"));
        assert!(console.contains("TestGuard"));
    }

    #[test]
    fn test_json_report() {
        let suite = create_test_suite(
            "JSON Test",
            50,
            vec![create_test_metrics("JSONGuard", 0.92)],
        );

        let report = BenchmarkReport::new("JSON Test", vec![suite]);
        let json = report.to_json().unwrap();

        assert!(json.contains("JSON Test"));
        assert!(json.contains("JSONGuard"));
    }

    #[test]
    fn test_quick_summary() {
        let metrics = create_test_metrics("QuickTest", 0.93);
        let targets = BenchmarkTargets::default();
        let summary = QuickSummary::from_metrics(&metrics, &targets);

        assert_eq!(summary.guard_name, "QuickTest");
        assert!(summary.f1_score > 0.0);
    }
}
425}