use crate::error::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::Instant;
use super::{
executor::{ProtocolInput, ProtocolOutput},
validation::{DeepSeekValidationResult, ValidationVerdict},
validation_executor::{ValidatingProtocolExecutor, ValidationExecutorConfig, ValidationLevel},
};
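/// Configuration for a benchmark run: which scenarios to execute, how many iterations
/// to run per scenario/validation-level pair, which validation levels to compare,
/// whether to collect statistics, and the per-run timeout.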
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkConfig {
pub scenarios: Vec<BenchmarkScenario>,
pub iterations: usize,
pub validation_levels: Vec<ValidationLevel>,
pub enable_statistics: bool,
pub timeout_secs: u64,
}
impl Default for BenchmarkConfig {
fn default() -> Self {
Self {
scenarios: vec![
BenchmarkScenario::BusinessDecision,
BenchmarkScenario::TechnicalArchitecture,
BenchmarkScenario::ComplianceAnalysis,
BenchmarkScenario::RiskAssessment,
],
iterations: 10,
validation_levels: vec![
ValidationLevel::None,
ValidationLevel::Quick,
ValidationLevel::Standard,
ValidationLevel::Rigorous,
],
enable_statistics: true,
timeout_secs: 300,
}
}
}
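/// The benchmark scenarios; each maps to a canned analysis query built by
/// `generate_scenario_input`.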
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BenchmarkScenario {
BusinessDecision,
TechnicalArchitecture,
ComplianceAnalysis,
RiskAssessment,
StrategicPlanning,
MultiPerspectiveAnalysis,
}
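/// Aggregated metrics for one scenario executed at one validation level.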
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScenarioResult {
pub scenario: BenchmarkScenario,
pub validation_level: ValidationLevel,
pub iterations: usize,
pub average_duration_ms: f64,
pub average_confidence: f64,
pub average_validation_confidence: f64,
pub success_rate: f64,
pub validation_success_rate: f64,
pub token_usage: TokenUsageMetrics,
pub validation_findings: Vec<ValidationFindingStat>,
}
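/// Token consumption and cost figures, averaged over the iterations that produced
/// validation data.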
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TokenUsageMetrics {
pub average_input_tokens: f64,
pub average_output_tokens: f64,
pub average_total_tokens: f64,
pub average_cost_usd: f64,
pub token_per_second: f64,
}
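/// Per-category statistics derived from the validation findings observed in a scenario.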
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidationFindingStat {
pub finding_category: String,
pub average_severity: f64,
pub frequency: f64,
pub average_confidence_impact: f64,
}
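/// Complete output of a benchmark run. `scenario_results` is keyed by
/// `"{scenario:?}_{validation_level:?}"`, e.g. `"BusinessDecision_Standard"`.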
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkResults {
pub config: BenchmarkConfig,
pub scenario_results: HashMap<String, ScenarioResult>,
pub summary: BenchmarkSummary,
pub timestamp: chrono::DateTime<chrono::Utc>,
pub version: String,
}
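/// High-level roll-up across all scenario results, including derived recommendations
/// and observed performance improvements.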
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSummary {
pub total_duration_seconds: f64,
pub total_iterations: usize,
pub overall_success_rate: f64,
pub average_confidence_gain: f64,
pub cost_per_validation: f64,
pub performance_improvements: Vec<String>,
pub recommendations: Vec<String>,
}
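/// Drives the benchmark: runs every configured scenario at every configured
/// validation level and aggregates the results.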
#[derive(Default)]
pub struct DeepSeekBenchmarkRunner {
config: BenchmarkConfig,
}
impl DeepSeekBenchmarkRunner {
pub fn new(config: BenchmarkConfig) -> Self {
Self { config }
}
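    /// Runs the full benchmark matrix and returns the aggregated [`BenchmarkResults`].
    ///
    /// A minimal usage sketch (not compiled as a doctest, since it needs an async
    /// runtime and a configured validation executor):
    ///
    /// ```ignore
    /// let runner = DeepSeekBenchmarkRunner::new(BenchmarkConfig::default());
    /// let results = runner.run_benchmark().await?;
    /// println!("overall success rate: {:.2}", results.summary.overall_success_rate);
    /// ```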
pub async fn run_benchmark(&self) -> Result<BenchmarkResults> {
let start_time = Instant::now();
let mut scenario_results = HashMap::new();
let mut total_iterations = 0;
for scenario in &self.config.scenarios {
for validation_level in &self.config.validation_levels {
let result = self.run_scenario(*scenario, *validation_level).await?;
total_iterations += result.iterations;
let key = format!("{:?}_{:?}", scenario, validation_level);
scenario_results.insert(key, result);
}
}
let total_duration = start_time.elapsed().as_secs_f64();
let summary = self.calculate_summary(&scenario_results, total_duration, total_iterations);
Ok(BenchmarkResults {
config: self.config.clone(),
scenario_results,
summary,
timestamp: chrono::Utc::now(),
version: env!("CARGO_PKG_VERSION").to_string(),
})
}
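    /// Executes one scenario `config.iterations` times at the given validation level,
    /// collecting duration, confidence, success-rate, token, and finding statistics.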
async fn run_scenario(
&self,
scenario: BenchmarkScenario,
validation_level: ValidationLevel,
) -> Result<ScenarioResult> {
let mut durations = Vec::new();
let mut confidences = Vec::new();
let mut validation_confidences = Vec::new();
let mut success_count = 0;
let mut validation_success_count = 0;
let mut token_metrics = Vec::new();
let mut findings_stats = HashMap::new();
for i in 0..self.config.iterations {
let input = self.generate_scenario_input(scenario, i);
let start_time = Instant::now();
let result = self
.execute_validation_run(&input, validation_level)
.await?;
let duration = start_time.elapsed().as_millis() as f64;
durations.push(duration);
confidences.push(result.confidence);
if result.success {
success_count += 1;
}
if let Some(validation_data) = result.data.get("deepseek_validation") {
if let Ok(validation_result) =
serde_json::from_value::<DeepSeekValidationResult>(validation_data.clone())
{
validation_confidences.push(validation_result.validation_confidence);
if validation_result.verdict == ValidationVerdict::Validated {
validation_success_count += 1;
}
token_metrics.push(TokenUsageMetrics {
average_input_tokens: validation_result.tokens_used.input_tokens as f64,
average_output_tokens: validation_result.tokens_used.output_tokens as f64,
average_total_tokens: validation_result.tokens_used.total_tokens as f64,
average_cost_usd: validation_result.tokens_used.cost_usd,
token_per_second: validation_result.performance.tokens_per_second,
});
self.analyze_findings(&validation_result, &mut findings_stats);
}
}
}
        // Guard against division by zero when `iterations` is configured as 0.
        let completed = durations.len().max(1) as f64;
        let average_duration = durations.iter().sum::<f64>() / completed;
        let average_confidence = confidences.iter().sum::<f64>() / completed;
        let average_validation_confidence = if validation_confidences.is_empty() {
            0.0
        } else {
            validation_confidences.iter().sum::<f64>() / validation_confidences.len() as f64
        };
        let iteration_count = self.config.iterations.max(1) as f64;
        let success_rate = success_count as f64 / iteration_count;
        let validation_success_rate = validation_success_count as f64 / iteration_count;
let token_usage = if token_metrics.is_empty() {
TokenUsageMetrics::default()
} else {
TokenUsageMetrics {
average_input_tokens: token_metrics
.iter()
.map(|m| m.average_input_tokens)
.sum::<f64>()
/ token_metrics.len() as f64,
average_output_tokens: token_metrics
.iter()
.map(|m| m.average_output_tokens)
.sum::<f64>()
/ token_metrics.len() as f64,
average_total_tokens: token_metrics
.iter()
.map(|m| m.average_total_tokens)
.sum::<f64>()
/ token_metrics.len() as f64,
average_cost_usd: token_metrics
.iter()
.map(|m| m.average_cost_usd)
.sum::<f64>()
/ token_metrics.len() as f64,
token_per_second: token_metrics
.iter()
.map(|m| m.token_per_second)
.sum::<f64>()
/ token_metrics.len() as f64,
}
};
let validation_findings = findings_stats
.into_iter()
.map(|(category, stats)| ValidationFindingStat {
finding_category: category,
average_severity: stats.average_severity / stats.count as f64,
frequency: stats.count as f64 / self.config.iterations as f64,
average_confidence_impact: stats.total_confidence_impact / stats.count as f64,
})
.collect();
Ok(ScenarioResult {
scenario,
validation_level,
iterations: self.config.iterations,
average_duration_ms: average_duration,
average_confidence,
average_validation_confidence,
success_rate,
validation_success_rate,
token_usage,
validation_findings,
})
}
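    /// Runs a single iteration through the validating executor using the "balanced" profile.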
async fn execute_validation_run(
&self,
input: &ProtocolInput,
validation_level: ValidationLevel,
) -> Result<ProtocolOutput> {
let config = ValidationExecutorConfig {
validation_level,
..Default::default()
};
let executor = ValidatingProtocolExecutor::with_configs(Default::default(), config)?;
executor
.execute_profile_with_validation("balanced", input.clone())
.await
}
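    /// Builds the canned query for a scenario, tagging it with the iteration number so
    /// repeated runs produce distinct inputs.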
fn generate_scenario_input(
&self,
scenario: BenchmarkScenario,
iteration: usize,
) -> ProtocolInput {
match scenario {
BenchmarkScenario::BusinessDecision => ProtocolInput::query(format!(
"Should we expand to the European market? Consider market size, competition, regulatory requirements, and potential ROI. Iteration: {}",
iteration
)),
BenchmarkScenario::TechnicalArchitecture => ProtocolInput::query(format!(
"Evaluate microservices vs monolithic architecture for a 10,000 user SaaS application. Consider scalability, maintainability, deployment complexity. Iteration: {}",
iteration
)),
BenchmarkScenario::ComplianceAnalysis => ProtocolInput::query(format!(
"Analyze GDPR compliance requirements for a customer analytics platform processing EU citizen data. Iteration: {}",
iteration
)),
BenchmarkScenario::RiskAssessment => ProtocolInput::query(format!(
"Assess cybersecurity risks for a cloud-based financial application handling sensitive customer data. Iteration: {}",
iteration
)),
BenchmarkScenario::StrategicPlanning => ProtocolInput::query(format!(
"Develop a 5-year strategic plan for a technology startup in the AI infrastructure space. Iteration: {}",
iteration
)),
BenchmarkScenario::MultiPerspectiveAnalysis => ProtocolInput::query(format!(
"Analyze the impact of remote work policies from technical, cultural, productivity, and security perspectives. Iteration: {}",
iteration
)),
}
}
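    /// Folds one validation result's findings into the per-category accumulators.
    /// Severity is mapped to a 1.0 (Info) to 5.0 (Critical) scale.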
fn analyze_findings(
&self,
validation_result: &DeepSeekValidationResult,
findings_stats: &mut HashMap<String, FindingStatsAccumulator>,
) {
for finding in &validation_result.findings {
let category = format!("{:?}", finding.category);
let stats = findings_stats.entry(category).or_default();
stats.count += 1;
stats.average_severity += match finding.severity {
super::validation::Severity::Critical => 5.0,
super::validation::Severity::High => 4.0,
super::validation::Severity::Medium => 3.0,
super::validation::Severity::Low => 2.0,
super::validation::Severity::Info => 1.0,
};
stats.total_confidence_impact += validation_result.validation_confidence;
}
}
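    /// Rolls all scenario results up into a single summary with overall success rate,
    /// average confidence gain, cost per validation, and human-readable recommendations.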
fn calculate_summary(
&self,
scenario_results: &HashMap<String, ScenarioResult>,
total_duration: f64,
total_iterations: usize,
) -> BenchmarkSummary {
let mut total_success = 0.0_f64;
let mut total_confidence_gain = 0.0;
let mut total_cost = 0.0;
for result in scenario_results.values() {
total_success += result.success_rate * result.iterations as f64;
total_confidence_gain +=
result.average_validation_confidence - result.average_confidence;
total_cost += result.token_usage.average_cost_usd * result.iterations as f64;
}
        let overall_success_rate = if total_iterations == 0 {
            0.0
        } else {
            total_success / total_iterations as f64
        };
        // Guard the remaining averages against empty results / zero iterations as well.
        let average_confidence_gain =
            total_confidence_gain / scenario_results.len().max(1) as f64;
        let cost_per_validation = total_cost / total_iterations.max(1) as f64;
let mut performance_improvements = Vec::new();
let mut recommendations = Vec::new();
if average_confidence_gain > 0.0 {
performance_improvements.push(format!(
"Average confidence improvement: +{:.1}%",
average_confidence_gain * 100.0
));
}
        if cost_per_validation > 0.0 && cost_per_validation < 0.05 {
performance_improvements.push(format!(
"Cost-effective validation: ${:.3} per analysis",
cost_per_validation
));
}
if overall_success_rate > 0.85 {
recommendations.push("Ready for production deployment".to_string());
} else if overall_success_rate >= 0.70 {
recommendations.push("Suitable for development and testing".to_string());
} else {
recommendations.push("Further optimization recommended".to_string());
}
BenchmarkSummary {
total_duration_seconds: total_duration,
total_iterations,
overall_success_rate,
average_confidence_gain,
cost_per_validation,
performance_improvements,
recommendations,
}
}
}
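/// Intermediate accumulator used by `analyze_findings`. Note that `average_severity`
/// holds a running severity *sum* here; it is only divided by `count` when the final
/// `ValidationFindingStat` is built in `run_scenario`.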
#[derive(Debug, Clone, Default)]
struct FindingStatsAccumulator {
count: usize,
average_severity: f64,
total_confidence_impact: f64,
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
#[test]
fn test_benchmark_config_default() {
let config = BenchmarkConfig::default();
assert_eq!(config.iterations, 10);
assert!(config.enable_statistics);
assert_eq!(config.timeout_secs, 300);
assert_eq!(config.scenarios.len(), 4);
assert!(config
.scenarios
.contains(&BenchmarkScenario::BusinessDecision));
assert!(config
.scenarios
.contains(&BenchmarkScenario::TechnicalArchitecture));
assert!(config
.scenarios
.contains(&BenchmarkScenario::ComplianceAnalysis));
assert!(config
.scenarios
.contains(&BenchmarkScenario::RiskAssessment));
assert_eq!(config.validation_levels.len(), 4);
assert!(config.validation_levels.contains(&ValidationLevel::None));
assert!(config.validation_levels.contains(&ValidationLevel::Quick));
assert!(config
.validation_levels
.contains(&ValidationLevel::Standard));
assert!(config
.validation_levels
.contains(&ValidationLevel::Rigorous));
}
#[test]
fn test_benchmark_config_custom() {
let config = BenchmarkConfig {
scenarios: vec![BenchmarkScenario::BusinessDecision],
iterations: 5,
validation_levels: vec![ValidationLevel::Standard],
enable_statistics: false,
timeout_secs: 60,
};
assert_eq!(config.scenarios.len(), 1);
assert_eq!(config.iterations, 5);
assert_eq!(config.validation_levels.len(), 1);
assert!(!config.enable_statistics);
assert_eq!(config.timeout_secs, 60);
}
#[test]
fn test_benchmark_config_serialization() {
let config = BenchmarkConfig::default();
let json = serde_json::to_string(&config).expect("Failed to serialize BenchmarkConfig");
assert!(json.contains("\"iterations\":10"));
assert!(json.contains("\"enable_statistics\":true"));
let deserialized: BenchmarkConfig =
serde_json::from_str(&json).expect("Failed to deserialize BenchmarkConfig");
assert_eq!(deserialized.iterations, config.iterations);
assert_eq!(deserialized.timeout_secs, config.timeout_secs);
}
#[test]
fn test_benchmark_scenario_enum_variants() {
let scenarios = [
BenchmarkScenario::BusinessDecision,
BenchmarkScenario::TechnicalArchitecture,
BenchmarkScenario::ComplianceAnalysis,
BenchmarkScenario::RiskAssessment,
BenchmarkScenario::StrategicPlanning,
BenchmarkScenario::MultiPerspectiveAnalysis,
];
assert_eq!(scenarios.len(), 6);
}
#[test]
fn test_benchmark_scenario_equality() {
assert_eq!(
BenchmarkScenario::BusinessDecision,
BenchmarkScenario::BusinessDecision
);
assert_ne!(
BenchmarkScenario::BusinessDecision,
BenchmarkScenario::TechnicalArchitecture
);
}
#[test]
fn test_benchmark_scenario_serialization() {
let scenario = BenchmarkScenario::BusinessDecision;
let json = serde_json::to_string(&scenario).expect("Failed to serialize scenario");
assert_eq!(json, "\"business_decision\"");
let deserialized: BenchmarkScenario =
serde_json::from_str(&json).expect("Failed to deserialize scenario");
assert_eq!(deserialized, scenario);
}
#[test]
fn test_all_scenarios_serialize_correctly() {
let test_cases = vec![
(BenchmarkScenario::BusinessDecision, "\"business_decision\""),
(
BenchmarkScenario::TechnicalArchitecture,
"\"technical_architecture\"",
),
(
BenchmarkScenario::ComplianceAnalysis,
"\"compliance_analysis\"",
),
(BenchmarkScenario::RiskAssessment, "\"risk_assessment\""),
(
BenchmarkScenario::StrategicPlanning,
"\"strategic_planning\"",
),
(
BenchmarkScenario::MultiPerspectiveAnalysis,
"\"multi_perspective_analysis\"",
),
];
for (scenario, expected_json) in test_cases {
let json = serde_json::to_string(&scenario).expect("Failed to serialize");
assert_eq!(
json, expected_json,
"Scenario {:?} serialization mismatch",
scenario
);
}
}
#[test]
fn test_scenario_input_generation_business_decision() {
let runner = DeepSeekBenchmarkRunner::default();
let input = runner.generate_scenario_input(BenchmarkScenario::BusinessDecision, 0);
let query = input.get_str("query").expect("Query field missing");
assert!(query.contains("European market"));
assert!(query.contains("Iteration: 0"));
}
#[test]
fn test_scenario_input_generation_technical_architecture() {
let runner = DeepSeekBenchmarkRunner::default();
let input = runner.generate_scenario_input(BenchmarkScenario::TechnicalArchitecture, 5);
let query = input.get_str("query").expect("Query field missing");
assert!(query.contains("microservices"));
assert!(query.contains("monolithic"));
assert!(query.contains("Iteration: 5"));
}
#[test]
fn test_scenario_input_generation_compliance_analysis() {
let runner = DeepSeekBenchmarkRunner::default();
let input = runner.generate_scenario_input(BenchmarkScenario::ComplianceAnalysis, 3);
let query = input.get_str("query").expect("Query field missing");
assert!(query.contains("GDPR"));
assert!(query.contains("EU citizen"));
assert!(query.contains("Iteration: 3"));
}
#[test]
fn test_scenario_input_generation_risk_assessment() {
let runner = DeepSeekBenchmarkRunner::default();
let input = runner.generate_scenario_input(BenchmarkScenario::RiskAssessment, 7);
let query = input.get_str("query").expect("Query field missing");
assert!(query.contains("cybersecurity"));
assert!(query.contains("financial application"));
assert!(query.contains("Iteration: 7"));
}
#[test]
fn test_scenario_input_generation_strategic_planning() {
let runner = DeepSeekBenchmarkRunner::default();
let input = runner.generate_scenario_input(BenchmarkScenario::StrategicPlanning, 2);
let query = input.get_str("query").expect("Query field missing");
assert!(query.contains("5-year strategic plan"));
assert!(query.contains("AI infrastructure"));
assert!(query.contains("Iteration: 2"));
}
#[test]
fn test_scenario_input_generation_multi_perspective() {
let runner = DeepSeekBenchmarkRunner::default();
let input = runner.generate_scenario_input(BenchmarkScenario::MultiPerspectiveAnalysis, 9);
let query = input.get_str("query").expect("Query field missing");
assert!(query.contains("remote work"));
assert!(query.contains("technical"));
assert!(query.contains("cultural"));
assert!(query.contains("Iteration: 9"));
}
#[test]
fn test_scenario_input_iteration_uniqueness() {
let runner = DeepSeekBenchmarkRunner::default();
let input0 = runner.generate_scenario_input(BenchmarkScenario::BusinessDecision, 0);
let input1 = runner.generate_scenario_input(BenchmarkScenario::BusinessDecision, 1);
let query0 = input0.get_str("query").unwrap();
let query1 = input1.get_str("query").unwrap();
assert_ne!(query0, query1);
assert!(query0.contains("Iteration: 0"));
assert!(query1.contains("Iteration: 1"));
}
#[test]
fn test_token_usage_metrics_default() {
let metrics = TokenUsageMetrics::default();
assert_eq!(metrics.average_input_tokens, 0.0);
assert_eq!(metrics.average_output_tokens, 0.0);
assert_eq!(metrics.average_total_tokens, 0.0);
assert_eq!(metrics.average_cost_usd, 0.0);
assert_eq!(metrics.token_per_second, 0.0);
}
#[test]
fn test_token_usage_metrics_custom() {
let metrics = TokenUsageMetrics {
average_input_tokens: 100.0,
average_output_tokens: 200.0,
average_total_tokens: 300.0,
average_cost_usd: 0.005,
token_per_second: 50.0,
};
assert_eq!(metrics.average_input_tokens, 100.0);
assert_eq!(metrics.average_output_tokens, 200.0);
assert_eq!(metrics.average_total_tokens, 300.0);
assert_eq!(metrics.average_cost_usd, 0.005);
assert_eq!(metrics.token_per_second, 50.0);
}
#[test]
fn test_token_usage_metrics_serialization() {
let metrics = TokenUsageMetrics {
average_input_tokens: 150.5,
average_output_tokens: 250.5,
average_total_tokens: 401.0,
average_cost_usd: 0.0075,
token_per_second: 100.0,
};
let json = serde_json::to_string(&metrics).expect("Failed to serialize");
let deserialized: TokenUsageMetrics =
serde_json::from_str(&json).expect("Failed to deserialize");
assert_eq!(
deserialized.average_input_tokens,
metrics.average_input_tokens
);
assert_eq!(
deserialized.average_output_tokens,
metrics.average_output_tokens
);
assert_eq!(
deserialized.average_total_tokens,
metrics.average_total_tokens
);
assert!((deserialized.average_cost_usd - metrics.average_cost_usd).abs() < 0.0001);
assert_eq!(deserialized.token_per_second, metrics.token_per_second);
}
#[test]
fn test_validation_finding_stat_creation() {
let stat = ValidationFindingStat {
finding_category: "LogicalFlow".to_string(),
average_severity: 3.5,
frequency: 0.25,
average_confidence_impact: 0.85,
};
assert_eq!(stat.finding_category, "LogicalFlow");
assert_eq!(stat.average_severity, 3.5);
assert_eq!(stat.frequency, 0.25);
assert_eq!(stat.average_confidence_impact, 0.85);
}
#[test]
fn test_validation_finding_stat_serialization() {
let stat = ValidationFindingStat {
finding_category: "Compliance".to_string(),
average_severity: 4.0,
frequency: 0.5,
average_confidence_impact: 0.75,
};
let json = serde_json::to_string(&stat).expect("Failed to serialize");
assert!(json.contains("\"finding_category\":\"Compliance\""));
assert!(json.contains("\"average_severity\":4.0"));
let deserialized: ValidationFindingStat =
serde_json::from_str(&json).expect("Failed to deserialize");
assert_eq!(deserialized.finding_category, stat.finding_category);
}
#[test]
fn test_scenario_result_creation() {
let result = ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 500.0,
average_confidence: 0.85,
average_validation_confidence: 0.90,
success_rate: 0.95,
validation_success_rate: 0.80,
token_usage: TokenUsageMetrics::default(),
validation_findings: vec![],
};
assert_eq!(result.scenario, BenchmarkScenario::BusinessDecision);
assert_eq!(result.validation_level, ValidationLevel::Standard);
assert_eq!(result.iterations, 10);
assert_eq!(result.average_duration_ms, 500.0);
assert_eq!(result.average_confidence, 0.85);
assert_eq!(result.success_rate, 0.95);
}
#[test]
fn test_scenario_result_with_findings() {
let findings = vec![
ValidationFindingStat {
finding_category: "LogicalFlow".to_string(),
average_severity: 2.5,
frequency: 0.3,
average_confidence_impact: 0.8,
},
ValidationFindingStat {
finding_category: "Compliance".to_string(),
average_severity: 4.0,
frequency: 0.1,
average_confidence_impact: 0.9,
},
];
let result = ScenarioResult {
scenario: BenchmarkScenario::ComplianceAnalysis,
validation_level: ValidationLevel::Rigorous,
iterations: 5,
average_duration_ms: 1000.0,
average_confidence: 0.75,
average_validation_confidence: 0.88,
success_rate: 0.80,
validation_success_rate: 0.60,
token_usage: TokenUsageMetrics {
average_input_tokens: 200.0,
average_output_tokens: 400.0,
average_total_tokens: 600.0,
average_cost_usd: 0.01,
token_per_second: 75.0,
},
validation_findings: findings,
};
assert_eq!(result.validation_findings.len(), 2);
assert_eq!(
result.validation_findings[0].finding_category,
"LogicalFlow"
);
}
#[test]
fn test_benchmark_summary_creation() {
let summary = BenchmarkSummary {
total_duration_seconds: 120.5,
total_iterations: 100,
overall_success_rate: 0.92,
average_confidence_gain: 0.05,
cost_per_validation: 0.025,
performance_improvements: vec!["Improved by 10%".to_string()],
recommendations: vec!["Ready for production".to_string()],
};
assert_eq!(summary.total_duration_seconds, 120.5);
assert_eq!(summary.total_iterations, 100);
assert_eq!(summary.overall_success_rate, 0.92);
assert_eq!(summary.average_confidence_gain, 0.05);
assert_eq!(summary.cost_per_validation, 0.025);
assert_eq!(summary.performance_improvements.len(), 1);
assert_eq!(summary.recommendations.len(), 1);
}
#[test]
fn test_benchmark_summary_serialization() {
let summary = BenchmarkSummary {
total_duration_seconds: 60.0,
total_iterations: 50,
overall_success_rate: 0.88,
average_confidence_gain: 0.03,
cost_per_validation: 0.015,
performance_improvements: vec![],
recommendations: vec!["Test recommendation".to_string()],
};
let json = serde_json::to_string(&summary).expect("Failed to serialize");
let deserialized: BenchmarkSummary =
serde_json::from_str(&json).expect("Failed to deserialize");
assert_eq!(deserialized.total_iterations, summary.total_iterations);
assert_eq!(
deserialized.overall_success_rate,
summary.overall_success_rate
);
}
#[test]
fn test_benchmark_results_creation() {
let config = BenchmarkConfig::default();
let summary = BenchmarkSummary {
total_duration_seconds: 300.0,
total_iterations: 160,
overall_success_rate: 0.90,
average_confidence_gain: 0.04,
cost_per_validation: 0.02,
performance_improvements: vec![],
recommendations: vec!["Production ready".to_string()],
};
let results = BenchmarkResults {
config: config.clone(),
scenario_results: HashMap::new(),
summary,
timestamp: chrono::Utc::now(),
version: "0.1.0".to_string(),
};
assert_eq!(results.config.iterations, config.iterations);
assert!(results.scenario_results.is_empty());
assert_eq!(results.summary.total_iterations, 160);
assert!(!results.version.is_empty());
}
#[test]
fn test_benchmark_results_with_scenario_results() {
let mut scenario_results = HashMap::new();
scenario_results.insert(
"BusinessDecision_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 450.0,
average_confidence: 0.82,
average_validation_confidence: 0.87,
success_rate: 0.90,
validation_success_rate: 0.70,
token_usage: TokenUsageMetrics::default(),
validation_findings: vec![],
},
);
let results = BenchmarkResults {
config: BenchmarkConfig::default(),
scenario_results,
summary: BenchmarkSummary {
total_duration_seconds: 45.0,
total_iterations: 10,
overall_success_rate: 0.90,
average_confidence_gain: 0.05,
cost_per_validation: 0.0,
performance_improvements: vec![],
recommendations: vec![],
},
timestamp: chrono::Utc::now(),
version: "0.1.0".to_string(),
};
assert_eq!(results.scenario_results.len(), 1);
assert!(results
.scenario_results
.contains_key("BusinessDecision_Standard"));
}
#[test]
fn test_benchmark_runner_default() {
let runner = DeepSeekBenchmarkRunner::default();
assert_eq!(runner.config.iterations, 10);
assert!(runner.config.enable_statistics);
}
#[test]
fn test_benchmark_runner_with_custom_config() {
let config = BenchmarkConfig {
scenarios: vec![BenchmarkScenario::RiskAssessment],
iterations: 3,
validation_levels: vec![ValidationLevel::Quick],
enable_statistics: false,
timeout_secs: 30,
};
let runner = DeepSeekBenchmarkRunner::new(config);
assert_eq!(runner.config.scenarios.len(), 1);
assert_eq!(runner.config.iterations, 3);
assert!(!runner.config.enable_statistics);
}
#[test]
fn test_calculate_summary_high_success_rate() {
let runner = DeepSeekBenchmarkRunner::default();
let mut scenario_results = HashMap::new();
scenario_results.insert(
"Test_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 100.0,
average_confidence: 0.80,
average_validation_confidence: 0.85,
success_rate: 0.90,
validation_success_rate: 0.80,
token_usage: TokenUsageMetrics {
average_input_tokens: 100.0,
average_output_tokens: 200.0,
average_total_tokens: 300.0,
average_cost_usd: 0.003,
token_per_second: 100.0,
},
validation_findings: vec![],
},
);
let summary = runner.calculate_summary(&scenario_results, 10.0, 10);
assert!(summary.overall_success_rate > 0.85);
assert!(summary
.recommendations
.iter()
.any(|r| r.contains("production")));
}
#[test]
fn test_calculate_summary_medium_success_rate() {
let runner = DeepSeekBenchmarkRunner::default();
let mut scenario_results = HashMap::new();
scenario_results.insert(
"Test_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 100.0,
average_confidence: 0.70,
average_validation_confidence: 0.72,
success_rate: 0.75,
validation_success_rate: 0.60,
token_usage: TokenUsageMetrics::default(),
validation_findings: vec![],
},
);
let summary = runner.calculate_summary(&scenario_results, 10.0, 10);
assert!(summary.overall_success_rate >= 0.70);
assert!(summary.overall_success_rate <= 0.85);
assert!(summary
.recommendations
.iter()
.any(|r| r.contains("development") || r.contains("testing")));
}
#[test]
fn test_calculate_summary_low_success_rate() {
let runner = DeepSeekBenchmarkRunner::default();
let mut scenario_results = HashMap::new();
scenario_results.insert(
"Test_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 100.0,
average_confidence: 0.50,
average_validation_confidence: 0.55,
success_rate: 0.60,
validation_success_rate: 0.40,
token_usage: TokenUsageMetrics::default(),
validation_findings: vec![],
},
);
let summary = runner.calculate_summary(&scenario_results, 10.0, 10);
assert!(summary.overall_success_rate < 0.70);
assert!(summary
.recommendations
.iter()
.any(|r| r.contains("optimization")));
}
#[test]
fn test_calculate_summary_confidence_gain() {
let runner = DeepSeekBenchmarkRunner::default();
let mut scenario_results = HashMap::new();
scenario_results.insert(
"Test_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 100.0,
average_confidence: 0.70,
                average_validation_confidence: 0.85,
                success_rate: 1.0,
validation_success_rate: 1.0,
token_usage: TokenUsageMetrics::default(),
validation_findings: vec![],
},
);
let summary = runner.calculate_summary(&scenario_results, 10.0, 10);
assert!(summary.average_confidence_gain > 0.0);
assert!(summary
.performance_improvements
.iter()
.any(|p| p.contains("confidence improvement")));
}
#[test]
fn test_calculate_summary_cost_effective() {
let runner = DeepSeekBenchmarkRunner::default();
let mut scenario_results = HashMap::new();
scenario_results.insert(
"Test_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 100.0,
average_confidence: 0.80,
average_validation_confidence: 0.85,
success_rate: 1.0,
validation_success_rate: 1.0,
token_usage: TokenUsageMetrics {
average_input_tokens: 100.0,
average_output_tokens: 200.0,
average_total_tokens: 300.0,
                    average_cost_usd: 0.001,
                    token_per_second: 100.0,
},
validation_findings: vec![],
},
);
let summary = runner.calculate_summary(&scenario_results, 10.0, 10);
assert!(summary.cost_per_validation < 0.05);
assert!(summary
.performance_improvements
.iter()
.any(|p| p.contains("Cost-effective")));
}
#[test]
fn test_calculate_summary_multiple_scenarios() {
let runner = DeepSeekBenchmarkRunner::default();
let mut scenario_results = HashMap::new();
scenario_results.insert(
"Business_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 100.0,
average_confidence: 0.80,
average_validation_confidence: 0.85,
success_rate: 1.0,
validation_success_rate: 0.90,
token_usage: TokenUsageMetrics {
average_input_tokens: 100.0,
average_output_tokens: 200.0,
average_total_tokens: 300.0,
average_cost_usd: 0.002,
token_per_second: 100.0,
},
validation_findings: vec![],
},
);
scenario_results.insert(
"Technical_Rigorous".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::TechnicalArchitecture,
validation_level: ValidationLevel::Rigorous,
iterations: 10,
average_duration_ms: 200.0,
average_confidence: 0.75,
average_validation_confidence: 0.82,
success_rate: 0.90,
validation_success_rate: 0.80,
token_usage: TokenUsageMetrics {
average_input_tokens: 150.0,
average_output_tokens: 250.0,
average_total_tokens: 400.0,
average_cost_usd: 0.003,
token_per_second: 80.0,
},
validation_findings: vec![],
},
);
let summary = runner.calculate_summary(&scenario_results, 30.0, 20);
assert_eq!(summary.total_iterations, 20);
assert_eq!(summary.total_duration_seconds, 30.0);
assert!(summary.overall_success_rate > 0.0);
assert!(summary.overall_success_rate <= 1.0);
}
#[test]
fn test_finding_stats_accumulator_default() {
let stats = FindingStatsAccumulator::default();
assert_eq!(stats.count, 0);
assert_eq!(stats.average_severity, 0.0);
assert_eq!(stats.total_confidence_impact, 0.0);
}
#[test]
fn test_finding_stats_accumulator_accumulation() {
let mut stats = FindingStatsAccumulator::default();
stats.count += 1;
stats.average_severity += 4.0;
stats.total_confidence_impact += 0.85;
assert_eq!(stats.count, 1);
assert_eq!(stats.average_severity, 4.0);
assert_eq!(stats.total_confidence_impact, 0.85);
stats.count += 1;
stats.average_severity += 2.0;
stats.total_confidence_impact += 0.90;
assert_eq!(stats.count, 2);
assert_eq!(stats.average_severity, 6.0);
assert_eq!(stats.total_confidence_impact, 1.75);
}
#[test]
fn test_benchmark_results_json_output_format() {
let config = BenchmarkConfig {
scenarios: vec![BenchmarkScenario::BusinessDecision],
iterations: 5,
validation_levels: vec![ValidationLevel::Standard],
enable_statistics: true,
timeout_secs: 60,
};
let mut scenario_results = HashMap::new();
scenario_results.insert(
"BusinessDecision_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 5,
average_duration_ms: 250.0,
average_confidence: 0.82,
average_validation_confidence: 0.88,
success_rate: 0.80,
validation_success_rate: 0.60,
token_usage: TokenUsageMetrics {
average_input_tokens: 120.0,
average_output_tokens: 280.0,
average_total_tokens: 400.0,
average_cost_usd: 0.0045,
token_per_second: 88.0,
},
validation_findings: vec![ValidationFindingStat {
finding_category: "LogicalFlow".to_string(),
average_severity: 2.0,
frequency: 0.4,
average_confidence_impact: 0.85,
}],
},
);
let results = BenchmarkResults {
config,
scenario_results,
summary: BenchmarkSummary {
total_duration_seconds: 12.5,
total_iterations: 5,
overall_success_rate: 0.80,
average_confidence_gain: 0.06,
cost_per_validation: 0.0045,
performance_improvements: vec![
"Average confidence improvement: +6.0%".to_string(),
"Cost-effective validation: $0.005 per analysis".to_string(),
],
recommendations: vec!["Suitable for development and testing".to_string()],
},
timestamp: chrono::Utc::now(),
version: "0.1.0".to_string(),
};
let json = serde_json::to_string_pretty(&results).expect("Failed to serialize results");
assert!(json.contains("\"config\""));
assert!(json.contains("\"scenario_results\""));
assert!(json.contains("\"summary\""));
assert!(json.contains("\"timestamp\""));
assert!(json.contains("\"version\""));
assert!(json.contains("\"BusinessDecision_Standard\""));
assert!(json.contains("\"average_duration_ms\""));
assert!(json.contains("\"token_usage\""));
assert!(json.contains("\"validation_findings\""));
assert!(json.contains("\"performance_improvements\""));
assert!(json.contains("\"recommendations\""));
}
#[test]
fn test_scenario_result_key_format() {
let scenario = BenchmarkScenario::TechnicalArchitecture;
let level = ValidationLevel::Rigorous;
let key = format!("{:?}_{:?}", scenario, level);
assert_eq!(key, "TechnicalArchitecture_Rigorous");
}
#[test]
    fn test_empty_scenario_results_summary() {
        let runner = DeepSeekBenchmarkRunner::default();
        let scenario_results: HashMap<String, ScenarioResult> = HashMap::new();
        let summary = runner.calculate_summary(&scenario_results, 0.0, 0);
        assert_eq!(summary.total_iterations, 0);
        assert_eq!(summary.overall_success_rate, 0.0);
        assert!(!summary.recommendations.is_empty());
    }
#[test]
fn test_zero_iterations_config() {
let config = BenchmarkConfig {
scenarios: vec![BenchmarkScenario::BusinessDecision],
iterations: 0,
validation_levels: vec![ValidationLevel::Standard],
enable_statistics: true,
timeout_secs: 60,
};
assert_eq!(config.iterations, 0);
}
#[test]
fn test_empty_scenarios_config() {
let config = BenchmarkConfig {
scenarios: vec![],
iterations: 10,
validation_levels: vec![ValidationLevel::Standard],
enable_statistics: true,
timeout_secs: 60,
};
assert!(config.scenarios.is_empty());
}
#[test]
fn test_empty_validation_levels_config() {
let config = BenchmarkConfig {
scenarios: vec![BenchmarkScenario::BusinessDecision],
iterations: 10,
validation_levels: vec![],
enable_statistics: true,
timeout_secs: 60,
};
assert!(config.validation_levels.is_empty());
}
#[test]
fn test_large_iteration_count() {
let config = BenchmarkConfig {
scenarios: vec![BenchmarkScenario::BusinessDecision],
iterations: 10000,
validation_levels: vec![ValidationLevel::Standard],
enable_statistics: true,
timeout_secs: 3600,
};
assert_eq!(config.iterations, 10000);
}
#[test]
fn test_perfect_success_rate_summary() {
let runner = DeepSeekBenchmarkRunner::default();
let mut scenario_results = HashMap::new();
scenario_results.insert(
"Test_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 100.0,
average_confidence: 0.95,
average_validation_confidence: 0.98,
                success_rate: 1.0,
                validation_success_rate: 1.0,
token_usage: TokenUsageMetrics::default(),
validation_findings: vec![],
},
);
let summary = runner.calculate_summary(&scenario_results, 10.0, 10);
assert_eq!(summary.overall_success_rate, 1.0);
assert!(summary
.recommendations
.iter()
.any(|r| r.contains("production")));
}
#[test]
fn test_zero_success_rate_summary() {
let runner = DeepSeekBenchmarkRunner::default();
let mut scenario_results = HashMap::new();
scenario_results.insert(
"Test_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 100.0,
average_confidence: 0.30,
average_validation_confidence: 0.25,
                success_rate: 0.0,
                validation_success_rate: 0.0,
token_usage: TokenUsageMetrics::default(),
validation_findings: vec![],
},
);
let summary = runner.calculate_summary(&scenario_results, 10.0, 10);
assert_eq!(summary.overall_success_rate, 0.0);
assert!(summary
.recommendations
.iter()
.any(|r| r.contains("optimization")));
}
#[test]
fn test_negative_confidence_gain() {
let runner = DeepSeekBenchmarkRunner::default();
let mut scenario_results = HashMap::new();
scenario_results.insert(
"Test_Standard".to_string(),
ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 100.0,
average_confidence: 0.85,
                average_validation_confidence: 0.70,
                success_rate: 1.0,
validation_success_rate: 1.0,
token_usage: TokenUsageMetrics::default(),
validation_findings: vec![],
},
);
let summary = runner.calculate_summary(&scenario_results, 10.0, 10);
assert!(summary.average_confidence_gain < 0.0);
assert!(!summary
.performance_improvements
.iter()
.any(|p| p.contains("confidence improvement")));
}
#[test]
fn test_benchmark_config_clone() {
let config = BenchmarkConfig::default();
let cloned = config.clone();
assert_eq!(config.iterations, cloned.iterations);
assert_eq!(config.scenarios.len(), cloned.scenarios.len());
}
#[test]
fn test_benchmark_scenario_copy() {
let scenario = BenchmarkScenario::BusinessDecision;
let copied = scenario;
assert_eq!(scenario, copied);
}
#[test]
fn test_scenario_result_clone() {
let result = ScenarioResult {
scenario: BenchmarkScenario::BusinessDecision,
validation_level: ValidationLevel::Standard,
iterations: 10,
average_duration_ms: 100.0,
average_confidence: 0.80,
average_validation_confidence: 0.85,
success_rate: 0.90,
validation_success_rate: 0.80,
token_usage: TokenUsageMetrics::default(),
validation_findings: vec![],
};
let cloned = result.clone();
assert_eq!(result.scenario, cloned.scenario);
assert_eq!(result.iterations, cloned.iterations);
}
#[test]
fn test_debug_formatting() {
let config = BenchmarkConfig::default();
let debug_str = format!("{:?}", config);
assert!(debug_str.contains("BenchmarkConfig"));
assert!(debug_str.contains("iterations"));
}
#[test]
fn test_validation_level_variants() {
let levels = [
ValidationLevel::None,
ValidationLevel::Quick,
ValidationLevel::Standard,
ValidationLevel::Rigorous,
ValidationLevel::Paranoid,
];
assert_eq!(levels.len(), 5);
}
#[test]
fn test_validation_level_default() {
let level = ValidationLevel::default();
assert_eq!(level, ValidationLevel::Standard);
}
#[test]
fn test_validation_level_serialization() {
let level = ValidationLevel::Rigorous;
let json = serde_json::to_string(&level).expect("Failed to serialize");
assert_eq!(json, "\"rigorous\"");
}
}