voirs_cli/commands/accuracy.rs

//! Accuracy benchmarking command for VoiRS CLI.
//!
//! This module provides CLI commands for running comprehensive accuracy benchmarks
//! including CMU English phoneme tests, JVS Japanese mora tests, and Common Voice
//! multilingual evaluations.
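//!
//! # Example
//!
//! Illustrative invocations, assuming this `accuracy` subcommand is mounted on
//! the `voirs-cli` binary (the exact binary name and wiring may differ):
//!
//! ```text
//! voirs-cli accuracy run --languages en-US,ja --max-samples 100
//! voirs-cli accuracy dataset cmu --language en-us
//! voirs-cli accuracy report --input results.json --format html
//! ```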

use clap::{Args, Subcommand};
use std::path::PathBuf;
use tokio::time::Instant;

#[cfg(not(doctest))]
use async_trait::async_trait;

#[cfg(not(doctest))]
use voirs_evaluation::accuracy_benchmarks::{
    AccuracyBenchmarkConfig, AccuracyBenchmarkRunner, DatasetConfig, DatasetType, LanguageCode,
};

/// Accuracy benchmarking commands
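///
/// # Example
///
/// A minimal sketch of mounting this command on a top-level parser; the `Cli`
/// and `Commands` types here are hypothetical, not the real voirs-cli entry point:
///
/// ```ignore
/// #[derive(clap::Parser)]
/// struct Cli {
///     #[command(subcommand)]
///     command: Commands,
/// }
///
/// #[derive(clap::Subcommand)]
/// enum Commands {
///     Accuracy(AccuracyCommand),
/// }
/// ```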
#[derive(Debug, Clone, Args)]
pub struct AccuracyCommand {
    #[command(subcommand)]
    pub command: AccuracySubcommand,
}

/// Accuracy benchmarking subcommands
#[derive(Debug, Clone, Subcommand)]
pub enum AccuracySubcommand {
    /// Run comprehensive accuracy benchmarks
    Run(RunAccuracyArgs),
    /// Run specific dataset benchmark
    Dataset(DatasetAccuracyArgs),
    /// List available test datasets
    List(ListDatasetsArgs),
    /// Generate accuracy benchmark report
    Report(ReportArgs),
}

/// Arguments for running comprehensive accuracy benchmarks
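///
/// # Example
///
/// Illustrative invocation (binary name may differ):
///
/// ```text
/// voirs-cli accuracy run --output-dir ./bench --languages en-US,ja --max-samples 100
/// ```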
#[derive(Debug, Clone, Args)]
pub struct RunAccuracyArgs {
    /// Output directory for benchmark results
    #[arg(short, long, default_value = "/tmp/voirs_accuracy_benchmarks")]
    pub output_dir: PathBuf,

    /// Enable detailed per-case reporting
    #[arg(long, default_value = "true")]
    pub detailed: bool,

    /// Maximum processing time per sample (seconds)
    #[arg(long, default_value = "10.0")]
    pub max_time: f64,

    /// Include only specific languages (comma-separated)
    #[arg(long)]
    pub languages: Option<String>,

    /// Custom dataset file path
    #[arg(long)]
    pub custom_dataset: Option<PathBuf>,

    /// Maximum samples per dataset (for faster testing)
    #[arg(long)]
    pub max_samples: Option<usize>,
}

/// Arguments for running specific dataset benchmarks
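///
/// # Example
///
/// Illustrative invocation (binary name may differ):
///
/// ```text
/// voirs-cli accuracy dataset jvs --language ja --max-samples 50
/// ```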
#[derive(Debug, Clone, Args)]
pub struct DatasetAccuracyArgs {
    /// Dataset type to benchmark
    #[arg(value_enum)]
    pub dataset: DatasetTypeArg,

    /// Language for the dataset
    #[arg(short, long, value_enum)]
    pub language: LanguageCodeArg,

    /// Custom data file path
    #[arg(short, long)]
    pub data_path: Option<PathBuf>,

    /// Target accuracy threshold
    #[arg(short, long)]
    pub target_accuracy: Option<f64>,

    /// Maximum number of test samples
    #[arg(short, long)]
    pub max_samples: Option<usize>,

    /// Output file for results
    #[arg(short, long)]
    pub output: Option<PathBuf>,
}

/// Arguments for listing available datasets
#[derive(Debug, Clone, Args)]
pub struct ListDatasetsArgs {
    /// Show detailed information about each dataset
    #[arg(long)]
    pub detailed: bool,

    /// Filter by language
    #[arg(short, long, value_enum)]
    pub language: Option<LanguageCodeArg>,
}

/// Arguments for generating accuracy reports
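///
/// # Example
///
/// Illustrative invocation (binary name may differ):
///
/// ```text
/// voirs-cli accuracy report --input results.json --format html --output report.html
/// ```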
#[derive(Debug, Clone, Args)]
pub struct ReportArgs {
    /// Input benchmark results file
    #[arg(short, long)]
    pub input: PathBuf,

    /// Output format (json, txt, html)
    #[arg(short, long, default_value = "txt")]
    pub format: String,

    /// Output file path
    #[arg(short, long)]
    pub output: Option<PathBuf>,
}

/// Dataset type argument for CLI
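///
/// Matched in kebab-case on the command line: `cmu`, `jvs`, `common-voice`, `custom`.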
#[derive(Debug, Clone, clap::ValueEnum)]
pub enum DatasetTypeArg {
    Cmu,
    Jvs,
    CommonVoice,
    Custom,
}

#[cfg(not(doctest))]
impl From<DatasetTypeArg> for DatasetType {
    fn from(arg: DatasetTypeArg) -> Self {
        match arg {
            DatasetTypeArg::Cmu => DatasetType::CMU,
            DatasetTypeArg::Jvs => DatasetType::JVS,
            DatasetTypeArg::CommonVoice => DatasetType::CommonVoice,
            DatasetTypeArg::Custom => DatasetType::Custom,
        }
    }
}

/// Language code argument for CLI
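///
/// Matched in kebab-case on the command line: `en-us`, `ja`, `es`, `fr`, `de`, `zh-cn`.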
#[derive(Debug, Clone, clap::ValueEnum)]
pub enum LanguageCodeArg {
    EnUs,
    Ja,
    Es,
    Fr,
    De,
    ZhCn,
}

#[cfg(not(doctest))]
impl From<LanguageCodeArg> for LanguageCode {
    fn from(arg: LanguageCodeArg) -> Self {
        match arg {
            LanguageCodeArg::EnUs => LanguageCode::EnUs,
            LanguageCodeArg::Ja => LanguageCode::Ja,
            LanguageCodeArg::Es => LanguageCode::Es,
            LanguageCodeArg::Fr => LanguageCode::Fr,
            LanguageCodeArg::De => LanguageCode::De,
            LanguageCodeArg::ZhCn => LanguageCode::ZhCn,
        }
    }
}

/// Execute accuracy benchmarking commands
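///
/// # Example
///
/// A minimal dispatch sketch from an async `main`; the surrounding `Cli` and
/// `Commands` types are hypothetical:
///
/// ```ignore
/// #[tokio::main]
/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
///     let cli = Cli::parse();
///     match cli.command {
///         Commands::Accuracy(args) => execute_accuracy_command(args).await?,
///     }
///     Ok(())
/// }
/// ```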
#[cfg(not(doctest))]
pub async fn execute_accuracy_command(
    args: AccuracyCommand,
) -> Result<(), Box<dyn std::error::Error>> {
    match args.command {
        AccuracySubcommand::Run(run_args) => run_comprehensive_benchmarks(run_args).await,
        AccuracySubcommand::Dataset(dataset_args) => run_dataset_benchmark(dataset_args).await,
        AccuracySubcommand::List(list_args) => list_available_datasets(list_args).await,
        AccuracySubcommand::Report(report_args) => generate_accuracy_report(report_args).await,
    }
}

/// Stub implementation for doctests
#[cfg(doctest)]
pub async fn execute_accuracy_command(
    _args: AccuracyCommand,
) -> Result<(), Box<dyn std::error::Error>> {
    Ok(())
}

/// Run comprehensive accuracy benchmarks
#[cfg(not(doctest))]
async fn run_comprehensive_benchmarks(
    args: RunAccuracyArgs,
) -> Result<(), Box<dyn std::error::Error>> {
    println!("šŸŽÆ VoiRS Comprehensive Accuracy Benchmarks");
    println!("==========================================\n");

    let start_time = Instant::now();

    // Configure benchmark
    let mut config = AccuracyBenchmarkConfig::default();
    config.output_dir = args.output_dir.to_string_lossy().to_string();
    config.detailed_reporting = args.detailed;
    config.max_processing_time = args.max_time;

    // Filter datasets by language if specified
    if let Some(languages_str) = &args.languages {
        let requested_languages: Vec<LanguageCode> = languages_str
            .split(',')
            .filter_map(|lang| match lang.trim() {
                "en-US" | "en" => Some(LanguageCode::EnUs),
                "ja" => Some(LanguageCode::Ja),
                "es" => Some(LanguageCode::Es),
                "fr" => Some(LanguageCode::Fr),
                "de" => Some(LanguageCode::De),
                "zh-CN" | "zh" => Some(LanguageCode::ZhCn),
                _ => None,
            })
            .collect();
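        // e.g. `--languages en-US,ja` yields [EnUs, Ja]; unrecognized codes are
        // silently dropped by the `filter_map` above.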

        config
            .datasets
            .retain(|dataset| requested_languages.contains(&dataset.language));

        println!(
            "šŸ“Š Running benchmarks for languages: {:?}",
            requested_languages
        );
    }

    // Add custom dataset if specified
    if let Some(custom_path) = &args.custom_dataset {
        let custom_config = DatasetConfig {
            name: "Custom_Dataset".to_string(),
            dataset_type: DatasetType::Custom,
            language: LanguageCode::EnUs, // Default, will be parsed from file
            data_path: custom_path.to_string_lossy().to_string(),
            target_accuracy: 0.90,
            max_samples: args.max_samples,
        };
        config.datasets.push(custom_config);
        println!("šŸ“ Added custom dataset: {}", custom_path.display());
    }

    // Override max samples if specified
    if let Some(max_samples) = args.max_samples {
        for dataset in &mut config.datasets {
            dataset.max_samples = Some(max_samples);
        }
        println!("šŸ“ Limited to {} samples per dataset", max_samples);
    }

    // Create and run benchmark runner
    let mut runner = AccuracyBenchmarkRunner::new(config);

    println!("šŸ”„ Loading test cases...");
    runner.load_test_cases().await?;

    println!("šŸš€ Running accuracy benchmarks...");

    // Note: In a real implementation, you would pass actual G2P/TTS/ASR systems here
    // For now, we'll use the simulation mode built into the benchmark runner
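    // A hypothetical call with real systems wired in might look like:
    //     runner
    //         .run_benchmarks(Some(&my_g2p), Some(&my_tts), Some(&my_asr))
    //         .await?;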
    let results = runner
        .run_benchmarks(
            None::<&DummyG2pSystem>,
            None::<&DummyTtsSystem>,
            None::<&DummyAsrSystem>,
        )
        .await?;

    let total_time = start_time.elapsed();

    // Display results summary
    println!(
        "\nāœ… Benchmark completed in {:.2} seconds",
        total_time.as_secs_f64()
    );
    println!("\nšŸ“Š ACCURACY BENCHMARK RESULTS");
    println!("{}", "=".repeat(50));

    println!("\nOverall Metrics:");
    println!(
        "  • Total test cases: {}",
        results.overall_metrics.total_cases
    );
    println!(
        "  • Overall phoneme accuracy: {:.2}%",
        results.overall_metrics.overall_phoneme_accuracy * 100.0
    );
    println!(
        "  • Overall word accuracy: {:.2}%",
        results.overall_metrics.overall_word_accuracy * 100.0
    );
    println!(
        "  • Targets met: {}/{} ({:.1}%)",
        results.overall_metrics.targets_met,
        results.overall_metrics.total_targets,
        results.overall_metrics.pass_rate
    );

    println!("\nLanguage-Specific Results:");
    for (language, accuracy) in &results.overall_metrics.language_accuracies {
        println!("  • {:?}: {:.2}%", language, accuracy * 100.0);
    }

    println!("\nDataset Results:");
    for (dataset_name, dataset_result) in &results.dataset_results {
        let status = if dataset_result.target_met {
            "āœ…"
        } else {
            "āŒ"
        };
        println!(
            "  {} {}: {:.2}% ({:.1}% target)",
            status,
            dataset_name,
            dataset_result.phoneme_accuracy * 100.0,
            dataset_result.target_accuracy * 100.0
        );
    }

    println!("\nPerformance Statistics:");
    println!(
        "  • Average processing time: {:.2} ms",
        results.performance_stats.avg_processing_time_ms
    );
    println!(
        "  • Throughput: {:.1} cases/sec",
        results.performance_stats.throughput_cases_per_sec
    );
    println!(
        "  • Peak memory usage: {:.1} MB",
        results.performance_stats.peak_memory_mb
    );

    println!("\nResults saved to: {}", args.output_dir.display());

    // Exit with the appropriate code: pass when at least 80% of targets were met
    if results.overall_metrics.pass_rate >= 80.0 {
        println!("\nšŸŽ‰ Accuracy pass-rate target (>= 80%) achieved!");
        std::process::exit(0);
    } else {
        println!("\nāš ļø  Some accuracy targets not met. See detailed report for recommendations.");
        std::process::exit(1);
    }
}

/// Run benchmark for specific dataset
#[cfg(not(doctest))]
async fn run_dataset_benchmark(
    args: DatasetAccuracyArgs,
) -> Result<(), Box<dyn std::error::Error>> {
    println!("šŸŽÆ Running {:?} Dataset Benchmark", args.dataset);
    println!("{}", "=".repeat(40));

    let dataset_config = DatasetConfig {
        name: format!("{:?}_Benchmark", args.dataset),
        dataset_type: args.dataset.into(),
        language: args.language.into(),
        data_path: args
            .data_path
            .map(|p| p.to_string_lossy().to_string())
            .unwrap_or_else(|| "tests/datasets/default.txt".to_string()),
        target_accuracy: args.target_accuracy.unwrap_or(0.90),
        max_samples: args.max_samples,
    };

    let config = AccuracyBenchmarkConfig {
        datasets: vec![dataset_config],
        detailed_reporting: true,
        ..Default::default()
    };

    let mut runner = AccuracyBenchmarkRunner::new(config);
    runner.load_test_cases().await?;

    let results = runner
        .run_benchmarks(
            None::<&DummyG2pSystem>,
            None::<&DummyTtsSystem>,
            None::<&DummyAsrSystem>,
        )
        .await?;

    // Display results
    for (dataset_name, dataset_result) in &results.dataset_results {
        println!("\nDataset: {}", dataset_name);
        println!("Language: {:?}", dataset_result.language);
        println!("Test cases: {}", dataset_result.total_cases);
        println!(
            "Phoneme accuracy: {:.2}%",
            dataset_result.phoneme_accuracy * 100.0
        );
        println!(
            "Word accuracy: {:.2}%",
            dataset_result.word_accuracy * 100.0
        );
        println!("Target: {:.1}%", dataset_result.target_accuracy * 100.0);
        println!(
            "Result: {}",
            if dataset_result.target_met {
                "āœ… PASS"
            } else {
                "āŒ FAIL"
            }
        );
    }

    Ok(())
}

/// List available datasets
#[cfg(not(doctest))]
async fn list_available_datasets(args: ListDatasetsArgs) -> Result<(), Box<dyn std::error::Error>> {
    println!("šŸ“‹ Available Accuracy Test Datasets");
    println!("{}", "=".repeat(40));

    let datasets = vec![
        (
            "CMU English Phoneme Test",
            DatasetType::CMU,
            LanguageCode::EnUs,
            0.95,
            "English phoneme accuracy using CMU pronunciation dictionary",
        ),
        (
            "JVS Japanese Mora Test",
            DatasetType::JVS,
            LanguageCode::Ja,
            0.90,
            "Japanese mora accuracy using JVS speech corpus",
        ),
        (
            "Common Voice Spanish",
            DatasetType::CommonVoice,
            LanguageCode::Es,
            0.88,
            "Spanish pronunciation from Mozilla Common Voice",
        ),
        (
            "Common Voice French",
            DatasetType::CommonVoice,
            LanguageCode::Fr,
            0.88,
            "French pronunciation from Mozilla Common Voice",
        ),
        (
            "Common Voice German",
            DatasetType::CommonVoice,
            LanguageCode::De,
            0.88,
            "German pronunciation from Mozilla Common Voice",
        ),
        (
            "Common Voice Chinese",
            DatasetType::CommonVoice,
            LanguageCode::ZhCn,
            0.85,
            "Mandarin Chinese from Mozilla Common Voice",
        ),
    ];

    for (name, dataset_type, language, target, description) in datasets {
        // Filter by language if specified
        if let Some(filter_lang) = &args.language {
            let filter_lang_code: LanguageCode = filter_lang.clone().into();
            if language != filter_lang_code {
                continue;
            }
        }

        println!("\nšŸ“Š {}", name);
        println!("   Type: {:?}", dataset_type);
        println!("   Language: {:?}", language);
        println!("   Target accuracy: {:.1}%", target * 100.0);

        if args.detailed {
            println!("   Description: {}", description);
            println!("   Status: Available");
        }
    }

    println!("\nTo run a specific dataset:");
    println!("  voirs-cli accuracy dataset <dataset_type> --language <lang>");
    println!("\nTo run all datasets:");
    println!("  voirs-cli accuracy run");

    Ok(())
}

/// Generate accuracy report from results file
#[cfg(not(doctest))]
async fn generate_accuracy_report(args: ReportArgs) -> Result<(), Box<dyn std::error::Error>> {
    println!(
        "šŸ“„ Generating accuracy report from: {}",
        args.input.display()
    );

    // Read and parse the JSON results file
    let contents = std::fs::read_to_string(&args.input)
        .map_err(|e| format!("Failed to read results file: {}", e))?;

    let results: voirs_evaluation::accuracy_benchmarks::AccuracyBenchmarkResults =
        serde_json::from_str(&contents)
            .map_err(|e| format!("Failed to parse results JSON: {}", e))?;

    // Generate report in requested format
    let report_content = match args.format.to_lowercase().as_str() {
        "json" => generate_json_report(&results)?,
        "txt" => generate_text_report(&results),
        "html" => generate_html_report(&results),
        _ => {
            return Err(format!(
                "Unsupported format: {}. Supported formats: json, txt, html",
                args.format
            )
            .into())
        }
    };

    // Write to output file or stdout
    match args.output {
        Some(output_path) => {
            std::fs::write(&output_path, report_content)
                .map_err(|e| format!("Failed to write report: {}", e))?;
            println!("āœ… Report generated: {}", output_path.display());
        }
        None => {
            println!("\n{}", report_content);
        }
    }

    Ok(())
}

/// Generate JSON format report (pretty-printed)
#[cfg(not(doctest))]
fn generate_json_report(
    results: &voirs_evaluation::accuracy_benchmarks::AccuracyBenchmarkResults,
) -> Result<String, Box<dyn std::error::Error>> {
    let json = serde_json::to_string_pretty(results)?;
    Ok(json)
}

/// Generate text format report
#[cfg(not(doctest))]
fn generate_text_report(
    results: &voirs_evaluation::accuracy_benchmarks::AccuracyBenchmarkResults,
) -> String {
    let mut report = String::new();

    // Header
    report.push_str("VoiRS Accuracy Benchmark Report\n");
    report.push_str(&"=".repeat(50));
    report.push_str("\n\n");

    // Timestamp and execution info
    report.push_str(&format!("Generated: {}\n", results.timestamp));
    report.push_str(&format!(
        "Total execution time: {:.2} seconds\n\n",
        results.total_time_seconds
    ));

    // Overall metrics
    report.push_str("OVERALL METRICS\n");
    report.push_str(&"-".repeat(30));
    report.push('\n');
    report.push_str(&format!(
        "Total test cases: {}\n",
        results.overall_metrics.total_cases
    ));
    report.push_str(&format!(
        "Overall phoneme accuracy: {:.2}%\n",
        results.overall_metrics.overall_phoneme_accuracy * 100.0
    ));
    report.push_str(&format!(
        "Overall word accuracy: {:.2}%\n",
        results.overall_metrics.overall_word_accuracy * 100.0
    ));
    report.push_str(&format!(
        "Targets met: {}/{} ({:.1}%)\n\n",
        results.overall_metrics.targets_met,
        results.overall_metrics.total_targets,
        results.overall_metrics.pass_rate
    ));

    // Language-specific results
    report.push_str("LANGUAGE-SPECIFIC RESULTS\n");
    report.push_str(&"-".repeat(30));
    report.push('\n');
    for (language, accuracy) in &results.overall_metrics.language_accuracies {
        report.push_str(&format!("{:?}: {:.2}%\n", language, accuracy * 100.0));
    }
    report.push('\n');

    // Dataset results
    report.push_str("DATASET RESULTS\n");
    report.push_str(&"-".repeat(30));
    report.push('\n');
    for (dataset_name, dataset_result) in &results.dataset_results {
        let status = if dataset_result.target_met {
            "āœ… PASS"
        } else {
            "āŒ FAIL"
        };
        report.push_str(&format!("Dataset: {}\n", dataset_name));
        report.push_str(&format!("  Status: {}\n", status));
        report.push_str(&format!("  Language: {:?}\n", dataset_result.language));
        report.push_str(&format!(
            "  Test cases: {} (Success: {}, Failed: {})\n",
            dataset_result.total_cases,
            dataset_result.successful_cases,
            dataset_result.failed_cases
        ));
        report.push_str(&format!(
            "  Phoneme accuracy: {:.2}%\n",
            dataset_result.phoneme_accuracy * 100.0
        ));
        report.push_str(&format!(
            "  Word accuracy: {:.2}%\n",
            dataset_result.word_accuracy * 100.0
        ));
        report.push_str(&format!(
            "  Target: {:.1}%\n",
            dataset_result.target_accuracy * 100.0
        ));
        report.push_str(&format!(
            "  Average edit distance: {:.2}\n",
            dataset_result.average_edit_distance
        ));
        report.push_str(&format!(
            "  Processing time: {:.2} ± {:.2} ms\n\n",
            dataset_result.processing_time_ms.mean_ms, dataset_result.processing_time_ms.std_dev_ms
        ));
    }

    // Performance statistics
    report.push_str("PERFORMANCE STATISTICS\n");
    report.push_str(&"-".repeat(30));
    report.push('\n');
    report.push_str(&format!(
        "Average processing time: {:.2} ms\n",
        results.performance_stats.avg_processing_time_ms
    ));
    report.push_str(&format!(
        "Median processing time: {:.2} ms\n",
        results.performance_stats.median_processing_time_ms
    ));
    report.push_str(&format!(
        "95th percentile: {:.2} ms\n",
        results.performance_stats.p95_processing_time_ms
    ));
    report.push_str(&format!(
        "Throughput: {:.1} cases/sec\n",
        results.performance_stats.throughput_cases_per_sec
    ));
    report.push_str(&format!(
        "Peak memory usage: {:.1} MB\n",
        results.performance_stats.peak_memory_mb
    ));

    report
}

/// Generate HTML format report
#[cfg(not(doctest))]
fn generate_html_report(
    results: &voirs_evaluation::accuracy_benchmarks::AccuracyBenchmarkResults,
) -> String {
    let mut html = String::new();

    // HTML header
    html.push_str("<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n");
    html.push_str("    <meta charset=\"UTF-8\">\n");
    html.push_str(
        "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n",
    );
    html.push_str("    <title>VoiRS Accuracy Benchmark Report</title>\n");
    html.push_str("    <style>\n");
    html.push_str("        body { font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }\n");
    html.push_str("        .container { max-width: 1200px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }\n");
    html.push_str(
        "        h1 { color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; }\n",
    );
    html.push_str("        h2 { color: #34495e; margin-top: 30px; }\n");
    html.push_str("        .metric { background: #ecf0f1; padding: 15px; margin: 10px 0; border-radius: 5px; }\n");
    html.push_str("        .pass { color: #27ae60; font-weight: bold; }\n");
    html.push_str("        .fail { color: #e74c3c; font-weight: bold; }\n");
    html.push_str("        table { width: 100%; border-collapse: collapse; margin: 20px 0; }\n");
    html.push_str(
        "        th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }\n",
    );
    html.push_str("        th { background-color: #3498db; color: white; }\n");
    html.push_str("        .timestamp { color: #7f8c8d; font-style: italic; }\n");
    html.push_str("    </style>\n");
    html.push_str("</head>\n<body>\n");
    html.push_str("    <div class=\"container\">\n");

    // Header
    html.push_str("        <h1>šŸŽÆ VoiRS Accuracy Benchmark Report</h1>\n");
    html.push_str(&format!(
        "        <p class=\"timestamp\">Generated: {}</p>\n",
        results.timestamp
    ));
    html.push_str(&format!(
        "        <p class=\"timestamp\">Execution time: {:.2} seconds</p>\n",
        results.total_time_seconds
    ));

    // Overall metrics
    html.push_str("        <h2>šŸ“Š Overall Metrics</h2>\n");
    html.push_str("        <div class=\"metric\">\n");
    html.push_str(&format!(
        "            <strong>Total test cases:</strong> {}<br>\n",
        results.overall_metrics.total_cases
    ));
    html.push_str(&format!(
        "            <strong>Overall phoneme accuracy:</strong> {:.2}%<br>\n",
        results.overall_metrics.overall_phoneme_accuracy * 100.0
    ));
    html.push_str(&format!(
        "            <strong>Overall word accuracy:</strong> {:.2}%<br>\n",
        results.overall_metrics.overall_word_accuracy * 100.0
    ));
    html.push_str(&format!(
        "            <strong>Targets met:</strong> {}/{} ({:.1}%)\n",
        results.overall_metrics.targets_met,
        results.overall_metrics.total_targets,
        results.overall_metrics.pass_rate
    ));
    html.push_str("        </div>\n");

    // Language results table
    html.push_str("        <h2>šŸŒ Language-Specific Results</h2>\n");
    html.push_str("        <table>\n");
    html.push_str("            <tr><th>Language</th><th>Accuracy</th></tr>\n");
    for (language, accuracy) in &results.overall_metrics.language_accuracies {
        html.push_str(&format!(
            "            <tr><td>{:?}</td><td>{:.2}%</td></tr>\n",
            language,
            accuracy * 100.0
        ));
    }
    html.push_str("        </table>\n");

    // Dataset results table
    html.push_str("        <h2>šŸ“š Dataset Results</h2>\n");
    html.push_str("        <table>\n");
    html.push_str("            <tr><th>Dataset</th><th>Status</th><th>Language</th><th>Cases</th><th>Phoneme Acc.</th><th>Word Acc.</th><th>Target</th></tr>\n");
    for (dataset_name, dataset_result) in &results.dataset_results {
        let status_class = if dataset_result.target_met {
            "pass"
        } else {
            "fail"
        };
        let status_text = if dataset_result.target_met {
            "āœ… PASS"
        } else {
            "āŒ FAIL"
        };
        html.push_str(&format!(
            "            <tr><td>{}</td><td class=\"{}\">{}</td><td>{:?}</td><td>{}</td><td>{:.2}%</td><td>{:.2}%</td><td>{:.1}%</td></tr>\n",
            dataset_name, status_class, status_text, dataset_result.language,
            dataset_result.total_cases, dataset_result.phoneme_accuracy * 100.0,
            dataset_result.word_accuracy * 100.0, dataset_result.target_accuracy * 100.0
        ));
    }
    html.push_str("        </table>\n");

    // Performance statistics
    html.push_str("        <h2>⚔ Performance Statistics</h2>\n");
    html.push_str("        <div class=\"metric\">\n");
    html.push_str(&format!(
        "            <strong>Average processing time:</strong> {:.2} ms<br>\n",
        results.performance_stats.avg_processing_time_ms
    ));
    html.push_str(&format!(
        "            <strong>Median processing time:</strong> {:.2} ms<br>\n",
        results.performance_stats.median_processing_time_ms
    ));
    html.push_str(&format!(
        "            <strong>95th percentile:</strong> {:.2} ms<br>\n",
        results.performance_stats.p95_processing_time_ms
    ));
    html.push_str(&format!(
        "            <strong>Throughput:</strong> {:.1} cases/sec<br>\n",
        results.performance_stats.throughput_cases_per_sec
    ));
    html.push_str(&format!(
        "            <strong>Peak memory usage:</strong> {:.1} MB\n",
        results.performance_stats.peak_memory_mb
    ));
    html.push_str("        </div>\n");

    // HTML footer
    html.push_str("    </div>\n</body>\n</html>");

    html
}

// Dummy system implementations for demonstration.
// In a real implementation, these would be replaced with actual G2P/TTS/ASR implementations.

#[cfg(not(doctest))]
struct DummyG2pSystem;
#[cfg(not(doctest))]
#[async_trait]
impl voirs_evaluation::accuracy_benchmarks::G2pSystem for DummyG2pSystem {
    async fn convert_to_phonemes(
        &self,
        text: &str,
        _language: LanguageCode,
    ) -> Result<Vec<String>, voirs_evaluation::EvaluationError> {
        // Trivial stand-in: one "phoneme" per character
        Ok(text.chars().map(|c| c.to_string()).collect())
    }
}

#[cfg(not(doctest))]
struct DummyTtsSystem;
#[cfg(not(doctest))]
#[async_trait]
impl voirs_evaluation::accuracy_benchmarks::TtsSystem for DummyTtsSystem {
    async fn synthesize(
        &self,
        _text: &str,
        _language: LanguageCode,
    ) -> Result<voirs_sdk::AudioBuffer, voirs_evaluation::EvaluationError> {
        // One second of constant-amplitude mono audio at 16 kHz
        Ok(voirs_sdk::AudioBuffer::mono(vec![0.1; 16000], 16000))
    }
}

#[cfg(not(doctest))]
struct DummyAsrSystem;
#[cfg(not(doctest))]
#[async_trait]
impl voirs_evaluation::accuracy_benchmarks::AsrSystem for DummyAsrSystem {
    async fn transcribe(
        &self,
        _audio: &voirs_sdk::AudioBuffer,
        _language: LanguageCode,
    ) -> Result<String, voirs_evaluation::EvaluationError> {
        // Fixed transcription regardless of input
        Ok("dummy transcription".to_string())
    }
}