use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
/// Settings for one comparison benchmark run.
///
/// NOTE(review): how each field is consumed is not visible in this file; the
/// per-field notes describe only the type and the value supplied by `Default`.
#[derive(Debug, Clone)]
pub struct ComparisonConfig {
    /// Core count setting; `Default` uses 12.
    pub cores: usize,
    /// Output directory; `Default` uses `benchmark_results`.
    pub output_dir: PathBuf,
    /// Datasets to run against; empty by default.
    pub datasets: Vec<DatasetConfig>,
    /// Parsing toggle; on by default.
    pub run_parsing: bool,
    /// Normalization toggle; on by default.
    pub run_normalization: bool,
    /// Mutalyzer toggle; off by default.
    pub include_mutalyzer: bool,
    /// Mutalyzer API URL; `Default` uses `http://localhost:8082`.
    pub mutalyzer_api_url: Option<String>,
    /// Normalization sample size; `Default` uses 10 000.
    pub normalization_sample_size: usize,
    /// Optional reference path; unset by default.
    pub reference_path: Option<PathBuf>,
    /// Skip-existing toggle; on by default.
    pub skip_existing: bool,
}

impl Default for ComparisonConfig {
    /// Builds the default configuration: 12 cores, output under
    /// `benchmark_results`, no datasets, parsing and normalization enabled,
    /// Mutalyzer disabled but with a localhost API URL pre-filled.
    fn default() -> Self {
        let output_dir = PathBuf::from("benchmark_results");
        let mutalyzer_api_url = Some(String::from("http://localhost:8082"));

        Self {
            cores: 12,
            output_dir,
            datasets: Vec::new(),
            run_parsing: true,
            run_normalization: true,
            include_mutalyzer: false,
            mutalyzer_api_url,
            normalization_sample_size: 10_000,
            reference_path: None,
            skip_existing: true,
        }
    }
}
/// Describes one benchmark dataset: a name, a source path, a format and a description.
///
/// NOTE(review): how the fields are consumed is not visible here — confirm against the loader.
#[derive(Debug, Clone)]
pub struct DatasetConfig {
/// Dataset name.
pub name: String,
/// Path the dataset is read from.
pub source: PathBuf,
/// On-disk format of `source`.
pub format: DatasetFormat,
/// Free-text description.
pub description: String,
}
/// Supported dataset file formats.
///
/// NOTE(review): the exact layout expected for each variant is defined by the
/// loader, which is not visible here; the notes below are inferred from the names.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DatasetFormat {
/// Presumably a ClinVar tab-separated export.
ClinvarTsv,
/// Presumably a JSON file of test cases.
TestCasesJson,
/// Presumably plain text — confirm the one-pattern-per-line assumption with the loader.
PlainText,
/// Presumably a JSON array of patterns.
JsonArray,
}
/// Serializable record of a reference mismatch attached to a [`ParseResult`].
///
/// NOTE(review): field meanings are inferred from their names; confirm against
/// the code that populates this struct.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RefMismatchInfo {
/// Presumably the reference sequence as stated in the input.
pub stated_ref: String,
/// Presumably the sequence actually present in the reference.
pub actual_ref: String,
/// Position of the mismatch, kept as a string.
pub position: String,
/// Presumably whether the mismatch was corrected — confirm with the producer.
pub corrected: bool,
}
/// Serializable per-input result record.
///
/// Every `Option` field is left out of the serialized output when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParseResult {
/// The input pattern this result belongs to.
pub input: String,
/// Whether processing the input succeeded.
pub success: bool,
/// Output string, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub output: Option<String>,
/// Error message, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
/// Error category label, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub error_category: Option<String>,
/// Reference-mismatch information, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub ref_mismatch: Option<RefMismatchInfo>,
/// Structured details of the parsed variant, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub details: Option<ParsedVariantDetails>,
}
/// Structured breakdown of a parsed variant, carried in [`ParseResult::details`].
///
/// Every `Option` field is left out of the serialized output when it is `None`.
/// NOTE(review): the vocabulary of the string fields is set by the producer,
/// which is not visible here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsedVariantDetails {
/// Reference identifier, as a string.
pub reference: String,
/// Coordinate system label, as a string.
pub coordinate_system: String,
/// Variant type label, as a string.
pub variant_type: String,
/// Position of the variant.
pub position: PositionDetails,
/// Deleted sequence, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub deleted: Option<String>,
/// Inserted sequence, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub inserted: Option<String>,
/// Presumably whether the variant was shifted during normalization — confirm with the producer.
#[serde(skip_serializing_if = "Option::is_none")]
pub was_shifted: Option<bool>,
/// Presumably the position before shifting — confirm with the producer.
#[serde(skip_serializing_if = "Option::is_none")]
pub original_position: Option<String>,
}
/// Position of a variant: a start, an optional end and offset, and a display string.
///
/// `end` and `offset` are left out of the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PositionDetails {
/// Start coordinate (signed).
pub start: i64,
/// End coordinate, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub end: Option<i64>,
/// Offset, if any (presumably an intronic offset — confirm with the producer).
#[serde(skip_serializing_if = "Option::is_none")]
pub offset: Option<i32>,
/// Textual rendering of the position.
pub display: String,
}
/// Timing and success counts for one tool run; built by `TimingInfo::new`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimingInfo {
/// Name of the tool that was timed.
pub tool: String,
/// Total number of patterns processed.
pub total_patterns: usize,
/// Number of patterns counted as successful.
pub successful: usize,
/// Number of patterns not counted as successful (`new` derives it from the two counts above).
pub failed: usize,
/// Elapsed time in seconds.
pub elapsed_seconds: f64,
/// `total_patterns / elapsed_seconds`; `new` stores 0.0 when no time elapsed.
pub patterns_per_second: f64,
/// Elapsed nanoseconds divided by `total_patterns`; `new` stores 0.0 when there are no patterns.
pub avg_ns_per_pattern: f64,
}
impl TimingInfo {
    /// Builds a timing record for `tool` from raw counts and a measured duration.
    ///
    /// * `total` – number of patterns processed.
    /// * `successful` – number of those that succeeded.
    /// * `elapsed` – time taken for the whole run.
    ///
    /// `patterns_per_second` is 0.0 when `elapsed` is zero and
    /// `avg_ns_per_pattern` is 0.0 when `total` is zero, so neither division
    /// can produce `inf`/`NaN`. `failed` is clamped at zero when
    /// `successful > total` instead of underflowing.
    pub fn new(tool: &str, total: usize, successful: usize, elapsed: Duration) -> Self {
        let elapsed_secs = elapsed.as_secs_f64();

        // Any non-zero `Duration` is at least 1 ns, so `> 0.0` accepts exactly
        // the same inputs as the previous `> f64::EPSILON` guard while matching
        // the other constructors in this file.
        let throughput = if elapsed_secs > 0.0 {
            total as f64 / elapsed_secs
        } else {
            0.0
        };

        let avg_ns = if total > 0 {
            elapsed.as_nanos() as f64 / total as f64
        } else {
            0.0
        };

        Self {
            tool: tool.to_string(),
            total_patterns: total,
            successful,
            // Plain `total - successful` panics in debug builds and wraps in
            // release builds if a caller passes inconsistent counts.
            failed: total.saturating_sub(successful),
            elapsed_seconds: elapsed_secs,
            patterns_per_second: throughput,
            avg_ns_per_pattern: avg_ns,
        }
    }
}
/// Results for a single shard of input processed by one tool.
///
/// The two result vectors are left out of the serialized output when empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardResults {
/// Index of the shard.
pub shard_index: usize,
/// Name of the tool that processed the shard.
pub tool: String,
/// Input file for the shard, as a string.
pub input_file: String,
/// Timing and counts for the shard.
pub timing: TimingInfo,
/// Sample of per-pattern results.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub sample_results: Vec<ParseResult>,
/// Examples of failed patterns.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub failed_examples: Vec<ParseResult>,
}
/// Parsing results for one dataset: ferro-hgvs figures plus optional Mutalyzer figures.
///
/// The `Option` fields are left out of the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsingComparison {
/// Dataset name.
pub dataset: String,
/// Aggregated ferro-hgvs results.
pub ferro_hgvs: AggregatedResults,
/// Aggregated Mutalyzer results, if available.
#[serde(skip_serializing_if = "Option::is_none")]
pub mutalyzer: Option<AggregatedResults>,
/// Speed-up factor, if available (NOTE(review): ratio direction is set by the producer — confirm).
#[serde(skip_serializing_if = "Option::is_none")]
pub speedup: Option<f64>,
/// Agreement statistics between the two tools, if available.
#[serde(skip_serializing_if = "Option::is_none")]
pub agreement: Option<AgreementStats>,
}
/// Totals over several [`TimingInfo`] records; see `AggregatedResults::from_timings`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AggregatedResults {
/// Sum of processed patterns.
pub total_patterns: usize,
/// Sum of successful patterns.
pub successful: usize,
/// Patterns not counted as successful.
pub failed: usize,
/// `successful / total_patterns` as a fraction; `from_timings` stores 0.0 when there are no patterns.
pub pass_rate: f64,
/// Sum of elapsed seconds over all records.
pub total_time_seconds: f64,
/// `total_patterns / total_time_seconds`; `from_timings` stores 0.0 when the time is zero.
pub throughput: f64,
}
impl AggregatedResults {
    /// Sums a set of per-run timings into one aggregate.
    ///
    /// Pattern counts and elapsed seconds are added across `timings`.
    /// `pass_rate` is `successful / total` (0.0 for no patterns) and
    /// `throughput` is `total / total_time` (0.0 for no elapsed time).
    /// An empty slice therefore yields an all-zero aggregate.
    ///
    /// `failed` is clamped at zero: `TimingInfo` can be deserialized from
    /// disk, so records where `successful > total_patterns` must not cause
    /// an integer underflow here.
    pub fn from_timings(timings: &[TimingInfo]) -> Self {
        let total: usize = timings.iter().map(|t| t.total_patterns).sum();
        let successful: usize = timings.iter().map(|t| t.successful).sum();
        let total_time: f64 = timings.iter().map(|t| t.elapsed_seconds).sum();

        let pass_rate = if total > 0 {
            successful as f64 / total as f64
        } else {
            0.0
        };
        let throughput = if total_time > 0.0 {
            total as f64 / total_time
        } else {
            0.0
        };

        Self {
            total_patterns: total,
            successful,
            failed: total.saturating_sub(successful),
            pass_rate,
            total_time_seconds: total_time,
            throughput,
        }
    }
}
/// Counts describing how often the two tools agree.
///
/// NOTE(review): how `agreements` and `agreement_rate` are derived from the
/// four outcome counts is decided by the producer, which is not visible here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AgreementStats {
/// Inputs on which both tools succeeded.
pub both_success: usize,
/// Inputs on which both tools failed.
pub both_fail: usize,
/// Inputs on which only ferro succeeded.
pub ferro_only_success: usize,
/// Inputs on which only Mutalyzer succeeded.
pub mutalyzer_only_success: usize,
/// Number of agreements.
pub agreements: usize,
/// Agreement rate.
pub agreement_rate: f64,
/// Example disagreements; left out of the serialized output when empty.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub disagreement_examples: Vec<DisagreementExample>,
}
/// One input on which the two tools produced different outputs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DisagreementExample {
/// The input pattern.
pub input: String,
/// Output from ferro.
pub ferro_output: String,
/// Output from Mutalyzer.
pub mutalyzer_output: String,
}
/// Statistics about reference corrections and the resulting agreement rates.
///
/// NOTE(review): what distinguishes "strict" from "lenient" agreement is
/// defined by the producer, which is not visible here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReferenceStats {
/// Number of corrections.
pub corrections_count: usize,
/// Strict agreement rate.
pub strict_agreement_rate: f64,
/// Lenient agreement rate.
pub lenient_agreement_rate: f64,
/// Patterns that had corrections; left out of the serialized output when empty.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub patterns_with_corrections: Vec<String>,
}
/// Top-level summary of a comparison run.
///
/// NOTE(review): the map keys are presumably dataset names — confirm with the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonSummary {
/// UTC timestamp stored with the summary.
pub generated: DateTime<Utc>,
/// Subset of the run configuration recorded with the summary.
pub config: SummaryConfig,
/// Parsing comparisons, keyed by string.
pub parsing: HashMap<String, ParsingComparison>,
/// Normalization comparisons, keyed by string.
pub normalization: HashMap<String, NormalizationComparison>,
/// Aggregate figures across the run.
pub aggregate: AggregateStats,
}
/// Serializable subset of configuration values stored in a [`ComparisonSummary`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SummaryConfig {
/// Core count setting.
pub cores: usize,
/// Whether Mutalyzer was included.
pub include_mutalyzer: bool,
/// Normalization sample size.
pub normalization_sample_size: usize,
}
/// Run-wide aggregate figures stored in a [`ComparisonSummary`].
///
/// Unlike most optional fields in this file, the `Option`s here carry no
/// `skip_serializing_if` attribute.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AggregateStats {
/// Total number of patterns.
pub total_patterns: usize,
/// ferro throughput.
pub ferro_throughput: f64,
/// Mutalyzer throughput, if available.
pub mutalyzer_throughput: Option<f64>,
/// Speed-up factor, if available (NOTE(review): ratio direction is set by the producer — confirm).
pub speedup: Option<f64>,
}
/// Normalization results for one dataset: ferro-hgvs figures plus optional Mutalyzer figures.
///
/// The `Option` fields are left out of the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NormalizationComparison {
/// Dataset name.
pub dataset: String,
/// Number of patterns sampled.
pub sample_size: usize,
/// Aggregated ferro-hgvs results.
pub ferro_hgvs: AggregatedResults,
/// Aggregated Mutalyzer results, if available.
#[serde(skip_serializing_if = "Option::is_none")]
pub mutalyzer: Option<AggregatedResults>,
/// Agreement statistics between the two tools, if available.
#[serde(skip_serializing_if = "Option::is_none")]
pub agreement: Option<AgreementStats>,
}
/// Which operation a comparison exercises.
///
/// The `Display` impl in this file renders the variants as `normalize` and `parse`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum CompareMode {
/// Normalization mode.
Normalize,
/// Parsing mode.
Parse,
}
/// External validator selection; `Mutalyzer` is the `Default`.
///
/// The `Display` and `FromStr` impls in this file use the names `mutalyzer`,
/// `biocommons` and `hgvs-rs`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum Validator {
/// Mutalyzer (the default).
#[default]
Mutalyzer,
/// biocommons (also parsed from `biocommons-hgvs` and `hgvs`).
Biocommons,
/// hgvs-rs (also parsed from `hgvsrs`).
HgvsRs,
}
impl std::fmt::Display for Validator {
    /// Writes the validator's canonical lower-case name.
    ///
    /// Width and alignment flags on the formatter are not applied; the name
    /// is written verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Validator::Mutalyzer => "mutalyzer",
            Validator::Biocommons => "biocommons",
            Validator::HgvsRs => "hgvs-rs",
        };
        f.write_str(name)
    }
}
impl std::str::FromStr for Validator {
    type Err = String;

    /// Parses a validator name case-insensitively.
    ///
    /// Accepted spellings: `mutalyzer`; `biocommons`, `biocommons-hgvs`,
    /// `hgvs`; `hgvs-rs`, `hgvsrs`. The input is not trimmed.
    ///
    /// # Errors
    /// Returns a message naming the rejected input (as given) and the valid
    /// options when the name is not recognised.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let lowered = s.to_lowercase();
        let validator = match lowered.as_str() {
            "mutalyzer" => Validator::Mutalyzer,
            "biocommons" | "biocommons-hgvs" | "hgvs" => Validator::Biocommons,
            "hgvs-rs" | "hgvsrs" => Validator::HgvsRs,
            _ => {
                return Err(format!(
                    "Unknown validator: {}. Valid options: mutalyzer, biocommons, hgvs-rs",
                    s
                ));
            }
        };
        Ok(validator)
    }
}
impl std::fmt::Display for CompareMode {
    /// Writes the mode as a lower-case word (`normalize` or `parse`).
    ///
    /// Width and alignment flags on the formatter are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            CompareMode::Normalize => "normalize",
            CompareMode::Parse => "parse",
        };
        f.write_str(label)
    }
}
/// Result of one ferro-versus-Mutalyzer comparison in a given [`CompareMode`].
///
/// `differences` is left out of the serialized output when empty; the two
/// `Option` fields are left out when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonResult {
/// Mode the comparison ran in.
pub mode: CompareMode,
/// UTC timestamp stored with the result.
pub timestamp: DateTime<Utc>,
/// Number of patterns sampled.
pub sample_size: usize,
/// ferro figures.
pub ferro: ComparisonToolResult,
/// Mutalyzer figures.
pub mutalyzer: ComparisonToolResult,
/// Speed-up factor (NOTE(review): ratio direction is set by the producer — confirm).
pub speedup: f64,
/// Agreement statistics.
pub agreement: AgreementStats,
/// Inputs on which the tools differed.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub differences: Vec<DisagreementExample>,
/// Reference-correction statistics, if collected.
#[serde(skip_serializing_if = "Option::is_none")]
pub reference_stats: Option<ReferenceStats>,
/// Cache statistics, if collected.
#[serde(skip_serializing_if = "Option::is_none")]
pub cache_stats: Option<CacheStats>,
}
/// Per-tool figures within a [`ComparisonResult`]; built by `ComparisonToolResult::new`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonToolResult {
/// Elapsed time in seconds.
pub elapsed_seconds: f64,
/// Number of successful patterns.
pub successful: usize,
/// Number of patterns not counted as successful.
pub failed: usize,
/// Patterns per second; `new` stores 0.0 when no time elapsed.
pub throughput: f64,
/// Optional error counts keyed by string; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub error_counts: Option<HashMap<String, usize>>,
}
/// Counts of accessions and how many are missing for each tool.
///
/// NOTE(review): what counts as "missing" is decided by the producer, which
/// is not visible here. The example vectors are left out of the serialized
/// output when empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheStats {
/// Total number of accessions.
pub total_accessions: usize,
/// Accessions missing for ferro.
pub ferro_missing: usize,
/// Accessions missing for Mutalyzer.
pub mutalyzer_missing: usize,
/// Example accessions missing for ferro.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub ferro_missing_examples: Vec<String>,
/// Example accessions missing for Mutalyzer.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub mutalyzer_missing_examples: Vec<String>,
}
impl ComparisonToolResult {
    /// Builds per-tool figures from raw counts and a measured duration.
    ///
    /// * `total` – number of patterns processed.
    /// * `successful` – number of those that succeeded.
    /// * `elapsed` – time taken.
    ///
    /// `throughput` is `total / elapsed` in patterns per second, or 0.0 when
    /// `elapsed` is zero. `failed` is clamped at zero when
    /// `successful > total` instead of underflowing. `error_counts` starts
    /// as `None`; attach it with [`with_error_counts`](Self::with_error_counts).
    pub fn new(total: usize, successful: usize, elapsed: std::time::Duration) -> Self {
        let elapsed_secs = elapsed.as_secs_f64();
        let throughput = if elapsed_secs > 0.0 {
            total as f64 / elapsed_secs
        } else {
            0.0
        };

        Self {
            elapsed_seconds: elapsed_secs,
            successful,
            // Plain subtraction panics in debug builds and wraps in release
            // builds when the counts are inconsistent.
            failed: total.saturating_sub(successful),
            throughput,
            error_counts: None,
        }
    }

    /// Returns `self` with `error_counts` set to `Some(counts)`.
    pub fn with_error_counts(mut self, counts: HashMap<String, usize>) -> Self {
        self.error_counts = Some(counts);
        self
    }
}
/// Coarse category assigned to a variant pattern by `PatternCategory::categorize`.
///
/// The notes on each variant describe the textual test `categorize` applies.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum PatternCategory {
/// `:g.` pattern that ends with `>` or contains exactly one `>`.
GenomicSnv,
/// `:g.` pattern containing `del` but not `ins`.
GenomicDel,
/// `:g.` pattern containing `ins` but not `del`.
GenomicIns,
/// `:g.` pattern containing `dup`.
GenomicDup,
/// `:g.` pattern containing both `del` and `ins`.
GenomicDelins,
/// `:g.` pattern containing `inv`.
GenomicInv,
/// `:c.` pattern that ends with `>` or contains exactly one `>`.
CodingSnv,
/// `:c.` pattern whose position carries a `+`/`-`.
CodingIntronic,
/// `:c.` pattern containing `del`.
CodingDel,
/// `:c.` pattern containing `ins`.
CodingIns,
/// `:c.` pattern containing `dup`.
CodingDup,
/// `:p.` pattern that is neither a frameshift nor an extension.
ProteinSub,
/// `:p.` pattern containing `fs`.
ProteinFs,
/// `:p.` pattern containing `ext`.
ProteinExt,
/// Pattern containing `:n.`.
NonCoding,
/// Pattern containing `:m.`.
Mitochondrial,
/// Pattern containing `:r.`.
Rna,
/// Pattern containing `:o.`.
Circular,
/// Pattern without a recognised prefix that contains both `[` and `]`.
Repeat,
/// Pattern without a recognised prefix that contains `?`.
Uncertain,
/// Anything not matched by the tests above.
Other,
}
impl PatternCategory {
pub fn is_protein(&self) -> bool {
matches!(
self,
PatternCategory::ProteinSub | PatternCategory::ProteinFs | PatternCategory::ProteinExt
)
}
pub fn categorize(pattern: &str) -> Self {
if pattern.contains(":g.") {
if pattern.ends_with('>') || pattern.chars().filter(|&c| c == '>').count() == 1 {
return Self::GenomicSnv;
}
if pattern.contains("del") && pattern.contains("ins") {
return Self::GenomicDelins;
}
if pattern.contains("del") {
return Self::GenomicDel;
}
if pattern.contains("ins") {
return Self::GenomicIns;
}
if pattern.contains("dup") {
return Self::GenomicDup;
}
if pattern.contains("inv") {
return Self::GenomicInv;
}
return Self::Other;
}
if pattern.contains(":c.") {
if pattern.contains('+') || pattern.contains('-') {
let after_dot = pattern.split(":c.").nth(1).unwrap_or("");
if after_dot.contains('+') || after_dot.chars().any(|c| c == '-') {
return Self::CodingIntronic;
}
}
if pattern.ends_with('>') || pattern.chars().filter(|&c| c == '>').count() == 1 {
return Self::CodingSnv;
}
if pattern.contains("del") {
return Self::CodingDel;
}
if pattern.contains("ins") {
return Self::CodingIns;
}
if pattern.contains("dup") {
return Self::CodingDup;
}
return Self::Other;
}
if pattern.contains(":p.") {
if pattern.contains("fs") {
return Self::ProteinFs;
}
if pattern.contains("ext") {
return Self::ProteinExt;
}
return Self::ProteinSub;
}
if pattern.contains(":n.") {
return Self::NonCoding;
}
if pattern.contains(":m.") {
return Self::Mitochondrial;
}
if pattern.contains(":r.") {
return Self::Rna;
}
if pattern.contains(":o.") {
return Self::Circular;
}
if pattern.contains('[') && pattern.contains(']') {
return Self::Repeat;
}
if pattern.contains("(?)") || pattern.contains('?') {
return Self::Uncertain;
}
Self::Other
}
}
/// Side-by-side ferro and Mutalyzer outcome for one input pattern.
///
/// The output/error `Option` fields are left out of the serialized output
/// when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CombinedPatternResult {
/// The input pattern.
pub input: String,
/// Whether ferro succeeded.
pub ferro_success: bool,
/// ferro output, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub ferro_output: Option<String>,
/// ferro error message, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub ferro_error: Option<String>,
/// Whether Mutalyzer succeeded.
pub mutalyzer_success: bool,
/// Mutalyzer output, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub mutalyzer_output: Option<String>,
/// Mutalyzer error message, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub mutalyzer_error: Option<String>,
/// Whether the two outputs match, when known; serialized under the key `match`.
#[serde(rename = "match")]
pub outputs_match: Option<bool>,
}
/// Full per-pattern results of one comparison run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetailedResults {
/// Mode the comparison ran in.
pub mode: CompareMode,
/// UTC timestamp stored with the results.
pub timestamp: DateTime<Utc>,
/// Number of patterns sampled.
pub sample_size: usize,
/// One entry per compared pattern.
pub results: Vec<CombinedPatternResult>,
}
/// Settings for a Mutalyzer benchmark run.
///
/// NOTE(review): how each field is consumed is not visible in this file; the
/// per-field notes describe only the type and the value supplied by `Default`.
#[derive(Debug, Clone)]
pub struct MutalyzerBenchmarkConfig {
    /// Input path; empty by default.
    pub input_path: PathBuf,
    /// Output path; empty by default.
    pub output_path: PathBuf,
    /// Worker count; `Default` uses 24.
    pub workers: usize,
    /// Batch size; `Default` uses 10 000.
    pub batch_size: usize,
    /// Optional settings file; unset by default.
    pub settings_file: Option<PathBuf>,
    /// Network-access toggle; off by default.
    pub allow_network: bool,
    /// Optional path to existing results; unset by default.
    pub existing_path: Option<PathBuf>,
    /// Progress interval; `Default` uses 30 (unit not visible here — presumably seconds, confirm).
    pub progress_interval: u64,
    /// Skip-failed toggle; off by default.
    pub skip_failed: bool,
}

impl Default for MutalyzerBenchmarkConfig {
    /// Builds the default configuration: empty input/output paths, 24
    /// workers, batches of 10 000, no optional paths, every toggle off and a
    /// progress interval of 30.
    fn default() -> Self {
        let unset_path = PathBuf::new();

        Self {
            input_path: unset_path.clone(),
            output_path: unset_path,
            workers: 24,
            batch_size: 10_000,
            settings_file: None,
            existing_path: None,
            allow_network: false,
            skip_failed: false,
            progress_interval: 30,
        }
    }
}
/// Previously recorded results, split into successes and failures and keyed
/// by string (the key is what `should_process` is queried with).
#[derive(Debug, Clone, Default)]
pub struct ExistingResults {
    /// Results recorded as successful.
    pub successful: HashMap<String, ParseResult>,
    /// Results recorded as failed.
    pub failed: HashMap<String, ParseResult>,
}

impl ExistingResults {
    /// Creates an instance with both maps empty.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns `true` unless `pattern` already has a successful result.
    ///
    /// Only `successful` is consulted, so a pattern that is present in
    /// `failed` is still reported as needing processing.
    pub fn should_process(&self, pattern: &str) -> bool {
        self.successful.get(pattern).is_none()
    }

    /// Number of entries in `successful`.
    pub fn successful_count(&self) -> usize {
        self.successful.len()
    }

    /// Number of entries in `failed`.
    pub fn failed_count(&self) -> usize {
        self.failed.len()
    }
}
/// Metadata recorded for a benchmark run.
///
/// The `Option` fields are left out of the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkMetadata {
/// UTC start time.
pub start_time: DateTime<Utc>,
/// UTC end time, if set.
#[serde(skip_serializing_if = "Option::is_none")]
pub end_time: Option<DateTime<Utc>>,
/// Total number of patterns.
pub total_patterns: usize,
/// Worker count.
pub workers: usize,
/// Batch size.
pub batch_size: usize,
/// Whether network access was allowed.
pub allow_network: bool,
/// Existing-results file, as a string, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub existing_file: Option<String>,
}
/// Summary of a benchmark run: metadata, counts, timing and error counts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSummary {
/// Run metadata.
pub metadata: BenchmarkMetadata,
/// Number of patterns processed.
pub processed: usize,
/// Number of successful patterns.
pub successful: usize,
/// Number of failed patterns.
pub failed: usize,
/// Elapsed time in seconds.
pub elapsed_seconds: f64,
/// Throughput (presumably patterns per second — confirm with the producer).
pub throughput: f64,
/// Error counts keyed by string.
pub error_counts: HashMap<String, usize>,
/// Statistics about reused existing results; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub existing_stats: Option<ExistingStats>,
}
/// Counts describing how existing results were reused.
///
/// NOTE(review): exact counting rules are set by the producer, which is not visible here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExistingStats {
/// Number of patterns skipped.
pub skipped: usize,
/// Number of patterns retried.
pub retried: usize,
/// Number of retries that succeeded.
pub retry_successes: usize,
}
/// Complete output of one tool over a set of patterns; built by `ToolParseOutput::new`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolParseOutput {
/// Name of the tool.
pub tool: String,
/// Number of results (`new` uses `results.len()`).
pub total_patterns: usize,
/// Number of results whose `success` flag is set.
pub successful: usize,
/// `total_patterns - successful`.
pub failed: usize,
/// Elapsed time in seconds.
pub elapsed_seconds: f64,
/// `total_patterns / elapsed_seconds`; `new` stores 0.0 when no time elapsed.
pub throughput: f64,
/// The per-pattern results.
pub results: Vec<ParseResult>,
}
impl ToolParseOutput {
    /// Wraps a tool's per-pattern `results` together with derived counts.
    ///
    /// `total_patterns` is `results.len()`, `successful` counts results whose
    /// `success` flag is set, and `failed` is the remainder. `throughput` is
    /// patterns per second of `elapsed`, or 0.0 when `elapsed` is zero.
    pub fn new(tool: &str, results: Vec<ParseResult>, elapsed: std::time::Duration) -> Self {
        let total_patterns = results.len();

        let mut successful = 0usize;
        for result in &results {
            if result.success {
                successful += 1;
            }
        }

        let elapsed_seconds = elapsed.as_secs_f64();
        let throughput = match elapsed_seconds > 0.0 {
            true => total_patterns as f64 / elapsed_seconds,
            false => 0.0,
        };

        Self {
            tool: tool.to_string(),
            total_patterns,
            successful,
            failed: total_patterns - successful,
            elapsed_seconds,
            throughput,
            results,
        }
    }
}