const MUTALYZER_PARSE_SCRIPT: &str = r#"
import sys
import json
import time
try:
from mutalyzer_hgvs_parser import to_model
except ImportError:
print("ERROR: mutalyzer-hgvs-parser not installed", file=sys.stderr)
sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2]
results = []
successful = 0
failed = 0
with open(input_file, 'r') as f:
patterns = [line.strip() for line in f if line.strip()]
start = time.perf_counter()
for pattern in patterns:
try:
model = to_model(pattern)
results.append({
"input": pattern,
"success": True,
"output": str(model) if model else pattern
})
successful += 1
except Exception as e:
results.append({
"input": pattern,
"success": False,
"error": str(e)[:200]
})
failed += 1
elapsed = time.perf_counter() - start
output = {
"tool": "mutalyzer-hgvs-parser",
"total_patterns": len(patterns),
"successful": successful,
"failed": failed,
"elapsed_seconds": elapsed,
"patterns_per_second": len(patterns) / elapsed if elapsed > 0 else 0,
"results": results
}
with open(output_file, 'w') as f:
json.dump(output, f, indent=2)
"#;
const MUTALYZER_NORMALIZE_SCRIPT: &str = r#"
import sys
import os
import gc
# Check if network is allowed (5th argument, default false)
allow_network = len(sys.argv) > 4 and sys.argv[4].lower() == 'true'
# Block network connections unless explicitly allowed
import socket
_original_connect = socket.socket.connect
if not allow_network:
def _blocked_connect(self, address):
raise RuntimeError(f"NETWORK ACCESS BLOCKED: Mutalyzer tried to connect to {address}. "
"Ensure all required sequences are in the local cache.")
socket.socket.connect = _blocked_connect
# CRITICAL: Set MUTALYZER_SETTINGS env var BEFORE importing mutalyzer
# mutalyzer-retriever reads config from a file specified by MUTALYZER_SETTINGS
if len(sys.argv) > 3 and sys.argv[3]:
settings_file = sys.argv[3]
os.environ['MUTALYZER_SETTINGS'] = settings_file
import json
import time
try:
from mutalyzer.description import Description
except ImportError:
print("ERROR: mutalyzer not installed", file=sys.stderr)
sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2]
# Extract worker ID from output filename for progress logging
worker_id = os.path.basename(output_file).replace('shard_', '').replace('_output.json', '')
from collections import defaultdict
results = []
successful = 0
failed = 0
error_counts = defaultdict(int)
def categorize_error(error_msg, pattern=None):
"""Categorize error message into a short category."""
msg = str(error_msg)
# Check for common mutalyzer error codes
if 'ESEQUENCEMISMATCH' in msg:
return 'ESEQUENCEMISMATCH'
elif 'ERETR' in msg or 'retriev' in msg.lower():
return 'RETRIEVAL_ERROR'
elif 'EPARSE' in msg or 'parse' in msg.lower() or 'syntax' in msg.lower():
return 'PARSE_ERROR'
elif 'EREF' in msg:
return 'EREF'
elif 'ERANGE' in msg or 'range' in msg.lower() or 'position' in msg.lower():
return 'RANGE_ERROR'
elif 'ENOTSUPPORTED' in msg or 'not supported' in msg.lower():
return 'NOT_SUPPORTED'
elif 'NETWORK ACCESS BLOCKED' in msg:
# LRG patterns require network for additional metadata
if pattern and pattern.startswith('LRG_'):
return 'LRG_NETWORK_REQUIRED'
return 'CACHE_MISS'
elif 'No model loaded' in msg or 'not found' in msg.lower():
return 'NOT_FOUND'
else:
# Extract error code if present (e.g., EXYZ)
import re
match = re.search(r'\bE[A-Z]{2,}', msg)
if match:
return match.group(0)
return 'OTHER'
with open(input_file, 'r') as f:
patterns = [line.strip() for line in f if line.strip()]
total = len(patterns)
start = time.perf_counter()
# Progress logging interval
progress_interval = max(50, total // 20) # Log ~20 times or every 50
for i, pattern in enumerate(patterns):
# Progress logging
if i > 0 and i % progress_interval == 0:
elapsed_so_far = time.perf_counter() - start
rate = i / elapsed_so_far if elapsed_so_far > 0 else 0
eta = (total - i) / rate if rate > 0 else 0
print(f"[worker {worker_id}] {i}/{total} ({100*i//total}%) - {rate:.1f} p/s - ETA {eta:.0f}s", file=sys.stderr, flush=True)
# Garbage collect to free memory from cached sequences
gc.collect()
try:
d = Description(description=pattern)
d.normalize()
if not d.errors:
output = d.output()
results.append({
"input": pattern,
"success": True,
"output": output.get("normalized_description", pattern)
})
successful += 1
else:
error_msg = str(d.errors[0]) if d.errors else "Unknown error"
category = categorize_error(error_msg, pattern)
error_counts[category] += 1
results.append({
"input": pattern,
"success": False,
"error": error_msg[:200],
"error_category": category
})
failed += 1
except Exception as e:
error_msg = str(e)
category = categorize_error(error_msg, pattern)
error_counts[category] += 1
results.append({
"input": pattern,
"success": False,
"error": error_msg[:200],
"error_category": category
})
failed += 1
elapsed = time.perf_counter() - start
print(f"[worker {worker_id}] Done: {total} patterns in {elapsed:.1f}s ({total/elapsed:.1f} p/s)", file=sys.stderr, flush=True)
output_data = {
"tool": "mutalyzer",
"total_patterns": len(patterns),
"successful": successful,
"failed": failed,
"elapsed_seconds": elapsed,
"patterns_per_second": len(patterns) / elapsed if elapsed > 0 else 0,
"error_counts": dict(error_counts),
"results": results
}
with open(output_file, 'w') as f:
json.dump(output_data, f, indent=2)
"#;
const BIOCOMMONS_NORMALIZE_SCRIPT: &str = r#"
import sys
import os
import json
import time
from collections import defaultdict
# Optional: Set UTA database URL from environment or argument
# By default, biocommons uses remote UTA at uta.biocommons.org
if len(sys.argv) > 3 and sys.argv[3]:
os.environ['UTA_DB_URL'] = sys.argv[3]
try:
import hgvs.parser
import hgvs.normalizer
import hgvs.dataproviders.uta
except ImportError:
print("ERROR: hgvs (biocommons) not installed. Run: pip install hgvs", file=sys.stderr)
sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2]
# Initialize biocommons/hgvs components
try:
hdp = hgvs.dataproviders.uta.connect()
hp = hgvs.parser.Parser()
hn = hgvs.normalizer.Normalizer(hdp)
except Exception as e:
print(f"ERROR: Failed to initialize biocommons/hgvs: {e}", file=sys.stderr)
sys.exit(1)
results = []
successful = 0
failed = 0
error_counts = defaultdict(int)
def categorize_error(error_msg, pattern=None):
"""Categorize error message into a short category."""
msg = str(error_msg)
if 'HGVSParseError' in msg or 'parse' in msg.lower():
return 'PARSE_ERROR'
elif 'HGVSInvalidVariantError' in msg:
return 'INVALID_VARIANT'
elif 'HGVSDataNotAvailableError' in msg or 'not found' in msg.lower():
return 'DATA_NOT_AVAILABLE'
elif 'HGVSInvalidIntervalError' in msg:
return 'INVALID_INTERVAL'
elif 'HGVSNormalizationError' in msg:
return 'NORMALIZATION_ERROR'
elif 'HGVSUnsupportedOperationError' in msg:
return 'UNSUPPORTED'
elif 'connection' in msg.lower() or 'timeout' in msg.lower():
return 'CONNECTION_ERROR'
else:
return 'OTHER'
with open(input_file, 'r') as f:
patterns = [line.strip() for line in f if line.strip()]
start = time.perf_counter()
for pattern in patterns:
try:
# Parse the HGVS string
var = hp.parse_hgvs_variant(pattern)
# Normalize the variant
normalized = hn.normalize(var)
results.append({
"input": pattern,
"success": True,
"output": str(normalized)
})
successful += 1
except Exception as e:
error_msg = str(e)
category = categorize_error(error_msg, pattern)
error_counts[category] += 1
results.append({
"input": pattern,
"success": False,
"error": error_msg[:200],
"error_category": category
})
failed += 1
elapsed = time.perf_counter() - start
output_data = {
"tool": "biocommons-hgvs",
"total_patterns": len(patterns),
"successful": successful,
"failed": failed,
"elapsed_seconds": elapsed,
"patterns_per_second": len(patterns) / elapsed if elapsed > 0 else 0,
"error_counts": dict(error_counts),
"results": results
}
with open(output_file, 'w') as f:
json.dump(output_data, f, indent=2)
"#;
use crate::benchmark::biocommons::has_biocommons_normalizer;
use crate::benchmark::cache::extract_accessions_from_pattern;
use crate::benchmark::mutalyzer::{has_mutalyzer_normalizer, has_mutalyzer_parser};
use crate::benchmark::sample::stratified_sample_vec;
use crate::benchmark::types::{
AgreementStats, CacheStats, CombinedPatternResult, CompareMode, ComparisonResult,
ComparisonToolResult, DetailedResults, DisagreementExample, ParseResult, RefMismatchInfo,
ReferenceStats, Validator,
};
use crate::error_handling::{ErrorConfig, ErrorMode, InputPreprocessor};
use crate::hgvs::parser::parse_hgvs_lenient;
use crate::normalize::NormalizeConfig;
use crate::reference::ReferenceProvider;
use crate::{FerroError, MockProvider, MultiFastaProvider, Normalizer};
use chrono::{DateTime, Utc};
use rand::seq::SliceRandom;
use rand::SeedableRng;
use serde::Deserialize;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};
use std::time::Instant;
#[derive(Debug, Clone)]
pub struct CompareConfig {
pub validator: Validator,
pub sample_size: usize,
pub workers: usize,
pub seed: u64,
pub mutalyzer_settings: Option<String>,
pub uta_db_url: Option<String>,
pub biocommons_settings: Option<std::path::PathBuf>,
pub hgvs_rs_seqrepo_path: Option<String>,
pub hgvs_rs_uta_schema: Option<String>,
pub reference_dir: Option<std::path::PathBuf>,
pub transcript_mapping: Option<std::path::PathBuf>,
pub include_unnormalizable: bool,
pub include_genomic: bool,
pub include_protein: bool,
pub allow_mutalyzer_network: bool,
pub detailed_output: Option<std::path::PathBuf>,
pub existing_mutalyzer_results: Option<std::path::PathBuf>,
pub error_mode: String,
}
impl Default for CompareConfig {
fn default() -> Self {
Self {
validator: Validator::default(),
sample_size: 1000,
workers: num_cpus::get(),
seed: 42,
mutalyzer_settings: None,
uta_db_url: None,
biocommons_settings: None,
hgvs_rs_seqrepo_path: None,
hgvs_rs_uta_schema: None,
reference_dir: None,
transcript_mapping: None,
include_unnormalizable: false,
include_genomic: false,
include_protein: false,
allow_mutalyzer_network: false,
detailed_output: None,
existing_mutalyzer_results: None,
error_mode: "strict".to_string(),
}
}
}
fn is_intronic_variant(pattern: &str) -> bool {
let Some(colon_pos) = pattern.find(':') else {
return false;
};
let variant_part = &pattern[colon_pos + 1..];
if !variant_part.starts_with("c.") && !variant_part.starts_with("n.") {
return false;
}
let chars: Vec<char> = variant_part.chars().collect();
for i in 0..chars.len().saturating_sub(2) {
if chars[i].is_ascii_digit()
&& (chars[i + 1] == '+' || chars[i + 1] == '-')
&& chars[i + 2].is_ascii_digit()
{
return true;
}
}
false
}
fn is_genomic_pattern(pattern: &str) -> bool {
pattern.starts_with("NC_")
}
fn is_protein_pattern(pattern: &str) -> bool {
pattern.starts_with("NP_") || pattern.contains(":p.")
}
fn is_normalizable(pattern: &str) -> bool {
let variant_part = if let Some(colon_pos) = pattern.find(':') {
&pattern[colon_pos + 1..]
} else {
pattern
};
if variant_part.contains('?') || variant_part.contains('(') || variant_part.contains(')') {
return false;
}
if variant_part.ends_with('=') {
return false;
}
true
}
pub fn rewrite_with_genomic_context(
patterns: &[String],
mapping: &HashMap<String, String>,
) -> Vec<String> {
patterns
.iter()
.map(|pattern| {
if !is_intronic_variant(pattern) {
return pattern.clone();
}
let Some(colon_pos) = pattern.find(':') else {
return pattern.clone();
};
let accession = &pattern[..colon_pos];
if !accession.starts_with("NM_") && !accession.starts_with("NR_") {
return pattern.clone();
}
if let Some(chromosome) = mapping.get(accession) {
let variant_part = &pattern[colon_pos..];
format!("{}({}){}", chromosome, accession, variant_part)
} else {
pattern.clone()
}
})
.collect()
}
fn extract_accession(pattern: &str) -> Option<&str> {
if let Some(paren_start) = pattern.find('(') {
if let Some(paren_end) = pattern.find(')') {
return Some(&pattern[paren_start + 1..paren_end]);
}
}
pattern.find(':').map(|pos| &pattern[..pos])
}
fn load_fai_accessions(cache_dir: &Path) -> HashSet<String> {
use std::io::BufRead;
let mut accessions = HashSet::new();
let transcripts_dir = cache_dir.join("transcripts");
if transcripts_dir.exists() {
if let Ok(entries) = std::fs::read_dir(&transcripts_dir) {
for entry in entries.filter_map(|e| e.ok()) {
let path = entry.path();
if path.extension().is_some_and(|ext| ext == "fai") {
if let Ok(file) = File::open(&path) {
let reader = std::io::BufReader::new(file);
for line in reader.lines().map_while(Result::ok) {
if let Some(name) = line.split('\t').next() {
accessions.insert(name.to_string());
}
}
}
}
}
}
}
let genome_dir = cache_dir.join("genome");
if genome_dir.exists() {
if let Ok(entries) = std::fs::read_dir(&genome_dir) {
for entry in entries.filter_map(|e| e.ok()) {
let path = entry.path();
if path.extension().is_some_and(|ext| ext == "fai") {
if let Ok(file) = File::open(&path) {
let reader = std::io::BufReader::new(file);
for line in reader.lines().map_while(Result::ok) {
if let Some(name) = line.split('\t').next() {
accessions.insert(name.to_string());
}
}
}
}
}
}
}
accessions
}
pub fn check_cache_coverage(
patterns: &[String],
ferro_results: &[ParseResult],
ferro_cache_dir: Option<&Path>,
mutalyzer_cache_dir: Option<&Path>,
) -> CacheStats {
let all_accessions: HashSet<&str> = patterns
.iter()
.filter_map(|p| extract_accession(p))
.collect();
let ferro_failed_accessions: HashSet<&str> = ferro_results
.iter()
.filter(|r| !r.success)
.filter_map(|r| extract_accession(&r.input))
.collect();
let total = all_accessions.len();
let mut ferro_missing = Vec::new();
let mut muta_missing = Vec::new();
let ferro_accessions = ferro_cache_dir.map(load_fai_accessions);
let mut sorted_ferro_failed: Vec<_> = ferro_failed_accessions.into_iter().collect();
sorted_ferro_failed.sort();
for acc in &sorted_ferro_failed {
if let Some(ref ferro_set) = ferro_accessions {
if !ferro_set.contains(*acc) {
ferro_missing.push((*acc).to_string());
}
}
}
let mut sorted_all: Vec<_> = all_accessions.into_iter().collect();
sorted_all.sort();
for acc in &sorted_all {
if let Some(cache_dir) = mutalyzer_cache_dir {
let check_acc = if acc.starts_with("LRG_") {
let base: String = acc
.chars()
.take_while(|c| {
c.is_ascii_digit() || *c == '_' || *c == 'L' || *c == 'R' || *c == 'G'
})
.collect();
if base.len() < acc.len() {
base
} else {
(*acc).to_string()
}
} else {
(*acc).to_string()
};
let seq_path = cache_dir.join(format!("{}.sequence", check_acc));
let ann_path = cache_dir.join(format!("{}.annotations", check_acc));
if !seq_path.exists() || !ann_path.exists() {
muta_missing.push((*acc).to_string());
}
}
}
ferro_missing.sort();
muta_missing.sort();
if !muta_missing.is_empty() {
if let Ok(mut file) = File::create("missing_accessions.txt") {
for acc in &muta_missing {
let _ = writeln!(file, "{}", acc);
}
eprintln!(
" Saved {} missing accessions to missing_accessions.txt",
muta_missing.len()
);
}
}
CacheStats {
total_accessions: total,
ferro_missing: ferro_missing.len(),
mutalyzer_missing: muta_missing.len(),
ferro_missing_examples: ferro_missing.into_iter().take(10).collect(),
mutalyzer_missing_examples: muta_missing.into_iter().take(10).collect(),
}
}
#[derive(Debug, Deserialize)]
#[allow(dead_code)]
struct MutalyzerOutput {
#[allow(dead_code)]
tool: String,
total_patterns: usize,
successful: usize,
#[allow(dead_code)]
failed: usize,
elapsed_seconds: f64,
#[allow(dead_code)]
patterns_per_second: f64,
results: Vec<ParseResult>,
#[serde(default)]
error_counts: HashMap<String, usize>,
}
pub fn compare_normalize<P: AsRef<Path>>(
input: P,
output: P,
config: &CompareConfig,
) -> Result<ComparisonResult, FerroError> {
let input = input.as_ref();
let output = output.as_ref();
match config.validator {
Validator::Mutalyzer => {
if !has_mutalyzer_normalizer() {
return Err(FerroError::Io {
msg: "mutalyzer package not installed. Install with: pip install mutalyzer"
.to_string(),
});
}
}
Validator::Biocommons => {
if !has_biocommons_normalizer() {
return Err(FerroError::Io {
msg: "biocommons/hgvs package not installed. Install with: pip install hgvs"
.to_string(),
});
}
}
Validator::HgvsRs => {
#[cfg(not(feature = "hgvs-rs"))]
{
return Err(FerroError::Io {
msg:
"hgvs-rs feature not enabled. Rebuild with: cargo build --features hgvs-rs"
.to_string(),
});
}
#[cfg(feature = "hgvs-rs")]
{
if config.hgvs_rs_seqrepo_path.is_none() {
return Err(FerroError::Io {
msg: "hgvs-rs validator requires --hgvs-rs-seqrepo-path".to_string(),
});
}
}
}
}
eprintln!("Using validator: {}", config.validator);
let transcript_mapping: Option<HashMap<String, String>> =
if let Some(mapping_path) = &config.transcript_mapping {
eprintln!(
"Loading transcript mapping from {}...",
mapping_path.display()
);
let mapping = crate::benchmark::cache::load_transcript_mapping(mapping_path)?;
eprintln!(" Loaded {} transcript→chromosome mappings", mapping.len());
Some(mapping)
} else {
None
};
let all_patterns = load_patterns(input)?;
if all_patterns.is_empty() {
return Err(FerroError::Io {
msg: "No patterns found in input file".to_string(),
});
}
eprintln!(
"Loaded {} patterns from {}",
all_patterns.len(),
input.display()
);
let patterns = {
let mut filtered = all_patterns;
let original_count = filtered.len();
if !config.include_unnormalizable {
let before = filtered.len();
filtered.retain(|p| is_normalizable(p));
let removed = before - filtered.len();
if removed > 0 {
eprintln!(
" Excluded {} unnormalizable patterns (uncertain breakpoints, reference-only)",
removed
);
}
}
if !config.include_genomic {
let before = filtered.len();
filtered.retain(|p| !is_genomic_pattern(p));
let removed = before - filtered.len();
if removed > 0 {
eprintln!(
" Excluded {} genomic (NC_) patterns (use --include-genomic to include)",
removed
);
}
}
if !config.include_protein {
let before = filtered.len();
filtered.retain(|p| !is_protein_pattern(p));
let removed = before - filtered.len();
if removed > 0 {
eprintln!(
" Excluded {} protein (NP_/p.) patterns (use --include-protein to include)",
removed
);
}
}
if filtered.len() < original_count {
eprintln!(" Filtered to {} patterns", filtered.len());
}
filtered
};
if patterns.is_empty() {
return Err(FerroError::Io {
msg: "No normalizable patterns found after filtering".to_string(),
});
}
let sampled = if config.sample_size > 0 && config.sample_size < patterns.len() {
sample_patterns(&patterns, config.sample_size, config.seed)
} else {
patterns
};
eprintln!("Using {} patterns for comparison", sampled.len());
eprintln!("\n[1/2] Running ferro-hgvs normalize...");
let (ferro_results, ferro_elapsed) =
run_ferro_normalize(&sampled, &config.reference_dir, &config.error_mode)?;
let ferro_successful = ferro_results.iter().filter(|r| r.success).count();
eprintln!(
" {} patterns in {:.2}s ({:.0} p/s)",
sampled.len(),
ferro_elapsed.as_secs_f64(),
sampled.len() as f64 / ferro_elapsed.as_secs_f64()
);
let (muta_results, muta_elapsed, muta_error_counts) = if let Some(ref existing_path) =
config.existing_mutalyzer_results
{
let start = Instant::now();
let existing_results = load_existing_mutalyzer_results(existing_path)?;
eprintln!("\n[2/2] Using pre-computed mutalyzer results...");
let mut results = Vec::with_capacity(sampled.len());
let mut found = 0;
let mut not_found = 0;
for pattern in &sampled {
if let Some(result) = existing_results.get(pattern) {
results.push(result.clone());
found += 1;
} else {
results.push(ParseResult {
input: pattern.clone(),
success: false,
output: None,
error: Some("Not in pre-computed results".to_string()),
error_category: Some("NOT_IN_CACHE".to_string()),
ref_mismatch: None,
details: None,
});
not_found += 1;
}
}
let elapsed = start.elapsed();
let successful = results.iter().filter(|r| r.success).count();
eprintln!(
" {} patterns in {:.2}s ({} found, {} not in cache)",
sampled.len(),
elapsed.as_secs_f64(),
found,
not_found
);
eprintln!(
" {} successful, {} failed",
successful,
sampled.len() - successful
);
let mut error_counts: HashMap<String, usize> = HashMap::new();
for result in &results {
if let Some(ref category) = result.error_category {
*error_counts.entry(category.clone()).or_insert(0) += 1;
}
}
(results, elapsed, error_counts)
} else {
match config.validator {
Validator::Mutalyzer => {
let mutalyzer_patterns = if let Some(ref mapping) = transcript_mapping {
let rewritten = rewrite_with_genomic_context(&sampled, mapping);
let intronic_count = sampled.iter().filter(|p| is_intronic_variant(p)).count();
let rewritten_count = sampled
.iter()
.zip(rewritten.iter())
.filter(|(orig, new)| orig != new)
.count();
eprintln!(
" Rewrote {}/{} intronic variants with genomic context",
rewritten_count, intronic_count
);
rewritten
} else {
sampled.clone()
};
eprintln!(
"\n[2/2] Running mutalyzer normalize ({} workers)...",
config.workers
);
let (results, elapsed, error_counts) = run_mutalyzer_normalize_parallel(
&mutalyzer_patterns,
config.workers,
config.mutalyzer_settings.as_deref(),
config.allow_mutalyzer_network,
)?;
eprintln!(
" {} patterns in {:.2}s ({:.0} p/s)",
sampled.len(),
elapsed.as_secs_f64(),
sampled.len() as f64 / elapsed.as_secs_f64()
);
(results, elapsed, error_counts)
}
Validator::Biocommons => {
eprintln!(
"\n[2/2] Running biocommons/hgvs normalize ({} workers)...",
config.workers
);
let (results, elapsed, error_counts) = run_biocommons_normalize_parallel(
&sampled,
config.workers,
config.uta_db_url.as_deref(),
)?;
eprintln!(
" {} patterns in {:.2}s ({:.0} p/s)",
sampled.len(),
elapsed.as_secs_f64(),
sampled.len() as f64 / elapsed.as_secs_f64()
);
(results, elapsed, error_counts)
}
Validator::HgvsRs => {
#[cfg(not(feature = "hgvs-rs"))]
{
return Err(FerroError::Io {
msg: "hgvs-rs feature not enabled".to_string(),
});
}
#[cfg(feature = "hgvs-rs")]
{
let workers = config.workers;
if workers > 1 {
eprintln!("\n[2/2] Running hgvs-rs normalize ({} workers)...", workers);
} else {
eprintln!("\n[2/2] Running hgvs-rs normalize...");
}
let hgvs_rs_config = crate::benchmark::HgvsRsConfig {
uta_db_url: config.uta_db_url.clone().unwrap_or_else(|| {
"postgresql://anonymous:anonymous@localhost:5432/uta".to_string()
}),
uta_db_schema: config
.hgvs_rs_uta_schema
.clone()
.unwrap_or_else(|| "uta_20210129b".to_string()),
seqrepo_path: config.hgvs_rs_seqrepo_path.clone().unwrap_or_default(),
lrg_mapping_file: None, };
let (results, elapsed, error_counts) =
crate::benchmark::run_hgvs_rs_normalize_parallel(
&sampled,
&hgvs_rs_config,
workers,
)?;
eprintln!(
" {} patterns in {:.2}s ({:.0} p/s)",
sampled.len(),
elapsed.as_secs_f64(),
sampled.len() as f64 / elapsed.as_secs_f64()
);
(results, elapsed, error_counts)
}
}
}
};
let muta_successful = muta_results.iter().filter(|r| r.success).count();
let cache_miss_patterns: Vec<_> = muta_results
.iter()
.filter(|r| r.error_category.as_deref() == Some("CACHE_MISS"))
.map(|r| r.input.clone())
.collect();
if !cache_miss_patterns.is_empty() {
if let Ok(mut file) = File::create("cache_miss_patterns.txt") {
for pattern in &cache_miss_patterns {
let _ = writeln!(file, "{}", pattern);
}
eprintln!(
" Saved {} CACHE_MISS patterns to cache_miss_patterns.txt",
cache_miss_patterns.len()
);
}
}
let (agreement, differences) = compute_agreement(&ferro_results, &muta_results);
let reference_stats = Some(compute_reference_stats(
&ferro_results,
&muta_results,
&agreement,
));
let mutalyzer_cache_dir = config
.mutalyzer_settings
.as_ref()
.and_then(|settings_path| {
Path::new(settings_path).parent().map(|p| p.to_path_buf())
});
let cache_stats = Some(check_cache_coverage(
&sampled,
&ferro_results,
config.reference_dir.as_deref(),
mutalyzer_cache_dir.as_deref(),
));
let ferro_tool = ComparisonToolResult::new(sampled.len(), ferro_successful, ferro_elapsed);
let muta_tool = ComparisonToolResult::new(sampled.len(), muta_successful, muta_elapsed)
.with_error_counts(muta_error_counts);
let speedup = if ferro_tool.elapsed_seconds > 0.0 {
muta_tool.elapsed_seconds / ferro_tool.elapsed_seconds
} else {
0.0
};
let result = ComparisonResult {
mode: CompareMode::Normalize,
timestamp: Utc::now(),
sample_size: sampled.len(),
ferro: ferro_tool,
mutalyzer: muta_tool,
speedup,
agreement,
differences,
reference_stats,
cache_stats,
};
print_comparison_summary(&result);
save_comparison_result(&result, output)?;
if let Some(ref detailed_path) = config.detailed_output {
save_detailed_results(
&ferro_results,
&muta_results,
CompareMode::Normalize,
result.timestamp,
detailed_path,
)?;
}
Ok(result)
}
pub fn compare_parse<P: AsRef<Path>>(
input: P,
output: P,
config: &CompareConfig,
) -> Result<ComparisonResult, FerroError> {
let input = input.as_ref();
let output = output.as_ref();
if !has_mutalyzer_parser() {
return Err(FerroError::Io {
msg: "mutalyzer-hgvs-parser not installed. Install with: pip install mutalyzer-hgvs-parser".to_string(),
});
}
let patterns = load_patterns(input)?;
if patterns.is_empty() {
return Err(FerroError::Io {
msg: "No patterns found in input file".to_string(),
});
}
eprintln!(
"Loaded {} patterns from {}",
patterns.len(),
input.display()
);
let sampled = if config.sample_size > 0 && config.sample_size < patterns.len() {
sample_patterns(&patterns, config.sample_size, config.seed)
} else {
patterns
};
eprintln!("Using {} patterns for comparison", sampled.len());
eprintln!("\n[1/2] Running ferro-hgvs parse...");
let (ferro_results, ferro_elapsed) = run_ferro_parse(&sampled, &config.error_mode);
let ferro_successful = ferro_results.iter().filter(|r| r.success).count();
eprintln!(
" {} patterns in {:.2}s ({:.0} p/s)",
sampled.len(),
ferro_elapsed.as_secs_f64(),
sampled.len() as f64 / ferro_elapsed.as_secs_f64()
);
eprintln!(
"\n[2/2] Running mutalyzer parse ({} workers)...",
config.workers
);
let (muta_results, muta_elapsed) = run_mutalyzer_parse_parallel(&sampled, config.workers)?;
let muta_successful = muta_results.iter().filter(|r| r.success).count();
eprintln!(
" {} patterns in {:.2}s ({:.0} p/s)",
sampled.len(),
muta_elapsed.as_secs_f64(),
sampled.len() as f64 / muta_elapsed.as_secs_f64()
);
let (agreement, differences) = compute_agreement(&ferro_results, &muta_results);
let ferro_tool = ComparisonToolResult::new(sampled.len(), ferro_successful, ferro_elapsed);
let muta_tool = ComparisonToolResult::new(sampled.len(), muta_successful, muta_elapsed);
let speedup = if ferro_tool.elapsed_seconds > 0.0 {
muta_tool.elapsed_seconds / ferro_tool.elapsed_seconds
} else {
0.0
};
let result = ComparisonResult {
mode: CompareMode::Parse,
timestamp: Utc::now(),
sample_size: sampled.len(),
ferro: ferro_tool,
mutalyzer: muta_tool,
speedup,
agreement,
differences,
reference_stats: None, cache_stats: None, };
print_comparison_summary(&result);
save_comparison_result(&result, output)?;
if let Some(ref detailed_path) = config.detailed_output {
save_detailed_results(
&ferro_results,
&muta_results,
CompareMode::Parse,
result.timestamp,
detailed_path,
)?;
}
Ok(result)
}
fn load_existing_mutalyzer_results<P: AsRef<Path>>(
path: P,
) -> Result<HashMap<String, ParseResult>, FerroError> {
let path = path.as_ref();
eprintln!(
"Loading pre-computed mutalyzer results from {}...",
path.display()
);
let file = File::open(path).map_err(|e| FerroError::Io {
msg: format!("Failed to open {}: {}", path.display(), e),
})?;
let reader: Box<dyn BufRead> = if path.extension().is_some_and(|e| e == "gz") {
Box::new(BufReader::new(flate2::read::GzDecoder::new(file)))
} else {
Box::new(BufReader::new(file))
};
let mut results = HashMap::new();
let mut line_count = 0;
let mut success_count = 0;
for line in reader.lines() {
let line = line.map_err(|e| FerroError::Io {
msg: format!("Failed to read line: {}", e),
})?;
if line.is_empty() {
continue;
}
let result: ParseResult = serde_json::from_str(&line).map_err(|e| FerroError::Json {
msg: format!("Failed to parse JSONL line: {}", e),
})?;
if result.success {
success_count += 1;
}
results.insert(result.input.clone(), result);
line_count += 1;
if line_count % 1_000_000 == 0 {
eprintln!(" Loaded {} results...", line_count);
}
}
eprintln!(
" Loaded {} results ({} successful, {} failed)",
line_count,
success_count,
line_count - success_count
);
Ok(results)
}
fn load_patterns<P: AsRef<Path>>(path: P) -> Result<Vec<String>, FerroError> {
let path = path.as_ref();
let file = File::open(path).map_err(|e| FerroError::Io {
msg: format!("Failed to open {}: {}", path.display(), e),
})?;
let reader: Box<dyn BufRead> = if path.extension().is_some_and(|e| e == "gz") {
Box::new(BufReader::new(flate2::read::GzDecoder::new(file)))
} else {
Box::new(BufReader::new(file))
};
let mut patterns = Vec::new();
for line in reader.lines() {
let line = line.map_err(|e| FerroError::Io {
msg: format!("Failed to read line: {}", e),
})?;
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
if line.contains('\t') {
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() > 6 {
let hgvs = fields[6].trim();
if !hgvs.is_empty() && hgvs != "-" && hgvs.contains(':') {
patterns.push(hgvs.to_string());
}
}
if fields.len() > 8 {
let hgvs = fields[8].trim();
if !hgvs.is_empty() && hgvs != "-" && hgvs.contains(':') {
patterns.push(hgvs.to_string());
}
}
} else if line.contains(':') {
patterns.push(line.to_string());
}
}
patterns.sort();
patterns.dedup();
Ok(patterns)
}
fn sample_patterns(patterns: &[String], size: usize, seed: u64) -> Vec<String> {
if let Ok(sampled) = stratified_sample_vec(patterns, size, seed, false) {
return sampled;
}
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
let mut sampled: Vec<String> = patterns.to_vec();
sampled.shuffle(&mut rng);
sampled.truncate(size);
sampled
}
fn run_ferro_normalize(
patterns: &[String],
reference_dir: &Option<std::path::PathBuf>,
error_mode: &str,
) -> Result<(Vec<ParseResult>, std::time::Duration), FerroError> {
let config = NormalizeConfig::lenient();
let normalizer: Normalizer<Box<dyn ReferenceProvider>> = match reference_dir {
Some(ref ref_dir) => {
let manifest_path = ref_dir.join("manifest.json");
if manifest_path.exists() {
let provider = MultiFastaProvider::from_manifest(&manifest_path)?;
Normalizer::with_config(Box::new(provider) as Box<dyn ReferenceProvider>, config)
} else if ref_dir.is_dir() {
let provider = MultiFastaProvider::from_directory(ref_dir)?;
Normalizer::with_config(Box::new(provider) as Box<dyn ReferenceProvider>, config)
} else {
Normalizer::with_config(
Box::new(MockProvider::with_test_data()) as Box<dyn ReferenceProvider>,
config,
)
}
}
None => Normalizer::with_config(
Box::new(MockProvider::with_test_data()) as Box<dyn ReferenceProvider>,
config,
),
};
let base_mode = match error_mode {
"lenient" => ErrorMode::Lenient,
"silent" => ErrorMode::Silent,
_ => ErrorMode::Strict,
};
let error_config = ErrorConfig::new(base_mode);
let preprocessor = InputPreprocessor::new(error_config);
let start = Instant::now();
let results: Vec<ParseResult> = patterns
.iter()
.map(|pattern| {
let preprocess_result = preprocessor.preprocess(pattern);
let preprocessed = if preprocess_result.success {
preprocess_result.preprocessed
} else {
pattern.clone()
};
match parse_hgvs_lenient(&preprocessed) {
Ok(parse_result) => {
match normalizer.normalize_with_warnings(&parse_result.result) {
Ok(norm_result) => {
let ref_mismatch = norm_result
.warnings
.iter()
.find(|w| w.code == "REFSEQ_MISMATCH")
.map(|w| RefMismatchInfo {
stated_ref: w.stated_ref.clone(),
actual_ref: w.actual_ref.clone(),
position: w.position.clone(),
corrected: w.corrected,
});
ParseResult {
input: pattern.clone(),
success: true,
output: Some(norm_result.result.to_string()),
error: None,
error_category: None,
ref_mismatch,
details: None,
}
}
Err(e) => ParseResult {
input: pattern.clone(),
success: false,
output: None,
error: Some(format!("{}", e)),
error_category: Some("normalize_error".to_string()),
ref_mismatch: None,
details: None,
},
}
}
Err(e) => ParseResult {
input: pattern.clone(),
success: false,
output: None,
error: Some(format!("{}", e)),
error_category: Some("parse_error".to_string()),
ref_mismatch: None,
details: None,
},
}
})
.collect();
let elapsed = start.elapsed();
Ok((results, elapsed))
}
fn run_ferro_parse(
patterns: &[String],
error_mode: &str,
) -> (Vec<ParseResult>, std::time::Duration) {
let base_mode = match error_mode {
"lenient" => ErrorMode::Lenient,
"silent" => ErrorMode::Silent,
_ => ErrorMode::Strict,
};
let error_config = ErrorConfig::new(base_mode);
let preprocessor = InputPreprocessor::new(error_config);
let start = Instant::now();
let results: Vec<ParseResult> = patterns
.iter()
.map(|pattern| {
let preprocess_result = preprocessor.preprocess(pattern);
let preprocessed = if preprocess_result.success {
preprocess_result.preprocessed
} else {
pattern.clone()
};
match parse_hgvs_lenient(&preprocessed) {
Ok(parse_result) => ParseResult {
input: pattern.clone(),
success: true,
output: Some(parse_result.result.to_string()),
error: None,
error_category: None,
ref_mismatch: None,
details: None,
},
Err(e) => ParseResult {
input: pattern.clone(),
success: false,
output: None,
error: Some(format!("{}", e)),
error_category: Some("parse_error".to_string()),
ref_mismatch: None,
details: None,
},
}
})
.collect();
let elapsed = start.elapsed();
(results, elapsed)
}
#[allow(clippy::type_complexity)]
pub fn run_mutalyzer_normalize_parallel(
patterns: &[String],
workers: usize,
settings_file: Option<&str>,
allow_network: bool,
) -> Result<
(
Vec<ParseResult>,
std::time::Duration,
HashMap<String, usize>,
),
FerroError,
> {
let temp_dir = tempfile::tempdir().map_err(|e| FerroError::Io {
msg: format!("Failed to create temp directory: {}", e),
})?;
let (cache_dir, manifest) = if let Some(settings_path) = settings_file {
let settings_path = Path::new(settings_path);
match parse_mutalyzer_settings(settings_path) {
Ok(cache_dir) => {
let manifest = load_shard_manifest(&cache_dir);
(Some(cache_dir), manifest)
}
Err(_) => (None, None),
}
} else {
(None, None)
};
if let (Some(cache_dir), Some(manifest)) = (&cache_dir, &manifest) {
return run_mutalyzer_normalize_presharded(
patterns,
workers,
cache_dir,
manifest.num_shards,
allow_network,
temp_dir.path(),
settings_file,
);
}
eprintln!("[normalize] Using runtime partitioning (no pre-sharded cache found)");
let shards = shard_patterns(patterns, workers);
let partitioned_settings = if let Some(cache_dir) = &cache_dir {
eprintln!(
"[partition] Partitioning cache from {} for {} workers",
cache_dir.display(),
workers
);
match partition_cache_for_workers(&shards, cache_dir, temp_dir.path()) {
Ok(paths) => Some(paths),
Err(e) => {
eprintln!("[partition] Warning: Failed to partition cache: {}", e);
eprintln!("[partition] Falling back to shared cache");
None
}
}
} else {
None
};
let mut shard_paths = Vec::new();
for (i, shard) in shards.iter().enumerate() {
let input_path = temp_dir.path().join(format!("shard_{}_input.txt", i));
let output_path = temp_dir.path().join(format!("shard_{}_output.json", i));
let mut file = File::create(&input_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create shard file: {}", e),
})?;
for pattern in shard {
writeln!(file, "{}", pattern).map_err(|e| FerroError::Io {
msg: format!("Failed to write pattern: {}", e),
})?;
}
shard_paths.push((input_path, output_path));
}
let start = Instant::now();
let mut children: Vec<(usize, std::process::Child)> = Vec::new();
for (i, (input, output)) in shard_paths.iter().enumerate() {
let mut cmd = std::process::Command::new("python3");
cmd.args(["-c", MUTALYZER_NORMALIZE_SCRIPT]);
cmd.arg(input.display().to_string());
cmd.arg(output.display().to_string());
if let Some(ref partitioned) = partitioned_settings {
cmd.arg(partitioned[i].display().to_string());
} else if let Some(settings) = settings_file {
cmd.arg(settings);
} else {
cmd.arg("");
}
cmd.arg(if allow_network { "true" } else { "false" });
match cmd.spawn() {
Ok(child) => children.push((i, child)),
Err(e) => {
cleanup_children(&mut children);
return Err(FerroError::Io {
msg: format!("Failed to spawn Python: {}", e),
});
}
}
}
let mut wait_error = None;
for (shard_id, child) in children.iter_mut() {
if wait_error.is_some() {
kill_and_wait(child, *shard_id);
continue;
}
match child.wait() {
Ok(status) if !status.success() => {
wait_error = Some(FerroError::Io {
msg: format!(
"Mutalyzer normalizer failed with exit code: {:?}",
status.code()
),
});
}
Err(e) => {
wait_error = Some(FerroError::Io {
msg: format!("Failed to wait for Python: {}", e),
});
}
_ => {}
}
}
if let Some(err) = wait_error {
return Err(err);
}
let elapsed = start.elapsed();
let mut all_results = Vec::new();
let mut aggregated_error_counts: HashMap<String, usize> = HashMap::new();
for (_, output_path) in &shard_paths {
let file = File::open(output_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open shard output: {}", e),
})?;
let reader = BufReader::new(file);
let output: MutalyzerOutput =
serde_json::from_reader(reader).map_err(|e| FerroError::Json {
msg: format!("Failed to parse shard output: {}", e),
})?;
all_results.extend(output.results);
for (category, count) in output.error_counts {
*aggregated_error_counts.entry(category).or_insert(0) += count;
}
}
Ok((all_results, elapsed, aggregated_error_counts))
}
#[allow(clippy::type_complexity)]
fn run_mutalyzer_normalize_presharded(
patterns: &[String],
workers: usize,
cache_dir: &Path,
num_shards: usize,
allow_network: bool,
temp_dir: &Path,
settings_file: Option<&str>,
) -> Result<
(
Vec<ParseResult>,
std::time::Duration,
HashMap<String, usize>,
),
FerroError,
> {
eprintln!(
"[normalize] Using pre-sharded cache ({} shards, {} workers, work-stealing)",
num_shards, workers
);
let shard_groups = group_patterns_by_shard(patterns, num_shards);
let shard_tasks: Vec<(usize, Vec<String>)> = shard_groups
.into_iter()
.enumerate()
.filter(|(_, g)| !g.is_empty())
.collect();
let effective_workers = workers.min(shard_tasks.len());
eprintln!(
"[normalize] {} non-empty shards queued for {} workers",
shard_tasks.len(),
effective_workers
);
let settings_path = settings_file
.map(PathBuf::from)
.unwrap_or_else(|| cache_dir.join("mutalyzer_settings.conf"));
let mut task_io: Vec<(usize, PathBuf, PathBuf, usize)> = Vec::new(); for (shard_id, shard_patterns) in &shard_tasks {
let input_path = temp_dir.join(format!("shard_{}_input.txt", shard_id));
let output_path = temp_dir.join(format!("shard_{}_output.json", shard_id));
let mut file = File::create(&input_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create input file: {}", e),
})?;
for pattern in shard_patterns {
writeln!(file, "{}", pattern).map_err(|e| FerroError::Io {
msg: format!("Failed to write pattern: {}", e),
})?;
}
task_io.push((*shard_id, input_path, output_path, shard_patterns.len()));
}
let start = Instant::now();
let mut task_queue = std::collections::VecDeque::from(task_io);
let mut active: Vec<(std::process::Child, usize, PathBuf, usize)> = Vec::new();
let mut completed_outputs: Vec<PathBuf> = Vec::new();
let mut shards_done = 0;
let total_shards = shard_tasks.len();
while active.len() < effective_workers {
if let Some((shard_id, input_path, output_path, count)) = task_queue.pop_front() {
match spawn_mutalyzer_worker(&input_path, &output_path, &settings_path, allow_network) {
Ok(child) => active.push((child, shard_id, output_path, count)),
Err(e) => {
cleanup_active_workers(&mut active);
return Err(e);
}
}
} else {
break;
}
}
while !active.is_empty() {
let mut progress_made = false;
let mut i = 0;
while i < active.len() {
match active[i].0.try_wait() {
Ok(Some(status)) => {
let (_, shard_id, output_path, count) = active.remove(i);
shards_done += 1;
progress_made = true;
if !status.success() {
cleanup_active_workers(&mut active);
return Err(FerroError::Io {
msg: format!(
"Mutalyzer normalizer failed on shard {} with exit code: {:?}",
shard_id,
status.code()
),
});
}
eprintln!(
"[normalize] Shard {} done ({} patterns) [{}/{}]",
shard_id, count, shards_done, total_shards
);
completed_outputs.push(output_path);
if let Some((next_shard, input_path, output_path, next_count)) =
task_queue.pop_front()
{
match spawn_mutalyzer_worker(
&input_path,
&output_path,
&settings_path,
allow_network,
) {
Ok(child) => {
active.push((child, next_shard, output_path, next_count));
}
Err(e) => {
cleanup_active_workers(&mut active);
return Err(e);
}
}
}
}
Ok(None) => {
i += 1;
}
Err(e) => {
cleanup_active_workers(&mut active);
return Err(FerroError::Io {
msg: format!("Failed to poll worker: {}", e),
});
}
}
}
if !active.is_empty() && !progress_made {
std::thread::sleep(std::time::Duration::from_millis(200));
}
}
let elapsed = start.elapsed();
let mut all_results = Vec::new();
let mut aggregated_error_counts: HashMap<String, usize> = HashMap::new();
for output_path in &completed_outputs {
let file = File::open(output_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open output: {}", e),
})?;
let reader = BufReader::new(file);
let output: MutalyzerOutput =
serde_json::from_reader(reader).map_err(|e| FerroError::Json {
msg: format!("Failed to parse output: {}", e),
})?;
all_results.extend(output.results);
for (category, count) in output.error_counts {
*aggregated_error_counts.entry(category).or_insert(0) += count;
}
}
Ok((all_results, elapsed, aggregated_error_counts))
}
fn kill_and_wait(child: &mut std::process::Child, shard_id: usize) {
if let Err(e) = child.kill() {
let is_already_exited = cfg!(unix) && e.raw_os_error() == Some(3);
if !is_already_exited {
eprintln!(
"[normalize] Warning: failed to kill worker for shard {}: {}",
shard_id, e
);
}
}
if let Err(e) = child.wait() {
eprintln!(
"[normalize] Warning: failed to wait on worker for shard {}: {}",
shard_id, e
);
}
}
fn cleanup_active_workers(active: &mut Vec<(std::process::Child, usize, PathBuf, usize)>) {
for (child, shard_id, _, _) in active.iter_mut() {
kill_and_wait(child, *shard_id);
}
active.clear();
}
fn cleanup_children(children: &mut Vec<(usize, std::process::Child)>) {
for (shard_id, child) in children.iter_mut() {
kill_and_wait(child, *shard_id);
}
children.clear();
}
fn spawn_mutalyzer_worker(
input_path: &Path,
output_path: &Path,
settings_path: &Path,
allow_network: bool,
) -> Result<std::process::Child, FerroError> {
let mut cmd = std::process::Command::new("python3");
cmd.args(["-c", MUTALYZER_NORMALIZE_SCRIPT]);
cmd.arg(input_path.display().to_string());
cmd.arg(output_path.display().to_string());
cmd.arg(settings_path.display().to_string());
cmd.arg(if allow_network { "true" } else { "false" });
cmd.spawn().map_err(|e| FerroError::Io {
msg: format!("Failed to spawn Python: {}", e),
})
}
#[allow(clippy::type_complexity)]
fn run_biocommons_normalize_parallel(
patterns: &[String],
workers: usize,
uta_db_url: Option<&str>,
) -> Result<
(
Vec<ParseResult>,
std::time::Duration,
HashMap<String, usize>,
),
FerroError,
> {
let temp_dir = tempfile::tempdir().map_err(|e| FerroError::Io {
msg: format!("Failed to create temp directory: {}", e),
})?;
let shards = shard_patterns(patterns, workers);
let mut shard_paths = Vec::new();
for (i, shard) in shards.iter().enumerate() {
let input_path = temp_dir.path().join(format!("shard_{}_input.txt", i));
let output_path = temp_dir.path().join(format!("shard_{}_output.json", i));
let mut file = File::create(&input_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create shard file: {}", e),
})?;
for pattern in shard {
writeln!(file, "{}", pattern).map_err(|e| FerroError::Io {
msg: format!("Failed to write pattern: {}", e),
})?;
}
shard_paths.push((input_path, output_path));
}
let start = Instant::now();
let mut children: Vec<(usize, std::process::Child)> = Vec::new();
for (i, (input, output)) in shard_paths.iter().enumerate() {
let mut cmd = std::process::Command::new("python3");
cmd.args(["-c", BIOCOMMONS_NORMALIZE_SCRIPT]);
cmd.arg(input.display().to_string());
cmd.arg(output.display().to_string());
cmd.arg(uta_db_url.unwrap_or(""));
match cmd.spawn() {
Ok(child) => children.push((i, child)),
Err(e) => {
cleanup_children(&mut children);
return Err(FerroError::Io {
msg: format!("Failed to spawn Python: {}", e),
});
}
}
}
let mut wait_error = None;
for (shard_id, child) in children.iter_mut() {
if wait_error.is_some() {
kill_and_wait(child, *shard_id);
continue;
}
match child.wait() {
Ok(status) if !status.success() => {
wait_error = Some(FerroError::Io {
msg: format!(
"biocommons/hgvs normalizer failed with exit code: {:?}",
status.code()
),
});
}
Err(e) => {
wait_error = Some(FerroError::Io {
msg: format!("Failed to wait for Python: {}", e),
});
}
_ => {}
}
}
if let Some(err) = wait_error {
return Err(err);
}
let elapsed = start.elapsed();
let mut all_results = Vec::new();
let mut aggregated_error_counts: HashMap<String, usize> = HashMap::new();
for (_, output_path) in &shard_paths {
let file = File::open(output_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open shard output: {}", e),
})?;
let reader = BufReader::new(file);
let output: MutalyzerOutput =
serde_json::from_reader(reader).map_err(|e| FerroError::Json {
msg: format!("Failed to parse shard output: {}", e),
})?;
all_results.extend(output.results);
for (category, count) in output.error_counts {
*aggregated_error_counts.entry(category).or_insert(0) += count;
}
}
Ok((all_results, elapsed, aggregated_error_counts))
}
fn run_mutalyzer_parse_parallel(
patterns: &[String],
workers: usize,
) -> Result<(Vec<ParseResult>, std::time::Duration), FerroError> {
let temp_dir = tempfile::tempdir().map_err(|e| FerroError::Io {
msg: format!("Failed to create temp directory: {}", e),
})?;
let shards = shard_patterns(patterns, workers);
let mut shard_paths = Vec::new();
for (i, shard) in shards.iter().enumerate() {
let input_path = temp_dir.path().join(format!("shard_{}_input.txt", i));
let output_path = temp_dir.path().join(format!("shard_{}_output.json", i));
let mut file = File::create(&input_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create shard file: {}", e),
})?;
for pattern in shard {
writeln!(file, "{}", pattern).map_err(|e| FerroError::Io {
msg: format!("Failed to write pattern: {}", e),
})?;
}
shard_paths.push((input_path, output_path));
}
let start = Instant::now();
let mut children: Vec<std::process::Child> = Vec::new();
for (input, output) in &shard_paths {
let mut cmd = std::process::Command::new("python3");
cmd.args(["-c", MUTALYZER_PARSE_SCRIPT]);
cmd.arg(input.display().to_string());
cmd.arg(output.display().to_string());
let child = cmd.spawn().map_err(|e| FerroError::Io {
msg: format!("Failed to spawn Python: {}", e),
})?;
children.push(child);
}
for mut child in children {
let status = child.wait().map_err(|e| FerroError::Io {
msg: format!("Failed to wait for Python: {}", e),
})?;
if !status.success() {
return Err(FerroError::Io {
msg: format!(
"Mutalyzer parser failed with exit code: {:?}",
status.code()
),
});
}
}
let elapsed = start.elapsed();
let mut all_results = Vec::new();
for (_, output_path) in &shard_paths {
let file = File::open(output_path).map_err(|e| FerroError::Io {
msg: format!("Failed to open shard output: {}", e),
})?;
let reader = BufReader::new(file);
let output: MutalyzerOutput =
serde_json::from_reader(reader).map_err(|e| FerroError::Json {
msg: format!("Failed to parse shard output: {}", e),
})?;
all_results.extend(output.results);
}
Ok((all_results, elapsed))
}
fn shard_patterns(patterns: &[String], num_shards: usize) -> Vec<Vec<String>> {
let mut shards: Vec<Vec<String>> = (0..num_shards).map(|_| Vec::new()).collect();
for (i, pattern) in patterns.iter().enumerate() {
shards[i % num_shards].push(pattern.clone());
}
shards
}
fn parse_mutalyzer_settings(settings_path: &Path) -> Result<PathBuf, FerroError> {
let content = std::fs::read_to_string(settings_path).map_err(|e| FerroError::Io {
msg: format!("Failed to read settings file: {}", e),
})?;
for line in content.lines() {
let line = line.trim();
if line.starts_with("MUTALYZER_CACHE_DIR") {
if let Some(value) = line.split('=').nth(1) {
return Ok(PathBuf::from(value.trim()));
}
}
}
Err(FerroError::Io {
msg: format!(
"MUTALYZER_CACHE_DIR not found in settings file: {}",
settings_path.display()
),
})
}
#[derive(Debug, serde::Deserialize)]
struct ShardManifest {
num_shards: usize,
#[allow(dead_code)]
total_accessions: usize,
#[allow(dead_code)]
shard_counts: Vec<usize>,
}
fn accession_to_shard(accession: &str, num_shards: usize) -> usize {
use std::hash::{Hash, Hasher};
let mut hasher = std::collections::hash_map::DefaultHasher::new();
accession.hash(&mut hasher);
(hasher.finish() as usize) % num_shards
}
fn load_shard_manifest(cache_dir: &Path) -> Option<ShardManifest> {
let manifest_path = cache_dir.join("shard_manifest.json");
if !manifest_path.exists() {
return None;
}
let file = File::open(&manifest_path).ok()?;
let reader = BufReader::new(file);
serde_json::from_reader(reader).ok()
}
fn group_patterns_by_shard(patterns: &[String], num_shards: usize) -> Vec<Vec<String>> {
let mut shard_groups: Vec<Vec<String>> = (0..num_shards).map(|_| Vec::new()).collect();
for pattern in patterns {
if let Some(accession) = extract_accession(pattern) {
let shard_id = accession_to_shard(accession, num_shards);
shard_groups[shard_id].push(pattern.clone());
} else {
shard_groups[0].push(pattern.clone());
}
}
shard_groups
}
fn partition_cache_for_workers(
shards: &[Vec<String>],
full_cache_dir: &Path,
work_dir: &Path,
) -> Result<Vec<PathBuf>, FerroError> {
let num_workers = shards.len();
let mut settings_paths = Vec::with_capacity(num_workers);
for (worker_id, shard) in shards.iter().enumerate() {
let mut accessions = HashSet::new();
for pattern in shard {
for acc in extract_accessions_from_pattern(pattern) {
accessions.insert(acc);
}
}
let worker_cache_dir = work_dir.join(format!("cache_worker_{}", worker_id));
std::fs::create_dir_all(&worker_cache_dir).map_err(|e| FerroError::Io {
msg: format!("Failed to create worker cache dir: {}", e),
})?;
for accession in &accessions {
for ext in &["sequence", "annotations"] {
let filename = format!("{}.{}", accession, ext);
let src = full_cache_dir.join(&filename);
let dst = worker_cache_dir.join(&filename);
if src.exists() && !dst.exists() {
#[cfg(unix)]
std::os::unix::fs::symlink(&src, &dst).map_err(|e| FerroError::Io {
msg: format!("Failed to create symlink {}: {}", dst.display(), e),
})?;
#[cfg(not(unix))]
std::fs::copy(&src, &dst).map_err(|e| FerroError::Io {
msg: format!("Failed to copy {}: {}", dst.display(), e),
})?;
}
}
}
let settings_path = work_dir.join(format!("mutalyzer_settings_{}.conf", worker_id));
let mut settings_file = File::create(&settings_path).map_err(|e| FerroError::Io {
msg: format!("Failed to create settings file: {}", e),
})?;
writeln!(
settings_file,
"MUTALYZER_CACHE_DIR = {}",
worker_cache_dir.display()
)
.map_err(|e| FerroError::Io {
msg: format!("Failed to write settings: {}", e),
})?;
writeln!(settings_file, "MUTALYZER_FILE_CACHE_ADD = false").map_err(|e| {
FerroError::Io {
msg: format!("Failed to write settings: {}", e),
}
})?;
settings_paths.push(settings_path);
eprintln!(
"[partition] Worker {} needs {} accessions, {} symlinks created",
worker_id,
accessions.len(),
accessions.len() * 2
);
}
Ok(settings_paths)
}
fn compute_agreement(
ferro_results: &[ParseResult],
muta_results: &[ParseResult],
) -> (AgreementStats, Vec<DisagreementExample>) {
let muta_map: HashMap<&str, &ParseResult> =
muta_results.iter().map(|r| (r.input.as_str(), r)).collect();
let mut both_success = 0usize;
let mut both_fail = 0usize;
let mut ferro_only_success = 0usize;
let mut mutalyzer_only_success = 0usize;
let mut agreements = 0usize;
let mut disagreement_examples = Vec::new();
for ferro_result in ferro_results {
if let Some(muta_result) = muta_map.get(ferro_result.input.as_str()) {
match (ferro_result.success, muta_result.success) {
(true, true) => {
both_success += 1;
let ferro_out = ferro_result.output.as_deref().unwrap_or("");
let muta_out = muta_result.output.as_deref().unwrap_or("");
if outputs_are_equivalent(ferro_out, muta_out) {
agreements += 1;
} else if disagreement_examples.len() < 50 {
disagreement_examples.push(DisagreementExample {
input: ferro_result.input.clone(),
ferro_output: ferro_result.output.clone().unwrap_or_default(),
mutalyzer_output: muta_result.output.clone().unwrap_or_default(),
});
}
}
(true, false) => ferro_only_success += 1,
(false, true) => mutalyzer_only_success += 1,
(false, false) => both_fail += 1,
}
}
}
let agreement_rate = if both_success > 0 {
agreements as f64 / both_success as f64
} else {
0.0
};
let stats = AgreementStats {
both_success,
both_fail,
ferro_only_success,
mutalyzer_only_success,
agreements,
agreement_rate,
disagreement_examples: disagreement_examples.clone(),
};
(stats, disagreement_examples)
}
fn compute_reference_stats(
ferro_results: &[ParseResult],
muta_results: &[ParseResult],
base_agreement: &AgreementStats,
) -> ReferenceStats {
let muta_map: HashMap<&str, &ParseResult> =
muta_results.iter().map(|r| (r.input.as_str(), r)).collect();
let patterns_with_corrections: Vec<String> = ferro_results
.iter()
.filter(|r| r.ref_mismatch.is_some())
.map(|r| r.input.clone())
.take(100) .collect();
let corrections_count = ferro_results
.iter()
.filter(|r| r.ref_mismatch.is_some())
.count();
let mut strict_both_success = 0usize;
let mut strict_agreements = 0usize;
for ferro_result in ferro_results {
if let Some(muta_result) = muta_map.get(ferro_result.input.as_str()) {
let ferro_strict_success = ferro_result.success && ferro_result.ref_mismatch.is_none();
if ferro_strict_success && muta_result.success {
strict_both_success += 1;
let ferro_out = ferro_result.output.as_deref().unwrap_or("");
let muta_out = muta_result.output.as_deref().unwrap_or("");
if outputs_are_equivalent(ferro_out, muta_out) {
strict_agreements += 1;
}
}
}
}
let strict_agreement_rate = if strict_both_success > 0 {
strict_agreements as f64 / strict_both_success as f64
} else {
0.0
};
ReferenceStats {
corrections_count,
strict_agreement_rate,
lenient_agreement_rate: base_agreement.agreement_rate,
patterns_with_corrections,
}
}
fn print_comparison_summary(result: &ComparisonResult) {
eprintln!();
eprintln!(
"COMPARISON RESULTS ({}, {} patterns)",
result.mode, result.sample_size
);
eprintln!("{}", "=".repeat(50));
eprintln!(
"ferro-hgvs: {:>6.2}s ({:>7.0} p/s) {}/{} success",
result.ferro.elapsed_seconds,
result.ferro.throughput,
result.ferro.successful,
result.sample_size
);
eprintln!(
"mutalyzer: {:>6.2}s ({:>7.0} p/s) {}/{} success",
result.mutalyzer.elapsed_seconds,
result.mutalyzer.throughput,
result.mutalyzer.successful,
result.sample_size
);
eprintln!();
eprintln!("Speedup: {:.0}x faster", result.speedup);
if let Some(ref ref_stats) = result.reference_stats {
if ref_stats.corrections_count > 0 {
eprintln!();
eprintln!("Reference Mismatch Handling:");
eprintln!(
" Patterns with ref mismatches: {}",
ref_stats.corrections_count
);
eprintln!(
" Ferro corrected (lenient): {}/{}",
ref_stats.corrections_count, ref_stats.corrections_count
);
eprintln!();
eprintln!("Agreement Analysis:");
eprintln!(
" Lenient mode (ferro corrects): {:>5}/{:<5} ({:.1}%)",
result.agreement.agreements,
result.agreement.both_success,
ref_stats.lenient_agreement_rate * 100.0
);
let strict_both_success = result
.agreement
.both_success
.saturating_sub(ref_stats.corrections_count);
let strict_agreements =
(strict_both_success as f64 * ref_stats.strict_agreement_rate).round() as usize;
eprintln!(
" Strict mode (both reject): {:>5}/{:<5} ({:.1}%) [fair comparison]",
strict_agreements,
strict_both_success,
ref_stats.strict_agreement_rate * 100.0
);
}
}
if result
.reference_stats
.as_ref()
.is_none_or(|s| s.corrections_count == 0)
{
eprintln!();
eprintln!("Agreement (where both succeeded):");
let different = result.agreement.both_success - result.agreement.agreements;
eprintln!(
" Same output: {:>5}/{:<5} ({:.1}%)",
result.agreement.agreements,
result.agreement.both_success,
result.agreement.agreement_rate * 100.0
);
eprintln!(
" Different output: {:>5}/{:<5} ({:.1}%)",
different,
result.agreement.both_success,
(1.0 - result.agreement.agreement_rate) * 100.0
);
}
eprintln!();
if let Some(ref ref_stats) = result.reference_stats {
if ref_stats.corrections_count > 0 {
eprintln!(
"Ferro-only success: {} ({} due to ref correction)",
result.agreement.ferro_only_success, ref_stats.corrections_count
);
} else {
eprintln!(
"Ferro-only success: {}",
result.agreement.ferro_only_success
);
}
} else {
eprintln!(
"Ferro-only success: {}",
result.agreement.ferro_only_success
);
}
eprintln!(
"Mutalyzer-only success: {}",
result.agreement.mutalyzer_only_success
);
eprintln!("Both failed: {}", result.agreement.both_fail);
if let Some(ref cache_stats) = result.cache_stats {
eprintln!();
eprintln!("Cache Coverage:");
eprintln!(" Unique accessions: {}", cache_stats.total_accessions);
eprintln!(" Ferro missing: {}", cache_stats.ferro_missing);
eprintln!(" Mutalyzer missing: {}", cache_stats.mutalyzer_missing);
if !cache_stats.mutalyzer_missing_examples.is_empty() {
eprintln!(
" Examples: {}",
cache_stats
.mutalyzer_missing_examples
.iter()
.take(5)
.cloned()
.collect::<Vec<_>>()
.join(", ")
);
}
}
if let Some(ref error_counts) = result.mutalyzer.error_counts {
if !error_counts.is_empty() {
eprintln!();
eprintln!("Mutalyzer Error Breakdown:");
let mut counts: Vec<_> = error_counts.iter().collect();
counts.sort_by(|a, b| b.1.cmp(a.1));
for (category, count) in counts.iter().take(10) {
eprintln!(" {:25} {:>5}", category, count);
}
}
}
if !result.differences.is_empty() {
eprintln!();
eprintln!("Sample differences:");
for (i, diff) in result.differences.iter().take(5).enumerate() {
eprintln!(" {}. Input: {}", i + 1, diff.input);
eprintln!(" ferro: {}", diff.ferro_output);
eprintln!(" mutalyzer: {}", diff.mutalyzer_output);
}
if result.differences.len() > 5 {
eprintln!(" ... and {} more", result.differences.len() - 5);
}
}
}
fn save_comparison_result<P: AsRef<Path>>(
result: &ComparisonResult,
path: P,
) -> Result<(), FerroError> {
let path = path.as_ref();
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).map_err(|e| FerroError::Io {
msg: format!("Failed to create directory: {}", e),
})?;
}
let file = File::create(path).map_err(|e| FerroError::Io {
msg: format!("Failed to create {}: {}", path.display(), e),
})?;
let writer = BufWriter::new(file);
serde_json::to_writer_pretty(writer, result).map_err(|e| FerroError::Io {
msg: format!("Failed to write JSON: {}", e),
})?;
eprintln!();
eprintln!("Results saved to: {}", path.display());
Ok(())
}
pub fn outputs_are_equivalent(ferro: &str, mutalyzer: &str) -> bool {
if ferro == mutalyzer {
return true;
}
if check_allelic_equivalence(ferro, mutalyzer) {
return true;
}
if check_delins_inv_equivalence(ferro, mutalyzer) {
return true;
}
if check_delins_split_equivalence(ferro, mutalyzer) {
return true;
}
if check_dup_repeat_equivalence(ferro, mutalyzer) {
return true;
}
if check_three_prime_shift_equivalence(ferro, mutalyzer) {
return true;
}
if check_repeat_vs_del_dup_equivalence(ferro, mutalyzer) {
return true;
}
if check_dup_vs_expanded_repeat_equivalence(ferro, mutalyzer) {
return true;
}
if check_delins_sequence_vs_position_equivalence(ferro, mutalyzer) {
return true;
}
if check_delins_vs_bracketed_position_equivalence(ferro, mutalyzer) {
return true;
}
if check_repeat_position_range_equivalence(ferro, mutalyzer) {
return true;
}
if check_single_element_allelic_equivalence(ferro, mutalyzer) {
return true;
}
if check_allelic_ordering_equivalence(ferro, mutalyzer) {
return true;
}
if check_ins_position_reference_equivalence(ferro, mutalyzer) {
return true;
}
if check_ins_vs_repeat_equivalence(ferro, mutalyzer) {
return true;
}
if check_inv_with_bases_equivalence(ferro, mutalyzer) {
return true;
}
if check_dup_vs_ins_equivalence(ferro, mutalyzer) {
return true;
}
if check_del_dup_vs_identity_equivalence(ferro, mutalyzer) {
return true;
}
if check_ins_sequence_vs_range_equivalence(ferro, mutalyzer) {
return true;
}
if check_repeat_one_vs_identity_equivalence(ferro, mutalyzer) {
return true;
}
if check_identity_substitution_equivalence(ferro, mutalyzer) {
return true;
}
if check_stop_codon_position_equivalence(ferro, mutalyzer) {
return true;
}
if check_utr_position_off_by_one_equivalence(ferro, mutalyzer) {
return true;
}
if check_delins_vs_complex_allelic_equivalence(ferro, mutalyzer) {
return true;
}
if check_cds_end_vs_utr_start_equivalence(ferro, mutalyzer) {
return true;
}
if check_ins_n_count_vs_expanded_equivalence(ferro, mutalyzer) {
return true;
}
if check_delins_simplification_equivalence(ferro, mutalyzer) {
return true;
}
if check_utr_larger_position_shift_equivalence(ferro, mutalyzer) {
return true;
}
false
}
fn check_allelic_equivalence(ferro: &str, mutalyzer: &str) -> bool {
if !mutalyzer.contains(":[c.[") && !mutalyzer.contains(":c.[") {
return false;
}
if !ferro.starts_with('[') || !ferro.ends_with(']') {
return false;
}
let muta_acc = mutalyzer.split(':').next().unwrap_or("");
if muta_acc.is_empty() {
return false;
}
let after_acc = mutalyzer.strip_prefix(muta_acc).unwrap_or("");
let after_acc = after_acc.strip_prefix(':').unwrap_or(after_acc);
let var_type = if after_acc.starts_with("c.[") {
"c."
} else if after_acc.starts_with("n.[") {
"n."
} else if after_acc.starts_with("g.[") {
"g."
} else {
return false;
};
let alleles_str = after_acc
.strip_prefix(var_type)
.and_then(|s| s.strip_prefix('['))
.and_then(|s| s.strip_suffix(']'));
let Some(alleles_str) = alleles_str else {
return false;
};
let muta_alleles: Vec<&str> = alleles_str.split(';').collect();
let ferro_inner = ferro.strip_prefix('[').and_then(|s| s.strip_suffix(']'));
let Some(ferro_inner) = ferro_inner else {
return false;
};
let ferro_parts: Vec<&str> = ferro_inner.split(';').collect();
if ferro_parts.len() != muta_alleles.len() {
return false;
}
let expected_prefix = format!("{}:{}", muta_acc, var_type);
for (ferro_part, muta_allele) in ferro_parts.iter().zip(muta_alleles.iter()) {
let ferro_allele = ferro_part.strip_prefix(&expected_prefix);
if ferro_allele != Some(*muta_allele) {
return false;
}
}
true
}
fn check_delins_inv_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (delins, inv) = if ferro.contains("delins") && mutalyzer.contains("inv") {
(ferro, mutalyzer)
} else if mutalyzer.contains("delins") && ferro.contains("inv") {
(mutalyzer, ferro)
} else {
return false;
};
let delins_pos = extract_position_range(delins);
let inv_pos = extract_position_range(inv);
if delins_pos.is_none() || inv_pos.is_none() {
return false;
}
delins_pos == inv_pos
}
fn check_delins_split_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (delins, allelic) = if ferro.contains("delins") && mutalyzer.contains("[") {
(ferro, mutalyzer)
} else if mutalyzer.contains("delins") && ferro.contains("[") {
(mutalyzer, ferro)
} else {
return false;
};
let has_operation = allelic.contains(">")
|| allelic.contains("del")
|| allelic.contains("ins")
|| allelic.contains("dup")
|| allelic.contains("inv");
if !has_operation {
return false;
}
let delins_acc = delins.split(':').next().unwrap_or("");
let allelic_acc = allelic.split(':').next().unwrap_or("");
if delins_acc != allelic_acc || delins_acc.is_empty() {
return false;
}
let Some(delins_range) = extract_position_range(delins) else {
return false;
};
let parse_position = |s: &str| -> Option<i64> {
let s = s.trim_start_matches('*');
s.parse::<i64>().ok()
};
let (delins_start, delins_end) = if let Some((start, end)) = delins_range.split_once('_') {
match (parse_position(start), parse_position(end)) {
(Some(s), Some(e)) => (s, e),
_ => return false,
}
} else {
match parse_position(delins_range) {
Some(pos) => (pos, pos),
None => return false,
}
};
let slack = 5i64;
let allelic_inner = allelic
.find(":c.[")
.or_else(|| allelic.find(":n.["))
.or_else(|| allelic.find(":g.["))
.map(|i| &allelic[i + 4..])
.and_then(|s| s.strip_suffix(']'));
let Some(allelic_inner) = allelic_inner else {
return false;
};
for variant in allelic_inner.split(';') {
let variant = variant.trim_start_matches('*');
let pos_end = variant
.find(|c: char| !c.is_ascii_digit() && c != '-')
.unwrap_or(variant.len());
if pos_end == 0 {
return false;
}
let Ok(pos) = variant[..pos_end].parse::<i64>() else {
return false;
};
if pos < delins_start - slack || pos > delins_end + slack {
return false;
}
}
true
}
fn extract_position_range(hgvs: &str) -> Option<&str> {
let after_type = hgvs
.find(":c.")
.or_else(|| hgvs.find(":n."))
.or_else(|| hgvs.find(":g."))
.map(|i| &hgvs[i + 3..])?;
let end = after_type
.find(|c: char| !c.is_ascii_digit() && c != '_' && c != '-' && c != '*')
.unwrap_or(after_type.len());
if end > 0 {
Some(&after_type[..end])
} else {
None
}
}
fn check_dup_repeat_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (dup, repeat) =
if ferro.contains("dup") && mutalyzer.contains('[') && !mutalyzer.contains("delins") {
(ferro, mutalyzer)
} else if mutalyzer.contains("dup") && ferro.contains('[') && !ferro.contains("delins") {
(mutalyzer, ferro)
} else {
return false;
};
if repeat.contains(';') {
return false;
}
let dup_pos = extract_position_range(dup);
let repeat_pos = extract_position_range(repeat);
if dup_pos.is_none() || repeat_pos.is_none() {
return false;
}
if dup_pos != repeat_pos {
return false;
}
repeat.ends_with("[2]")
}
fn check_three_prime_shift_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let ferro_acc = ferro.split(':').next().unwrap_or("");
let mutalyzer_acc = mutalyzer.split(':').next().unwrap_or("");
if ferro_acc != mutalyzer_acc || ferro_acc.is_empty() {
return false;
}
let ferro_type = extract_variant_type(ferro);
let mutalyzer_type = extract_variant_type(mutalyzer);
if ferro_type != mutalyzer_type || ferro_type.is_none() {
return false;
}
let ferro_edit = extract_edit_type(ferro);
let mutalyzer_edit = extract_edit_type(mutalyzer);
if ferro_edit != mutalyzer_edit || ferro_edit.is_none() {
return false;
}
let ferro_positions = extract_position_values(ferro);
let mutalyzer_positions = extract_position_values(mutalyzer);
let (Some((ferro_start, ferro_end)), Some((muta_start, muta_end))) =
(ferro_positions, mutalyzer_positions)
else {
return false;
};
let start_delta = ferro_start - muta_start;
let end_delta = ferro_end - muta_end;
if start_delta != end_delta {
return false;
}
let shift = start_delta.abs();
shift > 0 && shift <= 20
}
fn extract_variant_type(hgvs: &str) -> Option<char> {
if hgvs.contains(":c.") {
Some('c')
} else if hgvs.contains(":n.") {
Some('n')
} else if hgvs.contains(":g.") {
Some('g')
} else {
None
}
}
fn extract_edit_type(hgvs: &str) -> Option<&'static str> {
if hgvs.contains("delins") {
Some("delins")
} else if hgvs.contains("dup") {
Some("dup")
} else if hgvs.contains("del") {
Some("del")
} else if hgvs.contains("ins") {
Some("ins")
} else {
None
}
}
fn extract_position_values(hgvs: &str) -> Option<(i64, i64)> {
let pos_str = extract_position_range(hgvs)?;
if pos_str.contains('_') {
let parts: Vec<&str> = pos_str.split('_').collect();
if parts.len() == 2 {
let start: i64 = parts[0].parse().ok()?;
let end: i64 = parts[1].parse().ok()?;
Some((start, end))
} else {
None
}
} else {
let pos: i64 = pos_str.parse().ok()?;
Some((pos, pos))
}
}
fn check_repeat_vs_del_dup_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (repeat, del_dup) = if has_repeat_with_sequence(ferro) && !has_repeat_notation(mutalyzer) {
(ferro, mutalyzer)
} else if has_repeat_with_sequence(mutalyzer) && !has_repeat_notation(ferro) {
(mutalyzer, ferro)
} else {
return false;
};
if !del_dup.contains("del") && !del_dup.contains("dup") {
return false;
}
if del_dup.contains("delins") {
return false;
}
let repeat_acc = repeat.split(':').next().unwrap_or("");
let del_dup_acc = del_dup.split(':').next().unwrap_or("");
if repeat_acc != del_dup_acc || repeat_acc.is_empty() {
return false;
}
let repeat_type = extract_variant_type(repeat);
let del_dup_type = extract_variant_type(del_dup);
if repeat_type != del_dup_type || repeat_type.is_none() {
return false;
}
true
}
fn has_repeat_with_sequence(s: &str) -> bool {
if let Some(bracket_pos) = s.rfind('[') {
if let Some(close_pos) = s.rfind(']') {
if close_pos > bracket_pos {
let inside = &s[bracket_pos + 1..close_pos];
if inside.chars().all(|c| c.is_ascii_digit()) && !inside.is_empty() {
if bracket_pos >= 2 {
let before = &s[..bracket_pos];
if let Some(last_char) = before.chars().last() {
if "ACGT".contains(last_char) {
return true;
}
}
}
}
}
}
}
false
}
fn has_repeat_notation(s: &str) -> bool {
if let Some(bracket_pos) = s.rfind('[') {
if let Some(close_pos) = s.rfind(']') {
if close_pos > bracket_pos {
let inside = &s[bracket_pos + 1..close_pos];
if inside.chars().all(|c| c.is_ascii_digit()) && !inside.is_empty() {
if bracket_pos > 0 {
let before = &s[..bracket_pos];
if !before.ends_with(';') && !before.ends_with('.') {
return true;
}
}
}
}
}
}
false
}
fn check_dup_vs_expanded_repeat_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (dup_str, repeat_str) = if ferro.contains("dup")
&& !ferro.contains('[')
&& has_expanded_repeat_with_sequence(mutalyzer)
{
(ferro, mutalyzer)
} else if mutalyzer.contains("dup")
&& !mutalyzer.contains('[')
&& has_expanded_repeat_with_sequence(ferro)
{
(mutalyzer, ferro)
} else {
return false;
};
let dup_acc = dup_str.split(':').next().unwrap_or("");
let repeat_acc = repeat_str.split(':').next().unwrap_or("");
if dup_acc != repeat_acc || dup_acc.is_empty() {
return false;
}
let dup_type = extract_variant_type(dup_str);
let repeat_type = extract_variant_type(repeat_str);
if dup_type != repeat_type || dup_type.is_none() {
return false;
}
true
}
fn has_expanded_repeat_with_sequence(s: &str) -> bool {
if let Some(bracket_pos) = s.rfind('[') {
if let Some(close_pos) = s.rfind(']') {
if close_pos > bracket_pos {
let inside = &s[bracket_pos + 1..close_pos];
if inside.chars().all(|c| c.is_ascii_digit()) && !inside.is_empty() {
if bracket_pos >= 2 {
let before = &s[..bracket_pos];
let last_chars: String = before.chars().rev().take(3).collect();
if last_chars.chars().all(|c| "ACGT".contains(c)) {
return true;
}
}
}
}
}
}
false
}
fn check_delins_sequence_vs_position_equivalence(ferro: &str, mutalyzer: &str) -> bool {
if !ferro.contains("delins") || !mutalyzer.contains("delins") {
return false;
}
let ferro_acc = ferro.split(':').next().unwrap_or("");
let mutalyzer_acc = mutalyzer.split(':').next().unwrap_or("");
if ferro_acc != mutalyzer_acc || ferro_acc.is_empty() {
return false;
}
let ferro_delins = ferro.split("delins").nth(1).unwrap_or("");
let mutalyzer_delins = mutalyzer.split("delins").nth(1).unwrap_or("");
let ferro_is_seq = ferro_delins.chars().all(|c| c.is_ascii_alphabetic());
let mutalyzer_is_seq = mutalyzer_delins.chars().all(|c| c.is_ascii_alphabetic());
let ferro_is_pos = ferro_delins.contains('_')
&& ferro_delins
.chars()
.all(|c| c.is_ascii_digit() || c == '_' || c == '-' || c == '*' || c == '+');
let mutalyzer_is_pos = mutalyzer_delins.contains('_')
&& mutalyzer_delins
.chars()
.all(|c| c.is_ascii_digit() || c == '_' || c == '-' || c == '*' || c == '+');
(ferro_is_seq && mutalyzer_is_pos) || (mutalyzer_is_seq && ferro_is_pos)
}
fn check_delins_vs_bracketed_position_equivalence(ferro: &str, mutalyzer: &str) -> bool {
if !ferro.contains("delins") || !mutalyzer.contains("delins") {
return false;
}
let (simple, bracketed) = if !ferro.contains("delins[") && mutalyzer.contains("delins[") {
(ferro, mutalyzer)
} else if !mutalyzer.contains("delins[") && ferro.contains("delins[") {
(mutalyzer, ferro)
} else {
return false;
};
let simple_acc = simple.split(':').next().unwrap_or("");
let bracketed_acc = bracketed.split(':').next().unwrap_or("");
if simple_acc != bracketed_acc || simple_acc.is_empty() {
return false;
}
let simple_range = extract_position_range(simple);
let bracketed_range = extract_position_range(bracketed);
if simple_range != bracketed_range {
return false;
}
bracketed.contains("delins[") && bracketed.contains(';')
}
fn check_repeat_position_range_equivalence(ferro: &str, mutalyzer: &str) -> bool {
if !has_repeat_with_sequence(ferro) || !has_repeat_with_sequence(mutalyzer) {
return false;
}
let ferro_acc = ferro.split(':').next().unwrap_or("");
let mutalyzer_acc = mutalyzer.split(':').next().unwrap_or("");
if ferro_acc != mutalyzer_acc || ferro_acc.is_empty() {
return false;
}
let ferro_type = extract_variant_type(ferro);
let mutalyzer_type = extract_variant_type(mutalyzer);
if ferro_type != mutalyzer_type || ferro_type.is_none() {
return false;
}
let ferro_repeat = extract_repeat_info(ferro);
let mutalyzer_repeat = extract_repeat_info(mutalyzer);
match (ferro_repeat, mutalyzer_repeat) {
(Some((ferro_seq, ferro_count)), Some((muta_seq, muta_count))) => {
ferro_seq == muta_seq && ferro_count == muta_count
}
_ => false,
}
}
fn extract_repeat_info(s: &str) -> Option<(&str, u32)> {
let bracket_pos = s.rfind('[')?;
let close_pos = s.rfind(']')?;
if close_pos <= bracket_pos {
return None;
}
let count_str = &s[bracket_pos + 1..close_pos];
let count: u32 = count_str.parse().ok()?;
let before = &s[..bracket_pos];
let mut seq_start = bracket_pos;
for (i, c) in before.char_indices().rev() {
if "ACGT".contains(c) {
seq_start = i;
} else {
break;
}
}
if seq_start < bracket_pos {
let seq = &s[seq_start..bracket_pos];
Some((seq, count))
} else {
None
}
}
fn check_single_element_allelic_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (bracketed, unbracketed) = if ferro.starts_with('[')
&& ferro.ends_with(']')
&& !ferro.contains(';')
&& !mutalyzer.starts_with('[')
{
(ferro, mutalyzer)
} else if mutalyzer.starts_with('[')
&& mutalyzer.ends_with(']')
&& !mutalyzer.contains(';')
&& !ferro.starts_with('[')
{
(mutalyzer, ferro)
} else {
return false;
};
let inner = bracketed
.strip_prefix('[')
.and_then(|s| s.strip_suffix(']'));
match inner {
Some(inner) => inner == unbracketed,
None => false,
}
}
fn check_allelic_ordering_equivalence(ferro: &str, mutalyzer: &str) -> bool {
if !mutalyzer.contains(":c.[") && !mutalyzer.contains(":n.[") && !mutalyzer.contains(":g.[") {
return false;
}
if !ferro.starts_with('[') || !ferro.ends_with(']') {
return false;
}
let muta_acc = mutalyzer.split(':').next().unwrap_or("");
if muta_acc.is_empty() {
return false;
}
let var_type = if mutalyzer.contains(":c.[") {
"c."
} else if mutalyzer.contains(":n.[") {
"n."
} else if mutalyzer.contains(":g.[") {
"g."
} else {
return false;
};
let Some(muta_alleles_start) = mutalyzer.find(&format!("{}[", var_type)) else {
return false;
};
let muta_alleles_str = &mutalyzer[muta_alleles_start + var_type.len() + 1..];
let Some(muta_alleles_str) = muta_alleles_str.strip_suffix(']') else {
return false;
};
let mut muta_alleles: Vec<&str> = muta_alleles_str.split(';').collect();
let Some(ferro_inner) = ferro.strip_prefix('[').and_then(|s| s.strip_suffix(']')) else {
return false;
};
let ferro_parts: Vec<&str> = ferro_inner.split(';').collect();
if ferro_parts.len() != muta_alleles.len() {
return false;
}
let expected_prefix = format!("{}:{}", muta_acc, var_type);
let mut ferro_alleles: Vec<&str> = Vec::new();
for part in &ferro_parts {
let Some(allele) = part.strip_prefix(&expected_prefix) else {
return false;
};
ferro_alleles.push(allele);
}
muta_alleles.sort();
ferro_alleles.sort();
muta_alleles == ferro_alleles
}
fn check_ins_position_reference_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (explicit, reference) =
if ferro.contains("ins") && !ferro.contains("ins[") && mutalyzer.contains("ins[") {
(ferro, mutalyzer)
} else if mutalyzer.contains("ins") && !mutalyzer.contains("ins[") && ferro.contains("ins[")
{
(mutalyzer, ferro)
} else {
return false;
};
let explicit_acc = explicit.split(':').next().unwrap_or("");
let reference_acc = reference.split(':').next().unwrap_or("");
if explicit_acc != reference_acc || explicit_acc.is_empty() {
return false;
}
let explicit_type = extract_variant_type(explicit);
let reference_type = extract_variant_type(reference);
if explicit_type != reference_type || explicit_type.is_none() {
return false;
}
let explicit_pos = extract_ins_positions(explicit);
let reference_pos = extract_ins_positions(reference);
explicit_pos.is_some() && explicit_pos == reference_pos
}
fn extract_ins_positions(s: &str) -> Option<(i64, i64)> {
let ins_pos = s.find("ins")?;
let before_ins = &s[..ins_pos];
let pos_start = before_ins.rfind('.')?;
let pos_str = &before_ins[pos_start + 1..];
if let Some(underscore) = pos_str.find('_') {
let start: i64 = pos_str[..underscore].parse().ok()?;
let end: i64 = pos_str[underscore + 1..].parse().ok()?;
Some((start, end))
} else {
let pos: i64 = pos_str.parse().ok()?;
Some((pos, pos))
}
}
fn check_ins_vs_repeat_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (ins_str, repeat_str) =
if ferro.contains("ins") && !has_repeat_notation(ferro) && has_repeat_notation(mutalyzer) {
(ferro, mutalyzer)
} else if mutalyzer.contains("ins")
&& !has_repeat_notation(mutalyzer)
&& has_repeat_notation(ferro)
{
(mutalyzer, ferro)
} else {
return false;
};
let ins_acc = ins_str.split(':').next().unwrap_or("");
let repeat_acc = repeat_str.split(':').next().unwrap_or("");
if ins_acc != repeat_acc || ins_acc.is_empty() {
return false;
}
let ins_type = extract_variant_type(ins_str);
let repeat_type = extract_variant_type(repeat_str);
if ins_type != repeat_type || ins_type.is_none() {
return false;
}
true
}
fn check_inv_with_bases_equivalence(ferro: &str, mutalyzer: &str) -> bool {
if !ferro.contains("inv") || !mutalyzer.contains("inv") {
return false;
}
let ferro_acc = ferro.split(':').next().unwrap_or("");
let muta_acc = mutalyzer.split(':').next().unwrap_or("");
if ferro_acc != muta_acc || ferro_acc.is_empty() {
return false;
}
let Some(ferro_inv_pos) = ferro.find("inv") else {
return false;
};
let Some(muta_inv_pos) = mutalyzer.find("inv") else {
return false;
};
let ferro_before = &ferro[..ferro_inv_pos];
let muta_before = &mutalyzer[..muta_inv_pos];
let Some(ferro_pos_start) = ferro_before.rfind('.') else {
return false;
};
let Some(muta_pos_start) = muta_before.rfind('.') else {
return false;
};
let ferro_positions = &ferro_before[ferro_pos_start + 1..];
let muta_positions = &muta_before[muta_pos_start + 1..];
if ferro_positions != muta_positions {
return false;
}
let ferro_after_inv = &ferro[ferro_inv_pos + 3..];
let muta_after_inv = &mutalyzer[muta_inv_pos + 3..];
let ferro_has_bases = ferro_after_inv.chars().any(|c| "ACGT".contains(c));
let muta_has_bases = muta_after_inv.chars().any(|c| "ACGT".contains(c));
ferro_has_bases != muta_has_bases
}
fn check_dup_vs_ins_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (dup_str, ins_str) = if ferro.contains("dup") && mutalyzer.contains("ins") {
(ferro, mutalyzer)
} else if mutalyzer.contains("dup") && ferro.contains("ins") {
(mutalyzer, ferro)
} else {
return false;
};
if dup_str.contains("del") || ins_str.contains("del") {
return false;
}
let dup_acc = dup_str.split(':').next().unwrap_or("");
let ins_acc = ins_str.split(':').next().unwrap_or("");
if dup_acc != ins_acc || dup_acc.is_empty() {
return false;
}
let Some(dup_pos) = dup_str.find("dup") else {
return false;
};
let dup_before = &dup_str[..dup_pos];
let Some(dup_pos_start) = dup_before.rfind('.') else {
return false;
};
let dup_positions = &dup_before[dup_pos_start + 1..];
let Some(ins_pos) = ins_str.find("ins") else {
return false;
};
let ins_before = &ins_str[..ins_pos];
let Some(ins_pos_start) = ins_before.rfind('.') else {
return false;
};
let ins_positions = &ins_before[ins_pos_start + 1..];
let ins_seq = &ins_str[ins_pos + 3..];
if !dup_positions.contains('_') && ins_positions.contains('_') {
let Ok(dup_p) = parse_position(dup_positions) else {
return false;
};
let Some((ins_start, ins_end)) = parse_position_range(ins_positions) else {
return false;
};
let adjacent = ins_start == dup_p && ins_end == dup_p + 1;
if adjacent && ins_seq.len() == 1 {
return true;
}
}
if dup_positions.contains('_') && ins_positions.contains('_') {
let Some((dup_start, dup_end)) = parse_position_range(dup_positions) else {
return false;
};
let Some((ins_start, ins_end)) = parse_position_range(ins_positions) else {
return false;
};
let dup_len = (dup_end - dup_start + 1).unsigned_abs() as usize;
let ins_seq_len = ins_seq.chars().filter(|c| c.is_ascii_alphabetic()).count();
let adjacent_before = ins_end == dup_start && ins_start == dup_start - 1;
let adjacent_after = ins_start == dup_end && ins_end == dup_end + 1;
if (adjacent_before || adjacent_after) && ins_seq_len == dup_len {
return true;
}
}
false
}
fn check_del_dup_vs_identity_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (identity, variant) = if mutalyzer.contains("=") && !ferro.contains("=") {
(mutalyzer, ferro)
} else if ferro.contains("=") && !mutalyzer.contains("=") {
(ferro, mutalyzer)
} else {
return false;
};
if !variant.contains("del") && !variant.contains("dup") {
return false;
}
if variant.contains("delins") || variant.contains("ins") {
return false;
}
let identity_acc = identity.split(':').next().unwrap_or("");
let variant_acc = variant.split(':').next().unwrap_or("");
if identity_acc != variant_acc || identity_acc.is_empty() {
return false;
}
let identity_type = extract_variant_type(identity);
let variant_type = extract_variant_type(variant);
if identity_type != variant_type || identity_type.is_none() {
return false;
}
true
}
fn check_ins_sequence_vs_range_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (seq_str, range_str) = if ferro.contains("ins")
&& !ferro.contains("delins")
&& mutalyzer.contains("ins")
&& !mutalyzer.contains("delins")
{
let ferro_has_seq = ferro
.find("ins")
.and_then(|i| ferro.chars().nth(i + 3))
.is_some_and(|c| c.is_ascii_alphabetic());
let muta_has_range = mutalyzer
.find("ins")
.and_then(|i| mutalyzer.chars().nth(i + 3))
.is_some_and(|c| c.is_ascii_digit() || c == '-');
if ferro_has_seq && muta_has_range {
(ferro, mutalyzer)
} else {
let muta_has_seq = mutalyzer
.find("ins")
.and_then(|i| mutalyzer.chars().nth(i + 3))
.is_some_and(|c| c.is_ascii_alphabetic());
let ferro_has_range = ferro
.find("ins")
.and_then(|i| ferro.chars().nth(i + 3))
.is_some_and(|c| c.is_ascii_digit() || c == '-');
if muta_has_seq && ferro_has_range {
(mutalyzer, ferro)
} else {
return false;
}
}
} else {
return false;
};
let seq_acc = seq_str.split(':').next().unwrap_or("");
let range_acc = range_str.split(':').next().unwrap_or("");
if seq_acc != range_acc || seq_acc.is_empty() {
return false;
}
let Some(seq_ins_pos) = seq_str.find("ins") else {
return false;
};
let seq_before = &seq_str[..seq_ins_pos];
let Some(seq_pos_start) = seq_before.rfind('.') else {
return false;
};
let seq_positions = &seq_before[seq_pos_start + 1..];
let Some(range_ins_pos) = range_str.find("ins") else {
return false;
};
let range_before = &range_str[..range_ins_pos];
let Some(range_pos_start) = range_before.rfind('.') else {
return false;
};
let range_positions = &range_before[range_pos_start + 1..];
if seq_positions != range_positions {
return false;
}
true
}
fn check_repeat_one_vs_identity_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (repeat_str, identity_str) = if ferro.contains("[1]") && mutalyzer.contains("=") {
(ferro, mutalyzer)
} else if mutalyzer.contains("[1]") && ferro.contains("=") {
(mutalyzer, ferro)
} else {
return false;
};
let repeat_acc = repeat_str.split(':').next().unwrap_or("");
let identity_acc = identity_str.split(':').next().unwrap_or("");
if repeat_acc != identity_acc || repeat_acc.is_empty() {
return false;
}
let repeat_type = extract_variant_type(repeat_str);
let identity_type = extract_variant_type(identity_str);
if repeat_type != identity_type || repeat_type.is_none() {
return false;
}
true
}
fn check_identity_substitution_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (subst_str, identity_str) = if ferro.contains(">") && mutalyzer.contains("=") {
(ferro, mutalyzer)
} else if mutalyzer.contains(">") && ferro.contains("=") {
(mutalyzer, ferro)
} else {
return false;
};
let Some(gt_pos) = subst_str.find('>') else {
return false;
};
if gt_pos == 0 || gt_pos + 1 >= subst_str.len() {
return false;
}
let before = subst_str.chars().nth(gt_pos - 1);
let after = subst_str.chars().nth(gt_pos + 1);
match (before, after) {
(Some(b), Some(a)) if b == a && "ACGT".contains(b) => {}
_ => return false,
}
let subst_acc = subst_str.split(':').next().unwrap_or("");
let identity_acc = identity_str.split(':').next().unwrap_or("");
if subst_acc != identity_acc || subst_acc.is_empty() {
return false;
}
true
}
fn check_stop_codon_position_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (cds_str, utr_str) = if ferro.contains(":c.*1") && !mutalyzer.contains(":c.*") {
(mutalyzer, ferro)
} else if mutalyzer.contains(":c.*1") && !ferro.contains(":c.*") {
(ferro, mutalyzer)
} else {
return false;
};
let cds_acc = cds_str.split(':').next().unwrap_or("");
let utr_acc = utr_str.split(':').next().unwrap_or("");
if cds_acc != utr_acc || cds_acc.is_empty() {
return false;
}
let cds_op = cds_str.find('>').map(|i| &cds_str[i - 1..]);
let utr_op = utr_str.find('>').map(|i| &utr_str[i - 1..]);
matches!((cds_op, utr_op), (Some(c), Some(u)) if c == u)
}
fn check_utr_position_off_by_one_equivalence(ferro: &str, mutalyzer: &str) -> bool {
if !ferro.contains(":c.*") || !mutalyzer.contains(":c.*") {
return false;
}
let ferro_acc = ferro.split(':').next().unwrap_or("");
let muta_acc = mutalyzer.split(':').next().unwrap_or("");
if ferro_acc != muta_acc || ferro_acc.is_empty() {
return false;
}
let ferro_has_del = ferro.contains("del") && !ferro.contains("delins");
let ferro_has_dup = ferro.contains("dup");
let muta_has_del = mutalyzer.contains("del") && !mutalyzer.contains("delins");
let muta_has_dup = mutalyzer.contains("dup");
if !((ferro_has_del && muta_has_del) || (ferro_has_dup && muta_has_dup)) {
return false;
}
let extract_utr_pos = |s: &str| -> Option<(i64, Option<i64>)> {
let star_pos = s.find("*")?;
let after_star = &s[star_pos + 1..];
let end_pos = after_star
.find(|c: char| !c.is_ascii_digit() && c != '_' && c != '*')
.unwrap_or(after_star.len());
let pos_str = &after_star[..end_pos];
if pos_str.contains('_') {
let parts: Vec<&str> = pos_str.split('_').collect();
if parts.len() == 2 {
let start = parts[0].trim_start_matches('*').parse::<i64>().ok()?;
let end = parts[1].trim_start_matches('*').parse::<i64>().ok()?;
Some((start, Some(end)))
} else {
None
}
} else {
let pos = pos_str.parse::<i64>().ok()?;
Some((pos, None))
}
};
let ferro_pos = extract_utr_pos(ferro);
let muta_pos = extract_utr_pos(mutalyzer);
match (ferro_pos, muta_pos) {
(Some((f_start, f_end)), Some((m_start, m_end))) => {
let diff = (f_start - m_start).abs();
if diff != 1 {
return false;
}
match (f_end, m_end) {
(None, None) => true,
(Some(fe), Some(me)) => (fe - me).abs() == 1,
_ => false,
}
}
_ => false,
}
}
fn check_delins_vs_complex_allelic_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (delins_str, allelic_str) =
if ferro.contains("delins") && mutalyzer.contains("[") && mutalyzer.contains(";") {
(ferro, mutalyzer)
} else if mutalyzer.contains("delins") && ferro.contains("[") && ferro.contains(";") {
(mutalyzer, ferro)
} else {
return false;
};
let delins_acc = delins_str.split(':').next().unwrap_or("");
let allelic_acc = allelic_str.split(':').next().unwrap_or("");
let allelic_acc_clean = if allelic_acc.starts_with('[') {
allelic_str
.trim_start_matches('[')
.split(':')
.next()
.unwrap_or("")
} else {
allelic_acc
};
if delins_acc != allelic_acc_clean || delins_acc.is_empty() {
return false;
}
let has_operations = allelic_str.contains("ins") || allelic_str.contains("del");
if !has_operations {
return false;
}
true
}
fn parse_position(s: &str) -> Result<i64, std::num::ParseIntError> {
if let Some(rest) = s.strip_prefix('*') {
rest.parse::<i64>()
} else {
s.parse::<i64>()
}
}
fn parse_position_range(s: &str) -> Option<(i64, i64)> {
let (start_str, end_str) = s.split_once('_')?;
let start = parse_position(start_str).ok()?;
let end = parse_position(end_str).ok()?;
Some((start, end))
}
fn check_cds_end_vs_utr_start_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (cds_str, utr_str) = if ferro.contains("*") && !mutalyzer.contains("*") {
(mutalyzer, ferro)
} else if mutalyzer.contains("*") && !ferro.contains("*") {
(ferro, mutalyzer)
} else {
return false;
};
let cds_acc = cds_str.split(':').next().unwrap_or("");
let utr_acc = utr_str.split(':').next().unwrap_or("");
if cds_acc != utr_acc || cds_acc.is_empty() {
return false;
}
let has_del = cds_str.contains("del") && utr_str.contains("del");
if !has_del {
return false;
}
if !utr_str.contains("_*1") && !utr_str.contains("_*del") {
return false;
}
true
}
fn check_ins_n_count_vs_expanded_equivalence(ferro: &str, mutalyzer: &str) -> bool {
let (count_str, expanded_str) = if ferro.contains("N[") && !mutalyzer.contains("N[") {
(ferro, mutalyzer)
} else if mutalyzer.contains("N[") && !ferro.contains("N[") {
(mutalyzer, ferro)
} else {
return false;
};
let count_acc = count_str.split(':').next().unwrap_or("");
let expanded_acc = expanded_str.split(':').next().unwrap_or("");
if count_acc != expanded_acc || count_acc.is_empty() {
return false;
}
if !count_str.contains("ins") || !expanded_str.contains("ins") {
return false;
}
if !expanded_str.contains("NN") {
return false;
}
if let Some(n_start) = count_str.find("N[") {
if let Some(n_end) = count_str[n_start..].find(']') {
let count_str_part = &count_str[n_start + 2..n_start + n_end];
if let Ok(count) = count_str_part.parse::<usize>() {
let n_count = expanded_str.chars().filter(|&c| c == 'N').count();
if n_count >= count / 2 {
return true;
}
}
}
}
false
}
fn check_delins_simplification_equivalence(ferro: &str, mutalyzer: &str) -> bool {
if !ferro.contains("delins") || !mutalyzer.contains("delins") {
return false;
}
let ferro_acc = ferro.split(':').next().unwrap_or("");
let muta_acc = mutalyzer.split(':').next().unwrap_or("");
if ferro_acc != muta_acc || ferro_acc.is_empty() {
return false;
}
fn extract_delins(s: &str) -> Option<(i64, i64, &str)> {
let delins_pos = s.find("delins")?;
let before = &s[..delins_pos];
let after = &s[delins_pos + 6..];
let dot_pos = before.rfind('.')?;
let pos_str = &before[dot_pos + 1..];
if let Some((start, end)) = pos_str.split_once('_') {
let start_pos = start.parse::<i64>().ok()?;
let end_pos = end.parse::<i64>().ok()?;
Some((start_pos, end_pos, after))
} else {
let pos = pos_str.parse::<i64>().ok()?;
Some((pos, pos, after))
}
}
let Some((f_start, f_end, f_seq)) = extract_delins(ferro) else {
return false;
};
let Some((m_start, m_end, m_seq)) = extract_delins(mutalyzer) else {
return false;
};
let positions_close = (f_start - m_start).abs() <= 2 && (f_end - m_end).abs() <= 2;
if !positions_close {
return false;
}
let shorter = f_seq.len().min(m_seq.len());
if shorter == 0 {
return false;
}
let shared_prefix = f_seq
.chars()
.zip(m_seq.chars())
.take_while(|(a, b)| a == b)
.count();
if shared_prefix == 0 {
return false;
}
shared_prefix >= shorter.saturating_sub(2)
}
fn check_utr_larger_position_shift_equivalence(ferro: &str, mutalyzer: &str) -> bool {
if !ferro.contains("c.*") || !mutalyzer.contains("c.*") {
return false;
}
let ferro_acc = ferro.split(':').next().unwrap_or("");
let muta_acc = mutalyzer.split(':').next().unwrap_or("");
if ferro_acc != muta_acc || ferro_acc.is_empty() {
return false;
}
let ferro_has_del = ferro.contains("del") && !ferro.contains("delins");
let ferro_has_dup = ferro.contains("dup");
let muta_has_del = mutalyzer.contains("del") && !mutalyzer.contains("delins");
let muta_has_dup = mutalyzer.contains("dup");
if !((ferro_has_del && muta_has_del) || (ferro_has_dup && muta_has_dup)) {
return false;
}
let extract_utr_pos = |s: &str| -> Option<i64> {
let star_pos = s.find("*")?;
let after_star = &s[star_pos + 1..];
let end_pos = after_star
.find(|c: char| !c.is_ascii_digit())
.unwrap_or(after_star.len());
after_star[..end_pos].parse::<i64>().ok()
};
let Some(ferro_pos) = extract_utr_pos(ferro) else {
return false;
};
let Some(muta_pos) = extract_utr_pos(mutalyzer) else {
return false;
};
let diff = (ferro_pos - muta_pos).abs();
diff > 1 && diff <= 50
}
fn save_detailed_results<P: AsRef<Path>>(
ferro_results: &[ParseResult],
muta_results: &[ParseResult],
mode: CompareMode,
timestamp: DateTime<Utc>,
path: P,
) -> Result<(), FerroError> {
let path = path.as_ref();
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).map_err(|e| FerroError::Io {
msg: format!("Failed to create directory: {}", e),
})?;
}
let results: Vec<CombinedPatternResult> = ferro_results
.iter()
.zip(muta_results.iter())
.map(|(f, m)| {
let outputs_match = if f.success && m.success {
let ferro_out = f.output.as_deref().unwrap_or("");
let muta_out = m.output.as_deref().unwrap_or("");
Some(outputs_are_equivalent(ferro_out, muta_out))
} else {
None
};
CombinedPatternResult {
input: f.input.clone(),
ferro_success: f.success,
ferro_output: f.output.clone(),
ferro_error: f.error.clone(),
mutalyzer_success: m.success,
mutalyzer_output: m.output.clone(),
mutalyzer_error: m.error.clone().or(m.error_category.clone()),
outputs_match,
}
})
.collect();
let detailed = DetailedResults {
mode,
timestamp,
sample_size: results.len(),
results,
};
let file = File::create(path).map_err(|e| FerroError::Io {
msg: format!("Failed to create {}: {}", path.display(), e),
})?;
let writer = BufWriter::new(file);
serde_json::to_writer_pretty(writer, &detailed).map_err(|e| FerroError::Io {
msg: format!("Failed to write JSON: {}", e),
})?;
eprintln!("Detailed results saved to: {}", path.display());
Ok(())
}
#[cfg(test)]
mod equivalence_tests {
use super::*;
#[test]
fn test_exact_match() {
assert!(outputs_are_equivalent(
"NM_000060.2:c.1207T>G",
"NM_000060.2:c.1207T>G"
));
}
#[test]
fn test_allelic_equivalence() {
assert!(outputs_are_equivalent(
"[NM_000060.2:c.1207T>G;NM_000060.2:c.1330G>C]",
"NM_000060.2:c.[1207T>G;1330G>C]"
));
assert!(!outputs_are_equivalent(
"[NM_000060.2:c.1207T>G;NM_000060.2:c.1331G>C]",
"NM_000060.2:c.[1207T>G;1330G>C]"
));
assert!(!outputs_are_equivalent(
"[NM_000061.2:c.1207T>G;NM_000061.2:c.1330G>C]",
"NM_000060.2:c.[1207T>G;1330G>C]"
));
}
#[test]
fn test_delins_inv_equivalence() {
assert!(outputs_are_equivalent(
"NM_144670.4:c.2082_2083delinsTG",
"NM_144670.4:c.2082_2083inv"
));
assert!(!outputs_are_equivalent(
"NM_144670.4:c.2082_2084delinsTG",
"NM_144670.4:c.2082_2083inv"
));
}
#[test]
fn test_delins_split_equivalence() {
assert!(outputs_are_equivalent(
"NM_001282424.3:c.2142_2144delinsAA",
"NM_001282424.3:c.[2142del;2144C>A]"
));
assert!(!outputs_are_equivalent(
"NM_001282424.3:c.2142_2144delinsAA",
"NM_001282425.3:c.[2142del;2144C>A]"
));
}
#[test]
fn test_dup_repeat_equivalence() {
assert!(outputs_are_equivalent(
"NM_000060.2:c.100_102dup",
"NM_000060.2:c.100_102[2]"
));
assert!(outputs_are_equivalent(
"NM_000060.2:c.100_102dup",
"NM_000060.2:c.100_102ATG[2]"
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.100_103dup",
"NM_000060.2:c.100_102[2]"
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.100_102dup",
"NM_000060.2:c.100_102[3]"
));
}
#[test]
fn test_dup_vs_ins_equivalence() {
assert!(outputs_are_equivalent(
"NM_001605.3:c.-27dup",
"NM_001605.3:c.-27_-26insA"
));
assert!(!outputs_are_equivalent(
"NM_001605.3:c.-27dup",
"NM_001605.4:c.-27_-26insA"
));
assert!(!outputs_are_equivalent(
"NM_001605.3:c.-27dup",
"NM_001605.3:c.-28_-27insA"
));
}
#[test]
fn test_no_false_positives() {
assert!(!outputs_are_equivalent(
"NM_000060.2:c.100del",
"NM_000060.2:c.200del"
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.100_101delinsAA",
"NM_000060.2:c.100_101delinsTT"
));
}
#[test]
fn test_three_prime_shift_equivalence() {
assert!(outputs_are_equivalent(
"NM_001394148.2:c.-55_-46dup",
"NM_001394148.2:c.-56_-47dup"
));
assert!(outputs_are_equivalent(
"NM_001400774.1:c.-145_-142del",
"NM_001400774.1:c.-146_-143del"
));
assert!(outputs_are_equivalent(
"NM_001425842.1:c.-609del",
"NM_001425842.1:c.-610del"
));
assert!(!outputs_are_equivalent(
"NM_001394148.2:c.-55_-46dup",
"NM_001394149.2:c.-56_-47dup"
));
assert!(!outputs_are_equivalent(
"NM_001394148.2:c.-55_-46dup",
"NM_001394148.2:c.-56_-47del"
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.100_105del",
"NM_000060.2:c.101_108del"
));
assert!(outputs_are_equivalent(
"NM_000060.2:c.100_102dup",
"NM_000060.2:c.100_102dup"
));
}
#[test]
fn test_repeat_vs_del_dup_equivalence() {
assert!(outputs_are_equivalent(
"NM_001318856.2:c.-94TGC[5]",
"NM_001318856.2:c.-79_-68del"
));
assert!(outputs_are_equivalent(
"NM_001318856.2:c.-94TGC[10]",
"NM_001318856.2:c.-70_-68dup"
));
assert!(outputs_are_equivalent(
"NM_001400774.1:c.-126AGA[1]",
"NM_001400774.1:c.-123_-121del"
));
assert!(!outputs_are_equivalent(
"NM_001318856.2:c.-94TGC[5]",
"NM_001318857.2:c.-79_-68del"
));
}
#[test]
fn test_dup_vs_expanded_repeat_equivalence() {
assert!(outputs_are_equivalent(
"NM_020732.3:c.1023_1028dup",
"NM_020732.3:c.1017_1028GGC[6]"
));
assert!(outputs_are_equivalent(
"NM_020732.3:c.348_362dup",
"NM_020732.3:c.342_362GCA[12]"
));
assert!(outputs_are_equivalent(
"NM_020732.3:c.257_298dup",
"NM_020732.3:c.257_298ACCACCACCATGCCCACCACC[4]"
));
assert!(!outputs_are_equivalent(
"NM_020732.3:c.1023_1028dup",
"NM_020733.3:c.1017_1028GGC[6]"
));
}
#[test]
fn test_delins_sequence_vs_position_equivalence() {
assert!(outputs_are_equivalent(
"NM_001414902.1:c.174_182delinsCAGGAGGAGA",
"NM_001414902.1:c.174_182delins186_195"
));
assert!(!outputs_are_equivalent(
"NM_001414902.1:c.174_182delinsCAGGAGGAGA",
"NM_001414903.1:c.174_182delins186_195"
));
assert!(!outputs_are_equivalent(
"NM_001414902.1:c.174_182delinsCAGGAGGAGA",
"NM_001414902.1:c.174_182delinsTTTT"
));
}
#[test]
fn test_repeat_position_range_equivalence() {
assert!(outputs_are_equivalent(
"NM_001318856.2:c.-94TGC[11]",
"NM_001318856.2:c.-94_-68TGC[11]"
));
assert!(outputs_are_equivalent(
"NM_001318856.2:c.-94TGC[14]",
"NM_001318856.2:c.-94_-68TGC[14]"
));
assert!(!outputs_are_equivalent(
"NM_001318856.2:c.-94TGC[11]",
"NM_001318856.2:c.-94_-68TGC[14]"
));
assert!(!outputs_are_equivalent(
"NM_001318856.2:c.-94TGC[11]",
"NM_001318856.2:c.-94_-68AGC[11]"
));
assert!(!outputs_are_equivalent(
"NM_001318856.2:c.-94TGC[11]",
"NM_001318857.2:c.-94_-68TGC[11]"
));
}
#[test]
fn test_ins_sequence_vs_range_equivalence() {
assert!(outputs_are_equivalent(
"NM_000060.2:c.172_173insTGCAGCAGCAGCAGCAGC",
"NM_000060.2:c.172_173ins170_187"
));
assert!(outputs_are_equivalent(
"NM_000060.2:c.172_173ins170_187",
"NM_000060.2:c.172_173insTGCAGCAGCAGCAGCAGC"
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.172_173insTGCAGCAGCAGCAGCAGC",
"NM_000061.2:c.172_173ins170_187"
));
}
#[test]
fn test_repeat_one_vs_identity_equivalence() {
assert!(outputs_are_equivalent(
"NM_000060.2:c.217_219AGA[1]",
"NM_000060.2:c.="
));
assert!(outputs_are_equivalent(
"NM_000060.2:c.=",
"NM_000060.2:c.217_219AGA[1]"
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.217_219AGA[1]",
"NM_000061.2:c.="
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.217_219AGA[2]",
"NM_000060.2:c.="
));
}
#[test]
fn test_identity_substitution_equivalence() {
assert!(outputs_are_equivalent(
"NM_000060.2:c.1632T>T",
"NM_000060.2:c.="
));
assert!(outputs_are_equivalent(
"NM_000060.2:c.=",
"NM_000060.2:c.1632T>T"
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.1632T>T",
"NM_000061.2:c.="
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.1632T>G",
"NM_000060.2:c.="
));
}
#[test]
fn test_stop_codon_position_equivalence() {
assert!(outputs_are_equivalent(
"NM_000060.2:c.1284G>C",
"NM_000060.2:c.*1G>C"
));
assert!(outputs_are_equivalent(
"NM_000060.2:c.*1G>C",
"NM_000060.2:c.1284G>C"
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.1284G>C",
"NM_000061.2:c.*1G>C"
));
assert!(!outputs_are_equivalent(
"NM_000060.2:c.1284G>C",
"NM_000060.2:c.*1G>T"
));
}
#[test]
fn test_utr_position_off_by_one_equivalence() {
assert!(outputs_are_equivalent(
"NM_000016.4:c.*63dup",
"NM_000016.4:c.*64dup"
));
assert!(outputs_are_equivalent(
"NM_000016.4:c.*63del",
"NM_000016.4:c.*64del"
));
assert!(!outputs_are_equivalent(
"NM_000016.4:c.*63dup",
"NM_000017.4:c.*64dup"
));
assert!(!outputs_are_equivalent(
"NM_000016.4:c.*63dup",
"NM_000016.4:c.*64del"
));
}
#[test]
fn test_cds_end_vs_utr_start_equivalence() {
assert!(outputs_are_equivalent(
"NM_000038.5:c.-18_8532del",
"NM_000038.5:c.-18_*1del"
));
assert!(!outputs_are_equivalent(
"NM_000038.5:c.-18_8532del",
"NM_000039.5:c.-18_*1del"
));
}
#[test]
fn test_ins_n_count_vs_expanded_equivalence() {
assert!(outputs_are_equivalent(
"NM_000055.4:c.1160_1161ins[N[10];1146_1160]",
"NM_000055.4:c.1160_1161ins[NNNNNNNNNN;1146_1160]"
));
let fifty_ns = "N".repeat(50);
assert!(outputs_are_equivalent(
"NM_000055.4:c.1160_1161insN[50]",
&format!("NM_000055.4:c.1160_1161ins{}", fifty_ns)
));
assert!(!outputs_are_equivalent(
"NM_000055.4:c.1160_1161insN[10]",
"NM_000056.4:c.1160_1161insNNNNNNNNNN"
));
}
#[test]
fn test_utr_larger_position_shift_equivalence() {
assert!(outputs_are_equivalent(
"NM_000059.3:c.*373del",
"NM_000059.3:c.*382del"
));
assert!(outputs_are_equivalent(
"NM_000075.2:c.*290dup",
"NM_000075.2:c.*303dup"
));
assert!(!outputs_are_equivalent(
"NM_000059.3:c.*373del",
"NM_000060.3:c.*382del"
));
assert!(!outputs_are_equivalent(
"NM_000059.3:c.*100del",
"NM_000059.3:c.*200del"
));
}
#[test]
fn test_delins_simplification_equivalence() {
assert!(outputs_are_equivalent(
"NM_000051.3:c.3108_3120delinsTAAG",
"NM_000051.3:c.3108_3119delinsTAA"
));
assert!(!outputs_are_equivalent(
"NM_000051.3:c.3108_3120delinsTAAG",
"NM_000052.3:c.3108_3119delinsTAA"
));
}
#[test]
fn test_delins_vs_complex_allelic_equivalence() {
assert!(outputs_are_equivalent(
"NM_001321134.2:c.423_428delinsACCTAGGGGAAACAAGATGTAGTGCTATTGCC",
"NM_001321134.2:c.[422_423insA;423_424ins*572_*725;428_429ins[*731_*833;CTTTGAG]]"
));
assert!(!outputs_are_equivalent(
"NM_001321134.2:c.423_428delinsACCTAGGGG",
"NM_001321135.2:c.[422_423insA;423_424ins*572_*725]"
));
}
#[test]
fn test_repeat_del_vs_dup_not_equivalent() {
assert!(!outputs_are_equivalent(
"NM_000017.4:c.364_367del", "NM_000017.4:c.366_367dup" ));
assert!(!outputs_are_equivalent(
"NM_001407640.1:c.470_471del", "NM_001407640.1:c.470_471dup" ));
}
#[test]
fn test_repeat_count_diff_not_equivalent() {
assert!(!outputs_are_equivalent(
"NM_000028.3:c.1497_1500AG[4]",
"NM_000028.3:c.1497_1500AG[5]"
));
assert!(!outputs_are_equivalent(
"NM_000044.3:c.172_237CAG[35]",
"NM_000044.3:c.171_239GCA[57]"
));
}
}