use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
/// Top-level configuration, deserialized from YAML by `load` / `load_with_vars`.
///
/// Only `reference` and `output` are required. Every other field carries
/// `#[serde(default)]`, so an omitted key falls back to the field type's
/// `Default` implementation (`None` for the `Option` fields).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Config {
/// Path to the reference file; `validate` requires that it exists.
pub reference: PathBuf,
/// Output settings (required section).
pub output: OutputConfig,
/// Sample settings (name, read length, coverage, platform).
#[serde(default)]
pub sample: SampleConfig,
/// Fragment length model settings.
#[serde(default)]
pub fragment: FragmentConfig,
/// Base-quality settings.
#[serde(default)]
pub quality: QualityConfig,
/// Optional tumour description; purity and clone CCFs are range-checked by `validate`.
#[serde(default)]
pub tumour: Option<TumourConfig>,
/// Optional mutation sources (VCF path and/or random generation settings).
#[serde(default)]
pub mutations: Option<MutationConfig>,
/// Optional UMI settings; may also be created by a chemistry preset (see `fill_umi`).
#[serde(default)]
pub umi: Option<UmiConfig>,
/// Optional artifact rates.
#[serde(default)]
pub artifacts: Option<ArtifactConfig>,
/// Optional seed value. NOTE(review): presumably the RNG seed; the consumer is not in this file.
#[serde(default)]
pub seed: Option<u64>,
/// Optional thread count. NOTE(review): the consumer is not in this file.
#[serde(default)]
pub threads: Option<usize>,
/// Optional list of chromosome names. NOTE(review): presumably restricts processing to these contigs; confirm at the use site.
#[serde(default)]
pub chromosomes: Option<Vec<String>>,
/// Optional BED file path; `validate` requires it to exist when set.
#[serde(default)]
pub regions_bed: Option<PathBuf>,
/// Optional copy-number entries; `validate` requires each `region` to parse with `parse_region`.
#[serde(default)]
pub copy_number: Option<Vec<CopyNumberConfig>>,
/// Optional GC-bias settings.
#[serde(default)]
pub gc_bias: Option<GcBiasConfig>,
/// Optional list of per-sample entries.
#[serde(default)]
pub samples: Option<Vec<SampleEntry>>,
/// Optional capture settings; `validate` requires `targets_bed` to exist when set.
#[serde(default)]
pub capture: Option<CaptureConfig>,
/// Performance tuning settings.
#[serde(default)]
pub performance: PerformanceConfig,
/// Optional chemistry preset name, resolved with `ChemistryPreset::from_name`
/// when the config is loaded; an unknown name makes loading fail.
#[serde(default)]
pub preset: Option<String>,
/// Optional list of VAFs; `validate` requires each value to be in (0.0, 1.0].
#[serde(default)]
pub vafs: Option<Vec<f64>>,
/// Optional germline variant settings.
#[serde(default)]
pub germline: Option<GermlineConfig>,
/// Optional paired (tumour/normal) settings.
#[serde(default)]
pub paired: Option<PairedConfig>,
}
/// Performance tuning settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceConfig {
/// Defaults to `default_output_buffer_regions()` (64) when omitted.
/// NOTE(review): presumably the number of regions buffered before output is
/// written; confirm at the use site.
#[serde(default = "default_output_buffer_regions")]
pub output_buffer_regions: usize,
}
/// Output settings: the target directory plus switches for each output kind.
///
/// `directory` is required; all other fields have serde defaults.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutputConfig {
/// Output directory (required).
pub directory: PathBuf,
/// FASTQ output switch; defaults to `true`.
#[serde(default = "default_true")]
pub fastq: bool,
/// BAM output switch; defaults to `false`.
#[serde(default)]
pub bam: bool,
/// Truth VCF output switch; defaults to `true`.
#[serde(default = "default_true")]
pub truth_vcf: bool,
/// Manifest output switch; defaults to `true`.
#[serde(default = "default_true")]
pub manifest: bool,
/// Single-read BAM switch; defaults to `false`.
#[serde(default)]
pub single_read_bam: bool,
/// Germline VCF output switch; defaults to `true`.
#[serde(default = "default_true")]
pub germline_vcf: bool,
/// Mapping quality value; defaults to 60. `validate` rejects 255.
#[serde(default = "default_mapq")]
pub mapq: u8,
/// Read annotation switch; defaults to `false`.
#[serde(default)]
pub annotate_reads: bool,
}
/// Sample-level settings. All fields are optional in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SampleConfig {
/// Sample name; defaults to `"SAMPLE"`.
#[serde(default = "default_sample_name")]
pub name: String,
/// Read length; defaults to 150. `validate` requires it to be greater than zero.
#[serde(default = "default_read_length")]
pub read_length: usize,
/// Coverage; defaults to 30.0. `validate` requires it to be positive.
#[serde(default = "default_coverage")]
pub coverage: f64,
/// Optional platform label. NOTE(review): accepted values are not defined in this file.
#[serde(default)]
pub platform: Option<String>,
}
impl Default for SampleConfig {
fn default() -> Self {
Self {
name: default_sample_name(),
read_length: default_read_length(),
coverage: default_coverage(),
platform: None,
}
}
}
/// Fragment length model settings. All fields are optional in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FragmentConfig {
/// Fragment model; defaults to `FragmentModel::Normal`.
#[serde(default = "default_fragment_model")]
pub model: FragmentModel,
/// Fragment mean; defaults to 300.0. `validate` requires it to be positive.
#[serde(default = "default_fragment_mean")]
pub mean: f64,
/// Fragment standard deviation; defaults to 50.0. `validate` requires it to be greater than zero.
#[serde(default = "default_fragment_sd")]
pub sd: f64,
/// Optional long-read fragment settings.
#[serde(default)]
pub long_read: Option<LongReadFragmentConfig>,
/// Optional end-motif model name. NOTE(review): accepted values are not defined in this file.
#[serde(default)]
pub end_motif_model: Option<String>,
/// Optional ctDNA fraction; `validate` requires it to be in [0.0, 1.0].
#[serde(default)]
pub ctdna_fraction: Option<f64>,
/// Optional standard deviation; `validate` requires it to be greater than zero.
/// NOTE(review): presumably for the mono-nucleosomal peak of the cfDNA model; confirm.
#[serde(default)]
pub mono_sd: Option<f64>,
/// Optional standard deviation; `validate` requires it to be greater than zero.
/// NOTE(review): presumably for the di-nucleosomal peak of the cfDNA model; confirm.
#[serde(default)]
pub di_sd: Option<f64>,
}
impl Default for FragmentConfig {
fn default() -> Self {
Self {
model: FragmentModel::Normal,
mean: default_fragment_mean(),
sd: default_fragment_sd(),
long_read: None,
end_motif_model: None,
ctdna_fraction: None,
mono_sd: None,
di_sd: None,
}
}
}
/// Long-read fragment length settings. All fields are optional in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LongReadFragmentConfig {
/// Mean length; defaults to 15000.
#[serde(default = "default_lr_mean")]
pub mean: usize,
/// Standard deviation; defaults to 5000. `validate` requires it to be greater than zero.
#[serde(default = "default_lr_sd")]
pub sd: usize,
/// Minimum length; defaults to 1000.
#[serde(default = "default_lr_min")]
pub min_len: usize,
/// Maximum length; defaults to 100000.
#[serde(default = "default_lr_max")]
pub max_len: usize,
}
/// Serde default for `LongReadFragmentConfig::mean`.
fn default_lr_mean() -> usize {
    15_000
}

/// Serde default for `LongReadFragmentConfig::sd`.
fn default_lr_sd() -> usize {
    5_000
}

/// Serde default for `LongReadFragmentConfig::min_len`.
fn default_lr_min() -> usize {
    1_000
}

/// Serde default for `LongReadFragmentConfig::max_len`.
fn default_lr_max() -> usize {
    100_000
}
/// Fragment length model selector.
///
/// With `rename_all = "snake_case"` the variants are written as `normal` and
/// `cfda` in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FragmentModel {
/// The default model (see `default_fragment_model`).
Normal,
/// Alternative model selected by the `illumina-ctdna` preset.
/// NOTE(review): presumably a cell-free DNA fragment model; confirm against the sampler.
Cfda,
}
/// Serde default for `FragmentConfig::model`.
fn default_fragment_model() -> FragmentModel {
FragmentModel::Normal
}
/// Base-quality settings. All fields are optional in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityConfig {
/// Mean quality; defaults to 36. Chemistry presets replace it only while it
/// still equals this default (see `fill_quality`).
#[serde(default = "default_mean_quality")]
pub mean_quality: u8,
/// Tail decay; defaults to 0.003.
/// NOTE(review): presumably a per-base quality decay toward the read tail; confirm.
#[serde(default = "default_quality_decay")]
pub tail_decay: f64,
/// Optional path to a quality profile. NOTE(review): the file format is not defined here.
#[serde(default)]
pub profile_path: Option<PathBuf>,
}
impl Default for QualityConfig {
fn default() -> Self {
Self {
mean_quality: default_mean_quality(),
tail_decay: default_quality_decay(),
profile_path: None,
}
}
}
/// Tumour description. All fields are optional in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TumourConfig {
/// Purity; defaults to 1.0. `validate` requires it to be in [0.0, 1.0].
#[serde(default = "default_purity")]
pub purity: f64,
/// Ploidy; defaults to 2.
#[serde(default = "default_ploidy")]
pub ploidy: u32,
/// Clone definitions; defaults to an empty list.
#[serde(default)]
pub clones: Vec<CloneConfig>,
/// MSI flag; defaults to `false`.
/// NOTE(review): presumably enables microsatellite-instability simulation; confirm.
#[serde(default)]
pub msi: bool,
}
/// One clone entry within `TumourConfig::clones`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CloneConfig {
/// Clone identifier (required); used in validation error messages.
pub id: String,
/// CCF value (required); `validate` requires it to be in [0.0, 1.0].
pub ccf: f64,
/// Optional parent identifier. NOTE(review): presumably the `id` of another
/// clone; this file does not check that it resolves.
#[serde(default)]
pub parent: Option<String>,
}
/// Mutation sources. All fields are optional in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MutationConfig {
/// Optional VCF path; `validate` requires the file to exist when set.
#[serde(default)]
pub vcf: Option<PathBuf>,
/// Optional random mutation generation settings.
#[serde(default)]
pub random: Option<RandomMutationConfig>,
/// Optional SV signature name. NOTE(review): accepted values are not defined in this file.
#[serde(default)]
pub sv_signature: Option<String>,
/// SV count; defaults to 10.
#[serde(default = "default_sv_count")]
pub sv_count: usize,
/// Driver mutation switch; defaults to `false`.
#[serde(default)]
pub include_driver_mutations: bool,
}
/// Settings for randomly generated mutations.
///
/// `validate` requires `count > 0`, `vaf_min < vaf_max`, and the three type
/// fractions to sum to 1.0 (within 1e-6).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RandomMutationConfig {
/// Number of mutations (required).
pub count: usize,
/// Lower VAF bound; defaults to 0.001.
#[serde(default = "default_vaf_min")]
pub vaf_min: f64,
/// Upper VAF bound; defaults to 0.5.
#[serde(default = "default_vaf_max")]
pub vaf_max: f64,
/// SNV fraction; defaults to 0.80.
#[serde(default = "default_snv_fraction")]
pub snv_fraction: f64,
/// Indel fraction; defaults to 0.15.
#[serde(default = "default_indel_fraction")]
pub indel_fraction: f64,
/// MNV fraction; defaults to 0.05.
#[serde(default = "default_mnv_fraction")]
pub mnv_fraction: f64,
/// Optional signature name. NOTE(review): accepted values are not defined in this file.
#[serde(default)]
pub signature: Option<String>,
}
/// UMI settings. All fields are optional in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UmiConfig {
/// UMI length; defaults to 8. `validate` requires it to be greater than zero
/// and less than `sample.read_length`.
#[serde(default = "default_umi_length")]
pub length: usize,
/// Duplex switch; defaults to `false`.
#[serde(default)]
pub duplex: bool,
/// PCR cycle count; defaults to 10.
#[serde(default = "default_pcr_cycles")]
pub pcr_cycles: u32,
/// Family size mean; defaults to 3.0.
#[serde(default = "default_family_size_mean")]
pub family_size_mean: f64,
/// Family size standard deviation; defaults to 1.5.
#[serde(default = "default_family_size_sd")]
pub family_size_sd: f64,
/// Inline switch; defaults to `false`. When `true`, `validate` additionally
/// requires `length` plus the spacer length to be less than the read length.
#[serde(default)]
pub inline: bool,
/// Optional spacer sequence; `validate` accepts only A, C, G, T (either case).
#[serde(default)]
pub spacer: Option<String>,
/// Optional rate in [0.0, 1.0]; `validate` warns that it has no effect when
/// `duplex` is `false`.
#[serde(default)]
pub duplex_conversion_rate: Option<f64>,
/// Optional rate in [0.0, 1.0]; `validate` warns that a non-zero value has no
/// effect when `inline` is `false`.
#[serde(default)]
pub error_rate: Option<f64>,
}
/// Optional artifact rates; every field defaults to `None`.
///
/// NOTE(review): units and valid ranges are not established in this file;
/// presumably each rate is a probability. Confirm at the use sites.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArtifactConfig {
/// Optional FFPE damage rate.
#[serde(default)]
pub ffpe_damage_rate: Option<f64>,
/// Optional oxoG rate.
#[serde(default)]
pub oxog_rate: Option<f64>,
/// Optional duplicate rate.
#[serde(default)]
pub duplicate_rate: Option<f64>,
/// Optional PCR error rate.
#[serde(default)]
pub pcr_error_rate: Option<f64>,
}
/// One copy-number entry within `Config::copy_number`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CopyNumberConfig {
/// Region string (required) in `chrom:start-end` form; `validate` checks it
/// with `parse_region`.
pub region: String,
/// Tumour copy number; defaults to 2.
#[serde(default = "default_normal_cn")]
pub tumor_cn: u32,
/// Normal copy number; defaults to 2.
#[serde(default = "default_normal_cn")]
pub normal_cn: u32,
/// Optional major copy number.
#[serde(default)]
pub major_cn: Option<u32>,
/// Optional minor copy number.
#[serde(default)]
pub minor_cn: Option<u32>,
}
/// GC-bias settings. All fields are optional in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GcBiasConfig {
/// Enable switch; defaults to `true`.
#[serde(default = "default_true")]
pub enabled: bool,
/// Model name; defaults to `"default"`. NOTE(review): other accepted values
/// are not defined in this file.
#[serde(default = "default_gc_bias_model")]
pub model: String,
/// Severity; defaults to 1.0.
#[serde(default = "default_gc_bias_severity")]
pub severity: f64,
}
impl Default for GcBiasConfig {
fn default() -> Self {
Self {
enabled: true,
model: default_gc_bias_model(),
severity: default_gc_bias_severity(),
}
}
}
/// One entry within `Config::samples`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SampleEntry {
/// Sample name (required).
pub name: String,
/// Coverage; defaults to 30.0 (the same default as `SampleConfig::coverage`).
#[serde(default = "default_coverage")]
pub coverage: f64,
/// Tumour fraction; defaults to 1.0.
#[serde(default = "default_tumour_fraction")]
pub tumour_fraction: f64,
/// Optional per-sample fragment model.
#[serde(default)]
pub fragment_model: Option<FragmentModel>,
/// Map from string keys to `f64` values; defaults to empty.
/// NOTE(review): presumably clone id -> CCF adjustment for this sample; confirm.
#[serde(default)]
pub clonal_shift: std::collections::HashMap<String, f64>,
}
/// Serde default for `SampleEntry::tumour_fraction`.
fn default_tumour_fraction() -> f64 {
1.0
}
/// Capture settings. All fields are optional in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CaptureConfig {
/// Enable switch; defaults to `true`.
#[serde(default = "default_true")]
pub enabled: bool,
/// Optional targets BED path; `validate` requires the file to exist when set.
#[serde(default)]
pub targets_bed: Option<PathBuf>,
/// Off-target fraction; defaults to 0.2.
#[serde(default = "default_off_target_fraction")]
pub off_target_fraction: f64,
/// Coverage uniformity; defaults to 0.3.
#[serde(default = "default_coverage_uniformity")]
pub coverage_uniformity: f64,
/// Edge drop-off in bases; defaults to 50.
#[serde(default = "default_edge_dropoff_bases")]
pub edge_dropoff_bases: u32,
/// Capture mode; defaults to `"panel"`. NOTE(review): other accepted values
/// are not defined in this file.
#[serde(default = "default_capture_mode")]
pub mode: String,
/// Primer trim length; defaults to 0.
#[serde(default)]
pub primer_trim: usize,
/// Optional coverage CV target.
#[serde(default)]
pub coverage_cv_target: Option<f64>,
/// Optional on-target fraction target.
#[serde(default)]
pub on_target_fraction_target: Option<f64>,
}
impl Default for CaptureConfig {
fn default() -> Self {
Self {
enabled: true,
targets_bed: None,
off_target_fraction: default_off_target_fraction(),
coverage_uniformity: default_coverage_uniformity(),
edge_dropoff_bases: default_edge_dropoff_bases(),
mode: default_capture_mode(),
primer_trim: 0,
coverage_cv_target: None,
on_target_fraction_target: None,
}
}
}
/// Germline variant settings. All fields are optional in the YAML.
///
/// NOTE(review): the unit of the density values is not established in this
/// file (e.g. per base vs. per kilobase); confirm at the use site.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GermlineConfig {
/// Heterozygous SNP density; defaults to 0.6.
#[serde(default = "default_het_snp_density")]
pub het_snp_density: f64,
/// Homozygous SNP density; defaults to 0.3.
#[serde(default = "default_hom_snp_density")]
pub hom_snp_density: f64,
/// Heterozygous indel density; defaults to 0.05.
#[serde(default = "default_het_indel_density")]
pub het_indel_density: f64,
/// Optional VCF path.
#[serde(default)]
pub vcf: Option<std::path::PathBuf>,
}
impl Default for GermlineConfig {
fn default() -> Self {
Self {
het_snp_density: default_het_snp_density(),
hom_snp_density: default_hom_snp_density(),
het_indel_density: default_het_indel_density(),
vcf: None,
}
}
}
/// Serde default for `GermlineConfig::het_snp_density`.
fn default_het_snp_density() -> f64 {
    0.6_f64
}

/// Serde default for `GermlineConfig::hom_snp_density`.
fn default_hom_snp_density() -> f64 {
    0.3_f64
}

/// Serde default for `GermlineConfig::het_indel_density`.
fn default_het_indel_density() -> f64 {
    0.05_f64
}
/// Paired (tumour/normal) settings. All fields are optional in the YAML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PairedConfig {
/// Coverage of the normal sample; defaults to 30.0.
#[serde(default = "default_coverage")]
pub normal_coverage: f64,
/// Name of the normal sample; defaults to `"NORMAL"`.
#[serde(default = "default_normal_sample_name")]
pub normal_sample_name: String,
/// Tumour contamination in the normal sample; defaults to 0.0.
/// NOTE(review): presumably a fraction in [0.0, 1.0]; not validated in this file.
#[serde(default)]
pub tumour_contamination_in_normal: f64,
}
impl Default for PairedConfig {
fn default() -> Self {
Self {
normal_coverage: default_coverage(),
normal_sample_name: default_normal_sample_name(),
tumour_contamination_in_normal: 0.0,
}
}
}
/// Serde default for `PairedConfig::normal_sample_name`.
fn default_normal_sample_name() -> String {
    String::from("NORMAL")
}

/// Serde default for boolean switches that are on unless disabled.
fn default_true() -> bool {
    true
}

/// Serde default for `OutputConfig::mapq`.
fn default_mapq() -> u8 {
    60_u8
}

/// Serde default for `CopyNumberConfig::tumor_cn` and `normal_cn`.
fn default_normal_cn() -> u32 {
    2_u32
}

/// Serde default for `SampleConfig::name`.
fn default_sample_name() -> String {
    String::from("SAMPLE")
}

/// Serde default for `SampleConfig::read_length`.
fn default_read_length() -> usize {
    150_usize
}

/// Default coverage shared by `SampleConfig`, `SampleEntry` and `PairedConfig`.
pub fn default_coverage() -> f64 {
    30.0_f64
}

/// Serde default for `FragmentConfig::mean`.
pub fn default_fragment_mean() -> f64 {
    300.0_f64
}

/// Serde default for `FragmentConfig::sd`.
pub fn default_fragment_sd() -> f64 {
    50.0_f64
}

/// Serde default for `QualityConfig::mean_quality`.
fn default_mean_quality() -> u8 {
    36_u8
}

/// Serde default for `QualityConfig::tail_decay`.
fn default_quality_decay() -> f64 {
    0.003_f64
}

/// Serde default for `TumourConfig::purity`.
fn default_purity() -> f64 {
    1.0_f64
}

/// Serde default for `TumourConfig::ploidy`.
fn default_ploidy() -> u32 {
    2_u32
}

/// Serde default for `RandomMutationConfig::vaf_min`.
fn default_vaf_min() -> f64 {
    0.001_f64
}

/// Serde default for `RandomMutationConfig::vaf_max`.
fn default_vaf_max() -> f64 {
    0.5_f64
}

/// Serde default for `RandomMutationConfig::snv_fraction`.
fn default_snv_fraction() -> f64 {
    0.8_f64
}

/// Serde default for `RandomMutationConfig::indel_fraction`.
fn default_indel_fraction() -> f64 {
    0.15_f64
}

/// Serde default for `RandomMutationConfig::mnv_fraction`.
fn default_mnv_fraction() -> f64 {
    0.05_f64
}

/// Serde default for `MutationConfig::sv_count`.
fn default_sv_count() -> usize {
    10_usize
}

/// Serde default for `UmiConfig::length`.
fn default_umi_length() -> usize {
    8_usize
}

/// Serde default for `UmiConfig::pcr_cycles`.
fn default_pcr_cycles() -> u32 {
    10_u32
}

/// Serde default for `UmiConfig::family_size_mean`.
fn default_family_size_mean() -> f64 {
    3.0_f64
}

/// Serde default for `UmiConfig::family_size_sd`.
fn default_family_size_sd() -> f64 {
    1.5_f64
}

/// Serde default for `GcBiasConfig::model`.
fn default_gc_bias_model() -> String {
    String::from("default")
}

/// Serde default for `GcBiasConfig::severity`.
fn default_gc_bias_severity() -> f64 {
    1.0_f64
}
impl Default for PerformanceConfig {
fn default() -> Self {
Self {
output_buffer_regions: default_output_buffer_regions(),
}
}
}
/// Serde default for `PerformanceConfig::output_buffer_regions`.
fn default_output_buffer_regions() -> usize {
    64_usize
}

/// Serde default for `CaptureConfig::off_target_fraction`.
fn default_off_target_fraction() -> f64 {
    0.2_f64
}

/// Serde default for `CaptureConfig::coverage_uniformity`.
fn default_coverage_uniformity() -> f64 {
    0.3_f64
}

/// Serde default for `CaptureConfig::edge_dropoff_bases`.
fn default_edge_dropoff_bases() -> u32 {
    50_u32
}

/// Serde default for `CaptureConfig::mode`.
fn default_capture_mode() -> String {
    String::from("panel")
}
/// Named chemistry presets that can be selected with the `preset` config key.
///
/// Each variant corresponds to one lowercase, hyphenated name accepted by
/// [`ChemistryPreset::from_name`]; `apply_preset` holds the values each preset
/// fills in.
#[derive(Debug, Clone, PartialEq)]
pub enum ChemistryPreset {
    TwistUmiDuplex,
    IlluminaWgs,
    IlluminaWes,
    IlluminaCtdna,
    PacbioHifi,
    NanoporeR10,
}

impl ChemistryPreset {
    /// Looks up a preset by its exact name.
    ///
    /// Matching is case-sensitive and performs no trimming. Returns `None` for
    /// any name that is not one of the six known presets.
    pub fn from_name(name: &str) -> Option<Self> {
        const KNOWN: [(&str, ChemistryPreset); 6] = [
            ("twist-umi-duplex", ChemistryPreset::TwistUmiDuplex),
            ("illumina-wgs", ChemistryPreset::IlluminaWgs),
            ("illumina-wes", ChemistryPreset::IlluminaWes),
            ("illumina-ctdna", ChemistryPreset::IlluminaCtdna),
            ("pacbio-hifi", ChemistryPreset::PacbioHifi),
            ("nanopore-r10", ChemistryPreset::NanoporeR10),
        ];
        KNOWN
            .iter()
            .find(|entry| entry.0 == name)
            .map(|entry| entry.1.clone())
    }
}
/// Applies a chemistry preset to `config`.
///
/// Each preset supplies a fragment model, fragment mean and standard
/// deviation, and a mean base quality; two presets additionally supply UMI
/// settings. The values are handed to `fill_fragment`, `fill_quality` and
/// `fill_umi`, which decide field by field whether the current value is
/// replaced.
pub fn apply_preset(config: &mut Config, preset: &ChemistryPreset) {
    let baseline_fragment = FragmentConfig::default();
    let baseline_quality = QualityConfig::default();

    // (model, fragment mean, fragment sd, mean quality, optional (umi length, duplex, inline))
    let (model, mean, sd, mean_quality, umi) = match preset {
        ChemistryPreset::TwistUmiDuplex => (
            FragmentModel::Normal,
            200.0,
            30.0,
            37,
            Some((8, true, false)),
        ),
        ChemistryPreset::IlluminaWgs => (FragmentModel::Normal, 300.0, 50.0, 36, None),
        ChemistryPreset::IlluminaWes => (FragmentModel::Normal, 200.0, 40.0, 35, None),
        ChemistryPreset::IlluminaCtdna => (
            FragmentModel::Cfda,
            167.0,
            20.0,
            36,
            Some((8, false, false)),
        ),
        ChemistryPreset::PacbioHifi => (FragmentModel::Normal, 15000.0, 5000.0, 25, None),
        ChemistryPreset::NanoporeR10 => (FragmentModel::Normal, 20000.0, 10000.0, 20, None),
    };

    fill_fragment(config, &baseline_fragment, model, mean, sd);
    fill_quality(config, &baseline_quality, mean_quality);
    if let Some((length, duplex, inline)) = umi {
        fill_umi(config, length, duplex, inline);
    }
}
/// Fills preset fragment values into `config.fragment`.
///
/// A field is replaced only while it still holds the value found in
/// `default`, so settings the user changed in the config file win over the
/// preset. This applies to `model` as well as `mean` and `sd`.
///
/// Limitation: a value the user explicitly set to the built-in default cannot
/// be told apart from an omitted one and is therefore still replaced.
fn fill_fragment(
    config: &mut Config,
    default: &FragmentConfig,
    model: FragmentModel,
    mean: f64,
    sd: f64,
) {
    let fragment = &mut config.fragment;

    // `FragmentModel` does not implement `PartialEq`, so compare variants by
    // discriminant instead.
    let model_is_default =
        std::mem::discriminant(&fragment.model) == std::mem::discriminant(&default.model);
    if model_is_default {
        fragment.model = model;
    }
    if (fragment.mean - default.mean).abs() < f64::EPSILON {
        fragment.mean = mean;
    }
    if (fragment.sd - default.sd).abs() < f64::EPSILON {
        fragment.sd = sd;
    }
}
/// Sets `config.quality.mean_quality` to the preset value, but only while it
/// still equals the value in `default`; any other value is left untouched.
fn fill_quality(config: &mut Config, default: &QualityConfig, mean_quality: u8) {
    let current = &mut config.quality.mean_quality;
    if *current != default.mean_quality {
        return;
    }
    *current = mean_quality;
}
/// Fills preset UMI values into `config.umi`.
///
/// With an existing UMI section, `length` is replaced only when it is zero,
/// and `duplex` / `inline` are only ever switched on, never off. Without one,
/// a new section is created from the preset values plus the standard
/// defaults for the remaining fields.
fn fill_umi(config: &mut Config, length: usize, duplex: bool, inline: bool) {
    if let Some(existing) = config.umi.as_mut() {
        if existing.length == 0 {
            existing.length = length;
        }
        // A flag that is already `true` stays `true`; a `false` flag takes the preset value.
        existing.duplex = existing.duplex || duplex;
        existing.inline = existing.inline || inline;
        return;
    }

    let created = UmiConfig {
        length,
        duplex,
        pcr_cycles: default_pcr_cycles(),
        family_size_mean: default_family_size_mean(),
        family_size_sd: default_family_size_sd(),
        inline,
        spacer: None,
        duplex_conversion_rate: None,
        error_rate: None,
    };
    config.umi = Some(created);
}
/// Loads a [`Config`] from the YAML file at `path` and applies its chemistry
/// preset, if one is named.
///
/// # Errors
///
/// Fails when the file cannot be read, the YAML cannot be deserialized into
/// [`Config`], or `preset` names an unknown chemistry preset.
pub fn load(path: &Path) -> Result<Config> {
    let contents = read_config_text(path)?;
    parse_config(&contents, path)
}

/// Like [`load`], but first replaces every `${key}` placeholder in the raw
/// file text with the matching entry of `vars` (see `substitute_vars`).
///
/// # Errors
///
/// In addition to the failure cases of [`load`], fails when a placeholder has
/// no entry in `vars`.
pub fn load_with_vars(
    path: &Path,
    vars: &std::collections::HashMap<String, String>,
) -> Result<Config> {
    let raw = read_config_text(path)?;
    let substituted = substitute_vars(&raw, vars)?;
    parse_config(&substituted, path)
}

/// Reads the config file into a string, attaching the path to any I/O error.
fn read_config_text(path: &Path) -> Result<String> {
    std::fs::read_to_string(path)
        .with_context(|| format!("failed to read config file: {}", path.display()))
}

/// Deserializes `yaml` into a [`Config`] and applies the named preset, if any.
/// `path` is used only for error context.
fn parse_config(yaml: &str, path: &Path) -> Result<Config> {
    let mut config: Config = serde_yaml::from_str(yaml)
        .with_context(|| format!("failed to parse config file: {}", path.display()))?;

    if let Some(preset_name) = config.preset.as_deref() {
        // The resolved preset is an owned value, so the borrow of
        // `config.preset` ends before `config` is mutated below.
        let preset = ChemistryPreset::from_name(preset_name)
            .ok_or_else(|| anyhow::anyhow!("unknown chemistry preset: {}", preset_name))?;
        apply_preset(&mut config, &preset);
    }
    Ok(config)
}
fn substitute_vars(text: &str, vars: &std::collections::HashMap<String, String>) -> Result<String> {
let mut result = text.to_string();
let mut i = 0;
while let Some(start) = result[i..].find("${") {
let abs_start = i + start;
if let Some(end_offset) = result[abs_start + 2..].find('}') {
let abs_end = abs_start + 2 + end_offset;
let key = result[abs_start + 2..abs_end].to_string();
if let Some(val) = vars.get(&key) {
let placeholder = format!("${{{key}}}");
result = result.replacen(&placeholder, val, 1);
} else {
anyhow::bail!(
"config placeholder '${{{key}}}' has no --set value; \
supply --set {key}=<value>"
);
}
} else {
i = abs_start + 2;
}
}
Ok(result)
}
/// Parses a region string of the form `chrom:start-end`.
///
/// The string is split at the LAST `:` so that contig names which themselves
/// contain colons (for example HLA contigs) are handled. The coordinates are
/// returned exactly as written; no 0-/1-based conversion is applied.
///
/// # Errors
///
/// Fails when the `:` or `-` separator is missing, the chromosome name is
/// empty, either coordinate is not a valid unsigned integer, or `start` is
/// greater than `end`.
pub fn parse_region(s: &str) -> Result<(String, u64, u64)> {
    let (chrom, coords) = s
        .rsplit_once(':')
        .ok_or_else(|| anyhow::anyhow!("expected 'chrom:start-end' but found no ':' in '{}'", s))?;
    anyhow::ensure!(
        !chrom.is_empty(),
        "expected 'chrom:start-end' but the chromosome name is empty in '{}'",
        s
    );
    let (start_text, end_text) = coords.split_once('-').ok_or_else(|| {
        anyhow::anyhow!(
            "expected 'chrom:start-end' but found no '-' after ':' in '{}'",
            s
        )
    })?;
    let start: u64 = start_text.parse().map_err(|_| {
        anyhow::anyhow!(
            "start coordinate '{}' in '{}' is not a valid integer",
            start_text,
            s
        )
    })?;
    let end: u64 = end_text.parse().map_err(|_| {
        anyhow::anyhow!(
            "end coordinate '{}' in '{}' is not a valid integer",
            end_text,
            s
        )
    })?;
    // An inverted interval would underflow any later `end - start` computation.
    anyhow::ensure!(
        start <= end,
        "start coordinate {} is greater than end coordinate {} in '{}'",
        start,
        end,
        s
    );
    Ok((chrom.to_string(), start, end))
}
/// Checks a loaded [`Config`] for missing input files and out-of-range values.
///
/// Checks run in a fixed order and the first failing check is returned as the
/// error. Three conditions only emit a `tracing` warning and do not fail
/// validation: a read length above the fragment mean, a
/// `duplex_conversion_rate` without `duplex`, and a non-zero `error_rate`
/// without `inline`.
///
/// # Errors
///
/// Fails when a referenced file does not exist (reference, regions BED,
/// capture targets BED, mutations VCF) or when a numeric setting lies outside
/// its accepted range.
pub fn validate(config: &Config) -> Result<()> {
    // --- input files ---
    anyhow::ensure!(
        config.reference.exists(),
        "reference file not found: {}",
        config.reference.display()
    );
    if let Some(ref bed) = config.regions_bed {
        anyhow::ensure!(
            bed.exists(),
            "regions_bed file not found: {}",
            bed.display()
        );
    }

    // --- sample ---
    anyhow::ensure!(
        config.sample.coverage > 0.0,
        "coverage must be positive, got {}",
        config.sample.coverage
    );
    anyhow::ensure!(
        config.sample.read_length > 0,
        "read_length must be positive, got {}",
        config.sample.read_length
    );

    // --- fragment model ---
    anyhow::ensure!(
        config.fragment.mean > 0.0,
        "fragment mean must be positive, got {}",
        config.fragment.mean
    );
    anyhow::ensure!(
        config.fragment.sd > 0.0,
        "fragment.sd must be greater than zero, got {}",
        config.fragment.sd
    );
    if let Some(mono_sd) = config.fragment.mono_sd {
        anyhow::ensure!(
            mono_sd > 0.0,
            "fragment.mono_sd must be greater than zero, got {}",
            mono_sd
        );
    }
    if let Some(di_sd) = config.fragment.di_sd {
        anyhow::ensure!(
            di_sd > 0.0,
            "fragment.di_sd must be greater than zero, got {}",
            di_sd
        );
    }
    if let Some(ref lr) = config.fragment.long_read {
        anyhow::ensure!(
            lr.sd > 0,
            "fragment.long_read.sd must be greater than zero, got {}",
            lr.sd
        );
        // An inverted length window can never be satisfied by a sampled length.
        anyhow::ensure!(
            lr.min_len <= lr.max_len,
            "fragment.long_read.min_len ({}) must not exceed max_len ({})",
            lr.min_len,
            lr.max_len
        );
    }
    if let Some(ctdna_frac) = config.fragment.ctdna_fraction {
        anyhow::ensure!(
            (0.0..=1.0).contains(&ctdna_frac),
            "fragment.ctdna_fraction must be in [0.0, 1.0], got {}",
            ctdna_frac
        );
    }

    // --- tumour ---
    if let Some(tumour) = &config.tumour {
        anyhow::ensure!(
            (0.0..=1.0).contains(&tumour.purity),
            "tumour purity must be between 0.0 and 1.0, got {}",
            tumour.purity
        );
        for clone in &tumour.clones {
            anyhow::ensure!(
                (0.0..=1.0).contains(&clone.ccf),
                "clone {} CCF must be between 0.0 and 1.0, got {}",
                clone.id,
                clone.ccf
            );
        }
    }

    // --- random mutations ---
    if let Some(mutations) = &config.mutations {
        if let Some(random) = &mutations.random {
            anyhow::ensure!(random.count > 0, "random mutation count must be > 0");
            anyhow::ensure!(
                random.vaf_min < random.vaf_max,
                "vaf_min ({}) must be less than vaf_max ({})",
                random.vaf_min,
                random.vaf_max
            );
            // Together with vaf_min < vaf_max this keeps both bounds inside [0, 1].
            anyhow::ensure!(
                random.vaf_min >= 0.0 && random.vaf_max <= 1.0,
                "vaf_min ({}) and vaf_max ({}) must lie within [0.0, 1.0]",
                random.vaf_min,
                random.vaf_max
            );
            let total = random.snv_fraction + random.indel_fraction + random.mnv_fraction;
            anyhow::ensure!(
                (total - 1.0).abs() < 1e-6,
                "mutation type fractions must sum to 1.0, got {}",
                total
            );
            // The sum check alone is satisfied by e.g. 1.5 / -0.5 / 0.0.
            for (name, value) in [
                ("snv_fraction", random.snv_fraction),
                ("indel_fraction", random.indel_fraction),
                ("mnv_fraction", random.mnv_fraction),
            ] {
                anyhow::ensure!(
                    (0.0..=1.0).contains(&value),
                    "mutation type fraction {} must be in [0.0, 1.0], got {}",
                    name,
                    value
                );
            }
        }
    }

    // --- output ---
    anyhow::ensure!(
        config.output.mapq <= 254,
        "mapq must be 0-254 (255 is reserved in SAM spec)"
    );

    // Warning only: the configuration is still accepted.
    if config.sample.read_length > config.fragment.mean as usize {
        tracing::warn!(
            read_length = config.sample.read_length,
            fragment_mean = config.fragment.mean,
            "read_length exceeds fragment mean: reads will overlap, \
             producing artifactual paired-end overlap"
        );
    }

    // --- capture ---
    if let Some(ref capture) = config.capture {
        if let Some(ref bed) = capture.targets_bed {
            anyhow::ensure!(
                bed.exists(),
                "capture targets_bed file not found: {}",
                bed.display()
            );
        }
    }

    // --- UMI ---
    if let Some(umi) = &config.umi {
        anyhow::ensure!(umi.length > 0, "UMI length must be > 0");
        anyhow::ensure!(
            umi.length < config.sample.read_length,
            "UMI length ({}) must be less than read_length ({})",
            umi.length,
            config.sample.read_length
        );
        if umi.inline {
            // Inline UMIs share the read with the spacer, so both must fit.
            let spacer_len = umi.spacer.as_deref().unwrap_or("").len();
            anyhow::ensure!(
                umi.length + spacer_len < config.sample.read_length,
                "umi.length ({}) plus spacer length ({}) must be less than read_length ({})",
                umi.length,
                spacer_len,
                config.sample.read_length
            );
        }
        if let Some(ref spacer) = umi.spacer {
            anyhow::ensure!(
                spacer
                    .chars()
                    .all(|c| matches!(c, 'A' | 'C' | 'G' | 'T' | 'a' | 'c' | 'g' | 't')),
                "umi.spacer must contain only A, C, G, T bases"
            );
        }
        if let Some(rate) = umi.duplex_conversion_rate {
            anyhow::ensure!(
                (0.0..=1.0).contains(&rate),
                "umi.duplex_conversion_rate ({}) must be in [0.0, 1.0]",
                rate
            );
        }
        if let Some(rate) = umi.error_rate {
            anyhow::ensure!(
                (0.0..=1.0).contains(&rate),
                "umi.error_rate ({}) must be in [0.0, 1.0]",
                rate
            );
        }
        if umi.duplex_conversion_rate.is_some() && !umi.duplex {
            tracing::warn!(
                "umi.duplex_conversion_rate is set but umi.duplex is false; the value has no effect"
            );
        }
        if umi.error_rate.is_some_and(|r| r > 0.0) && !umi.inline {
            tracing::warn!(
                "umi.error_rate is set but umi.inline is false; UMI error injection has no effect"
            );
        }
    }

    // --- copy number ---
    if let Some(ref cn_regions) = config.copy_number {
        for cn in cn_regions {
            parse_region(&cn.region).with_context(|| {
                format!(
                    "invalid copy_number region '{}': must be chrom:start-end",
                    cn.region
                )
            })?;
        }
    }

    // --- mutations VCF ---
    if let Some(ref mutations) = config.mutations {
        if let Some(ref vcf_path) = mutations.vcf {
            anyhow::ensure!(
                vcf_path.exists(),
                "mutations vcf file not found: {}",
                vcf_path.display()
            );
        }
    }

    // --- explicit VAF list ---
    if let Some(vafs) = &config.vafs {
        for &vaf in vafs {
            anyhow::ensure!(
                vaf > 0.0 && vaf <= 1.0,
                "each VAF in `vafs` must be in (0.0, 1.0], got {}",
                vaf
            );
        }
    }
    Ok(())
}
/// Unit tests for config loading, presets, validation and placeholder
/// substitution.
///
/// YAML fixtures are written with explicit nesting: keys such as `directory`
/// must be indented beneath `output:`, otherwise `output` deserializes as
/// null and loading fails. Tests that call `validate` point `reference` at
/// `/dev/null` so the file-existence check passes on Unix-like systems.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Writes `content` to a fresh temporary file and returns its handle.
    fn write_yaml(content: &str) -> NamedTempFile {
        let mut f = NamedTempFile::new().unwrap();
        f.write_all(content.as_bytes()).unwrap();
        f
    }

    #[test]
    fn test_minimal_config_parse() {
        let yaml = r#"
reference: /tmp/ref.fa
output:
  directory: /tmp/out
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert_eq!(cfg.sample.read_length, 150);
        assert_eq!(cfg.sample.coverage, 30.0);
        assert_eq!(cfg.fragment.mean, 300.0);
    }

    #[test]
    fn test_full_config_parse() {
        let yaml = r#"
reference: /tmp/ref.fa
output:
  directory: /tmp/out
  fastq: true
  bam: true
  truth_vcf: true
sample:
  name: TUMOUR_01
  read_length: 100
  coverage: 60.0
fragment:
  model: normal
  mean: 250.0
  sd: 40.0
tumour:
  purity: 0.7
  ploidy: 2
  clones:
    - id: clone_a
      ccf: 1.0
    - id: clone_b
      ccf: 0.3
      parent: clone_a
mutations:
  random:
    count: 50
    vaf_min: 0.01
    vaf_max: 0.5
umi:
  length: 12
  duplex: true
  pcr_cycles: 8
artifacts:
  ffpe_damage_rate: 0.01
  duplicate_rate: 0.15
seed: 42
chromosomes:
  - chr22
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert_eq!(cfg.sample.name, "TUMOUR_01");
        assert_eq!(cfg.sample.read_length, 100);
        assert_eq!(cfg.tumour.as_ref().unwrap().purity, 0.7);
        assert_eq!(cfg.tumour.as_ref().unwrap().clones.len(), 2);
        assert_eq!(
            cfg.mutations
                .as_ref()
                .unwrap()
                .random
                .as_ref()
                .unwrap()
                .count,
            50
        );
        assert!(cfg.umi.as_ref().unwrap().duplex);
        assert_eq!(cfg.seed, Some(42));
    }

    #[test]
    fn test_validate_bad_purity() {
        let yaml = r#"
reference: /dev/null
output:
  directory: /tmp/out
tumour:
  purity: 1.5
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert!(validate(&cfg).is_err());
    }

    #[test]
    fn test_validate_bad_vaf_range() {
        let yaml = r#"
reference: /dev/null
output:
  directory: /tmp/out
mutations:
  random:
    count: 10
    vaf_min: 0.5
    vaf_max: 0.1
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert!(validate(&cfg).is_err());
    }

    #[test]
    fn test_validate_bad_type_fractions() {
        let yaml = r#"
reference: /dev/null
output:
  directory: /tmp/out
mutations:
  random:
    count: 10
    snv_fraction: 0.5
    indel_fraction: 0.5
    mnv_fraction: 0.5
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert!(validate(&cfg).is_err());
    }

    #[test]
    fn test_cfda_fragment_model() {
        let yaml = r#"
reference: /tmp/ref.fa
output:
  directory: /tmp/out
fragment:
  model: cfda
  mean: 167.0
  sd: 20.0
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert!(matches!(cfg.fragment.model, FragmentModel::Cfda));
    }

    #[test]
    fn test_preset_twist_umi_duplex_fills_defaults() {
        let yaml = r#"
reference: /tmp/ref.fa
output:
  directory: /tmp/out
preset: twist-umi-duplex
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        let umi = cfg.umi.expect("umi should be set by preset");
        assert_eq!(umi.length, 8);
        assert!(umi.duplex);
        assert!(!umi.inline);
        assert_eq!(cfg.fragment.mean, 200.0);
        assert_eq!(cfg.fragment.sd, 30.0);
        assert_eq!(cfg.quality.mean_quality, 37);
    }

    #[test]
    fn test_preset_illumina_wgs_no_umi() {
        let yaml = r#"
reference: /tmp/ref.fa
output:
  directory: /tmp/out
preset: illumina-wgs
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert!(
            cfg.umi.is_none(),
            "illumina-wgs should not create UMI config"
        );
        assert_eq!(cfg.fragment.mean, 300.0);
        assert_eq!(cfg.fragment.sd, 50.0);
        assert_eq!(cfg.quality.mean_quality, 36);
    }

    #[test]
    fn test_explicit_umi_length_beats_preset() {
        let yaml = r#"
reference: /tmp/ref.fa
output:
  directory: /tmp/out
preset: twist-umi-duplex
umi:
  length: 12
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        let umi = cfg.umi.expect("umi should be set");
        assert_eq!(umi.length, 12);
    }

    #[test]
    fn test_inline_umi_accepted() {
        let yaml = r#"
reference: /dev/null
output:
  directory: /tmp/out
sample:
  read_length: 150
umi:
  length: 5
  inline: true
  spacer: "AT"
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert!(
            validate(&cfg).is_ok(),
            "inline UMI mode should be accepted by validation"
        );
        let umi = cfg.umi.expect("umi should be set");
        assert!(umi.inline);
        assert_eq!(umi.spacer.as_deref(), Some("AT"));
    }

    #[test]
    fn test_ctdna_fraction_field_parses() {
        let yaml = r#"
reference: /dev/null
output:
  directory: /tmp/out
fragment:
  model: cfda
  mean: 167.0
  sd: 20.0
  ctdna_fraction: 0.03
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert_eq!(cfg.fragment.ctdna_fraction, Some(0.03));
    }

    #[test]
    fn test_ctdna_fraction_out_of_range_fails() {
        let yaml = r#"
reference: /dev/null
output:
  directory: /tmp/out
fragment:
  ctdna_fraction: 1.5
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert!(validate(&cfg).is_err());
    }

    #[test]
    fn test_mono_di_sd_fields_parse() {
        let yaml = r#"
reference: /dev/null
output:
  directory: /tmp/out
fragment:
  model: cfda
  mean: 167.0
  sd: 20.0
  mono_sd: 15.0
  di_sd: 25.0
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert_eq!(cfg.fragment.mono_sd, Some(15.0));
        assert_eq!(cfg.fragment.di_sd, Some(25.0));
    }

    #[test]
    fn test_substitute_vars_unknown_key_errors() {
        let vars = std::collections::HashMap::new();
        let result = substitute_vars("prefix_${unknown}", &vars);
        assert!(result.is_err(), "unknown key should return an error");
        let msg = result.unwrap_err().to_string();
        assert!(msg.contains("unknown"), "error should name the missing key");
    }

    #[test]
    fn test_substitute_vars_empty_value() {
        let mut vars = std::collections::HashMap::new();
        vars.insert("key".to_string(), String::new());
        let result = substitute_vars("prefix_${key}_suffix", &vars).unwrap();
        assert_eq!(result, "prefix__suffix");
    }

    #[test]
    fn test_substitute_vars_adjacent_placeholders() {
        let mut vars = std::collections::HashMap::new();
        vars.insert("a".to_string(), "hello".to_string());
        vars.insert("b".to_string(), "world".to_string());
        let result = substitute_vars("${a}${b}", &vars).unwrap();
        assert_eq!(result, "helloworld");
    }

    #[test]
    fn test_substitute_vars_unterminated_passes_through() {
        let vars = std::collections::HashMap::new();
        let result = substitute_vars("prefix_${key", &vars).unwrap();
        assert_eq!(
            result, "prefix_${key",
            "unterminated placeholder should pass through"
        );
    }

    #[test]
    fn test_substitute_vars_no_placeholders() {
        let vars = std::collections::HashMap::new();
        let result = substitute_vars("plain string", &vars).unwrap();
        assert_eq!(result, "plain string");
    }

    #[test]
    fn test_fragment_sd_zero_fails_validation() {
        let yaml = r#"
reference: /dev/null
output:
  directory: /tmp/out
fragment:
  sd: 0.0
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        let err = validate(&cfg).unwrap_err();
        assert!(
            err.to_string().contains("greater than zero"),
            "error should mention 'greater than zero', got: {}",
            err
        );
    }

    #[test]
    fn test_fragment_mono_sd_zero_fails_validation() {
        let yaml = r#"
reference: /dev/null
output:
  directory: /tmp/out
fragment:
  sd: 20.0
  mono_sd: 0.0
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert!(
            validate(&cfg).is_err(),
            "mono_sd: 0.0 should fail validation"
        );
    }

    #[test]
    fn test_fragment_di_sd_zero_fails_validation() {
        let yaml = r#"
reference: /dev/null
output:
  directory: /tmp/out
fragment:
  sd: 20.0
  di_sd: 0.0
"#;
        let f = write_yaml(yaml);
        let cfg = load(f.path()).unwrap();
        assert!(validate(&cfg).is_err(), "di_sd: 0.0 should fail validation");
    }
}