// Source: varforge 0.2.0 (docs.rs)

//! Learn error/quality profiles from real BAM files.
//!
//! Reads a BAM file (or a sampled subset) and produces a [`ProfileJson`] that
//! can be written to disk and consumed by [`crate::core::error_profile`].

use std::collections::HashMap;
use std::io::BufReader;
use std::path::Path;

use anyhow::{Context, Result};
use noodles_bam as bam;
use noodles_sam::{
    self as sam,
    alignment::{record::Sequence as _, record_buf::RecordBuf},
};

use crate::core::error_profile::{ContextEffectJson, ProfileJson, QualityDistributionJson};

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

/// Tunable knobs for a single profile-learning pass over a BAM.
#[derive(Debug, Clone)]
pub struct LearnerConfig {
    /// Upper bound on the number of individual reads (not read pairs) that
    /// are accumulated into the profile.
    pub sample_size: usize,
    /// Reads whose mapping quality is below this value are excluded.
    pub min_mapq: u8,
}

impl Default for LearnerConfig {
    /// Defaults to sampling at most one million reads with MAPQ of at least 20.
    fn default() -> Self {
        let sample_size = 1_000_000;
        let min_mapq = 20;
        LearnerConfig {
            sample_size,
            min_mapq,
        }
    }
}

// ---------------------------------------------------------------------------
// Summary statistics gathered from reads
// ---------------------------------------------------------------------------

/// Raw tallies collected while scanning reads; the input to profile building.
#[derive(Debug)]
pub struct LearnerStats {
    /// Number of reads that passed filtering and were accumulated.
    pub reads_examined: u64,
    /// Number of reads rejected by the skip filter (duplicates, secondary, etc.).
    pub reads_skipped: u64,
    /// R1 quality histograms, one per read position:
    /// `quality_counts_r1[pos][q]` is how often quality `q` was seen at `pos`.
    pub quality_counts_r1: Vec<HashMap<u8, u64>>,
    /// R2 quality histograms, laid out like `quality_counts_r1`.
    pub quality_counts_r2: Vec<HashMap<u8, u64>>,
    /// Histogram of insert sizes (template length → count).
    pub insert_size_counts: HashMap<i32, u64>,
    /// Substitution tallies keyed by "A>C"-style strings.
    pub substitution_counts: HashMap<String, u64>,
    /// GC bias table indexed by GC percent (0–100, hence 101 bins); each bin
    /// holds `(depth_sum, bin_count)`.
    pub gc_bias: Vec<(u64, u64)>,
    /// Mismatch tallies per tri-nucleotide context ("ACG" → mismatches).
    pub context_error_counts: HashMap<String, u64>,
    /// Observation tallies per tri-nucleotide context ("ACG" → total).
    pub context_total_counts: HashMap<String, u64>,
    /// Longest read length seen so far; the per-position vectors are sized to it.
    pub max_read_length: usize,
    /// Insertions per cycle (from CIGAR `I` operations, MAPQ ≥ 30 only).
    pub insertion_counts_by_cycle: Vec<u64>,
    /// Deletions per cycle (from CIGAR `D` operations, MAPQ ≥ 30 only).
    pub deletion_counts_by_cycle: Vec<u64>,
    /// Aligned bases per cycle; the denominator for per-cycle indel rates.
    pub bases_by_cycle: Vec<u64>,
}

impl LearnerStats {
    /// Create an empty accumulator whose per-position vectors all have
    /// `max_read_length` slots.
    fn new(max_read_length: usize) -> Self {
        let empty_histograms = || -> Vec<HashMap<u8, u64>> {
            (0..max_read_length).map(|_| HashMap::new()).collect()
        };
        let zeroed_cycles = || vec![0u64; max_read_length];

        Self {
            reads_examined: 0,
            reads_skipped: 0,
            quality_counts_r1: empty_histograms(),
            quality_counts_r2: empty_histograms(),
            insert_size_counts: HashMap::new(),
            substitution_counts: HashMap::new(),
            // One bin per integer GC percentage, 0 through 100 inclusive.
            gc_bias: vec![(0, 0); 101],
            context_error_counts: HashMap::new(),
            context_total_counts: HashMap::new(),
            max_read_length,
            insertion_counts_by_cycle: zeroed_cycles(),
            deletion_counts_by_cycle: zeroed_cycles(),
            bases_by_cycle: zeroed_cycles(),
        }
    }

    /// Extend the three per-cycle count vectors (insertions, deletions,
    /// aligned bases) to `new_len`, zero-filling the new slots.
    ///
    /// Does nothing when the insertion vector is already at least `new_len`
    /// long. The quality histograms and `max_read_length` are not touched;
    /// callers resize those themselves.
    fn grow_cycle_vecs(&mut self, new_len: usize) {
        if new_len <= self.insertion_counts_by_cycle.len() {
            return;
        }
        for counts in [
            &mut self.insertion_counts_by_cycle,
            &mut self.deletion_counts_by_cycle,
            &mut self.bases_by_cycle,
        ] {
            counts.resize(new_len, 0);
        }
    }
}

// ---------------------------------------------------------------------------
// Profile learner
// ---------------------------------------------------------------------------

/// Learns a sequencing error profile from a BAM file.
///
/// The learner itself is stateless apart from its [`LearnerConfig`]; all
/// per-run tallies live in a [`LearnerStats`] created for each learning pass.
pub struct ProfileLearner {
    /// Sampling limit and mapping-quality filter applied while scanning reads.
    config: LearnerConfig,
}

impl ProfileLearner {
    /// Create a new learner with the supplied configuration.
    pub fn new(config: LearnerConfig) -> Self {
        Self { config }
    }

    /// Read `bam_path` and return a [`ProfileJson`] derived from it.
    ///
    /// The BAM does **not** need to be indexed; we stream through it.
    pub fn learn_from_bam(&self, bam_path: &Path) -> Result<ProfileJson> {
        let file = std::fs::File::open(bam_path)
            .with_context(|| format!("cannot open BAM: {}", bam_path.display()))?;
        let mut reader = bam::io::Reader::new(BufReader::new(file));
        let header = reader.read_header().context("failed to read BAM header")?;

        // First pass: determine maximum read length from header or by peeking.
        // We'll use a default of 150 and grow as needed.
        let initial_read_length = read_length_from_header(&header).unwrap_or(150);

        let mut stats = LearnerStats::new(initial_read_length);
        let mut records_seen: u64 = 0;

        for result in reader.record_bufs(&header) {
            if records_seen >= self.config.sample_size as u64 {
                break;
            }

            let record = result.context("failed to read BAM record")?;

            if should_skip(&record, self.config.min_mapq) {
                stats.reads_skipped += 1;
                continue;
            }

            // Grow quality arrays if we hit a longer read.
            let rlen = record.sequence().len();
            if rlen > stats.max_read_length {
                let new_len = rlen;
                stats.quality_counts_r1.resize(new_len, HashMap::new());
                stats.quality_counts_r2.resize(new_len, HashMap::new());
                stats.max_read_length = new_len;
                stats.grow_cycle_vecs(new_len);
            }

            let is_r2 = record.flags().is_last_segment();

            // Quality scores.
            let quals: Vec<u8> = record.quality_scores().as_ref().to_vec();
            let target = if is_r2 {
                &mut stats.quality_counts_r2
            } else {
                &mut stats.quality_counts_r1
            };
            for (pos, &q) in quals.iter().enumerate() {
                if pos < target.len() {
                    *target[pos].entry(q).or_insert(0) += 1;
                }
            }

            // Insert size (only for first-in-pair to avoid double counting).
            if !is_r2 {
                let tlen = record.template_length();
                if tlen > 0 {
                    *stats.insert_size_counts.entry(tlen).or_insert(0) += 1;
                }
            }

            // GC bias from sequence.
            let seq_bytes: Vec<u8> = record.sequence().iter().collect();
            if !seq_bytes.is_empty() {
                let gc = gc_percent(&seq_bytes);
                let depth = 1u64; // each read contributes 1 unit of "depth"
                stats.gc_bias[gc].0 += depth;
                stats.gc_bias[gc].1 += 1;
            }

            // CIGAR-based per-cycle indel counts.
            // Only process reads with MAPQ ≥ 30 to reduce false positives from
            // misaligned reads carrying true variants. A VCF exclusion filter
            // is deferred.
            let mapq_ok = record
                .mapping_quality()
                .map(|m| u8::from(m) >= 30)
                .unwrap_or(false);
            if mapq_ok {
                accumulate_cigar_counts(
                    record.cigar().as_ref(),
                    &mut stats.insertion_counts_by_cycle,
                    &mut stats.deletion_counts_by_cycle,
                    &mut stats.bases_by_cycle,
                );
            }

            stats.reads_examined += 1;
            records_seen += 1;
        }

        self.build_profile(stats)
    }

    /// Build a [`ProfileJson`] from accumulated [`LearnerStats`].
    pub fn build_profile(&self, stats: LearnerStats) -> Result<ProfileJson> {
        anyhow::ensure!(
            stats.reads_examined > 0,
            "no usable reads found in BAM (all filtered or file empty)"
        );

        let read_length = stats.max_read_length;

        // R1 quality distribution.
        let read1 = build_quality_distribution(&stats.quality_counts_r1, read_length);
        // R2 quality distribution (if any R2 data present).
        let has_r2 = stats.quality_counts_r2.iter().any(|h| !h.is_empty());
        let read2 = if has_r2 {
            Some(build_quality_distribution(
                &stats.quality_counts_r2,
                read_length,
            ))
        } else {
            None
        };

        // Substitution matrix: convert raw counts to probabilities.
        let substitution_matrix = build_substitution_matrix(&stats.substitution_counts);

        // Context effects: flag contexts with substantially elevated error rates.
        let context_effects =
            build_context_effects(&stats.context_error_counts, &stats.context_total_counts);

        // Compute per-cycle and overall indel rates from CIGAR-derived counts.
        let total_insertions: u64 = stats.insertion_counts_by_cycle.iter().sum();
        let total_deletions: u64 = stats.deletion_counts_by_cycle.iter().sum();
        let total_bases: u64 = stats.bases_by_cycle.iter().sum();

        let (
            overall_insertion_rate,
            overall_deletion_rate,
            indel_insertion_fraction,
            per_cycle_insertion_rates,
            per_cycle_deletion_rates,
        ) = if total_bases > 0 {
            let ins_rate = total_insertions as f64 / total_bases as f64;
            let del_rate = total_deletions as f64 / total_bases as f64;
            let total_indels = total_insertions + total_deletions;
            let ins_fraction = if total_indels > 0 {
                total_insertions as f64 / total_indels as f64
            } else {
                0.5
            };
            let per_ins: Vec<f64> = stats
                .insertion_counts_by_cycle
                .iter()
                .zip(stats.bases_by_cycle.iter())
                .map(|(&i, &b)| if b > 0 { i as f64 / b as f64 } else { 0.0 })
                .collect();
            let per_del: Vec<f64> = stats
                .deletion_counts_by_cycle
                .iter()
                .zip(stats.bases_by_cycle.iter())
                .map(|(&d, &b)| if b > 0 { d as f64 / b as f64 } else { 0.0 })
                .collect();
            (
                Some(ins_rate),
                Some(del_rate),
                Some(ins_fraction),
                per_ins,
                per_del,
            )
        } else {
            (None, None, None, Vec::new(), Vec::new())
        };

        Ok(ProfileJson {
            platform: None,
            read_length,
            quality_distribution: QualityDistributionJson { read1, read2 },
            substitution_matrix,
            context_effects,
            overall_insertion_rate,
            overall_deletion_rate,
            indel_insertion_fraction,
            per_cycle_insertion_rates,
            per_cycle_deletion_rates,
        })
    }

    /// Learn a profile directly from in-memory [`RecordBuf`] records (useful for testing
    /// without writing a full BAM to disk).
    // Called only in tests; production entry point is learn_from_bam.
    #[cfg(test)]
    pub fn learn_from_records(
        &self,
        records: &[RecordBuf],
        read_length: usize,
    ) -> Result<ProfileJson> {
        let mut stats = LearnerStats::new(read_length);

        for record in records {
            if should_skip(record, self.config.min_mapq) {
                stats.reads_skipped += 1;
                continue;
            }

            if stats.reads_examined >= self.config.sample_size as u64 {
                break;
            }

            let rlen = record.sequence().len();
            if rlen > stats.max_read_length {
                stats.quality_counts_r1.resize(rlen, HashMap::new());
                stats.quality_counts_r2.resize(rlen, HashMap::new());
                stats.max_read_length = rlen;
                stats.grow_cycle_vecs(rlen);
            }

            let is_r2 = record.flags().is_last_segment();

            let quals: Vec<u8> = record.quality_scores().as_ref().to_vec();
            let target = if is_r2 {
                &mut stats.quality_counts_r2
            } else {
                &mut stats.quality_counts_r1
            };
            for (pos, &q) in quals.iter().enumerate() {
                if pos < target.len() {
                    *target[pos].entry(q).or_insert(0) += 1;
                }
            }

            if !is_r2 {
                let tlen = record.template_length();
                if tlen > 0 {
                    *stats.insert_size_counts.entry(tlen).or_insert(0) += 1;
                }
            }

            let seq_bytes: Vec<u8> = record.sequence().iter().collect();
            if !seq_bytes.is_empty() {
                let gc = gc_percent(&seq_bytes);
                stats.gc_bias[gc].0 += 1;
                stats.gc_bias[gc].1 += 1;
            }

            // CIGAR-based per-cycle indel counts (MAPQ ≥ 30 only).
            let mapq_ok = record
                .mapping_quality()
                .map(|m| u8::from(m) >= 30)
                .unwrap_or(false);
            if mapq_ok {
                accumulate_cigar_counts(
                    record.cigar().as_ref(),
                    &mut stats.insertion_counts_by_cycle,
                    &mut stats.deletion_counts_by_cycle,
                    &mut stats.bases_by_cycle,
                );
            }

            stats.reads_examined += 1;
        }

        self.build_profile(stats)
    }
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Walk a CIGAR and update per-cycle insertion, deletion, and base counts.
///
/// `read_pos` tracks the current position in the read (not the reference).
/// Only `M`/`=`/`X` operations contribute to `bases_by_cycle`. Soft-clipped
/// bases advance `read_pos` but are not counted as aligned bases or errors.
/// Hard clips, pads, and skips do not move `read_pos`.
///
/// Each insertion or deletion operation is counted once, at the read position
/// where it starts, regardless of its length. Positions at or beyond
/// `bases_by_cycle.len()` are ignored for all three slices. The slices are
/// expected to have equal length; if `insertion_counts` or `deletion_counts`
/// is shorter, events past its end are dropped rather than panicking.
///
/// MAPQ filtering must be applied by the caller; this function counts
/// unconditionally.
pub fn accumulate_cigar_counts(
    cigar: &[noodles_sam::alignment::record::cigar::Op],
    insertion_counts: &mut [u64],
    deletion_counts: &mut [u64],
    bases_by_cycle: &mut [u64],
) {
    use noodles_sam::alignment::record::cigar::op::Kind;

    let cap = bases_by_cycle.len();
    let mut read_pos: usize = 0;

    for op in cigar {
        let len = op.len();
        match op.kind() {
            Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
                // These operations consume both read and reference. Clamp the
                // covered range to the slice so out-of-range cycles are ignored.
                let start = read_pos.min(cap);
                let end = read_pos.saturating_add(len).min(cap);
                for slot in &mut bases_by_cycle[start..end] {
                    *slot += 1;
                }
                read_pos += len;
            }
            Kind::Insertion => {
                // An insertion at `read_pos`: the read has extra bases here.
                if read_pos < cap {
                    if let Some(slot) = insertion_counts.get_mut(read_pos) {
                        *slot += 1;
                    }
                }
                read_pos += len;
            }
            Kind::Deletion => {
                // A deletion at `read_pos`: the reference has bases the read
                // skips, so the read position does not advance.
                if read_pos < cap {
                    if let Some(slot) = deletion_counts.get_mut(read_pos) {
                        *slot += 1;
                    }
                }
            }
            Kind::SoftClip => {
                // Soft-clipped bases are present in the read but are not
                // counted as aligned bases or errors.
                read_pos += len;
            }
            Kind::HardClip | Kind::Pad | Kind::Skip => {
                // These do not consume read bases.
            }
        }
    }
}

/// Decide whether a read is excluded from learning.
///
/// A read is excluded when it is flagged unmapped, supplementary, secondary,
/// or duplicate, or when it carries a mapping quality below `min_mapq`.
/// Reads with no mapping quality are not excluded by the MAPQ test.
fn should_skip(record: &RecordBuf, min_mapq: u8) -> bool {
    let flags = record.flags();

    let excluded_by_flag = flags.is_unmapped()
        || flags.is_supplementary()
        || flags.is_secondary()
        || flags.is_duplicate();
    if excluded_by_flag {
        return true;
    }

    // Only a present mapping quality can fail the threshold.
    matches!(record.mapping_quality(), Some(mapq) if u8::from(mapq) < min_mapq)
}

/// Integer GC percentage (0–100, rounded down) of `seq`.
///
/// `G` and `C` are counted case-insensitively; every other byte counts only
/// toward the length. An empty sequence yields 0.
fn gc_percent(seq: &[u8]) -> usize {
    let total = seq.len();
    if total == 0 {
        return 0;
    }

    let mut gc = 0usize;
    for base in seq {
        if matches!(base.to_ascii_uppercase(), b'G' | b'C') {
            gc += 1;
        }
    }

    std::cmp::min(gc * 100 / total, 100)
}

/// Turn per-position quality histograms into `[[q, weight], ...]` lists.
///
/// The result always has exactly `read_length` positions: histograms beyond
/// `read_length` are ignored, and missing or empty positions are filled with a
/// single Q30 entry of weight 1. Within a position, entries are ordered by
/// quality descending and weights are counts divided by the position total.
fn build_quality_distribution(
    counts: &[HashMap<u8, u64>],
    read_length: usize,
) -> Vec<Vec<[f64; 2]>> {
    // Placeholder used wherever a position has no observations.
    let q30_only = || vec![[30.0, 1.0]];

    let mut dist: Vec<Vec<[f64; 2]>> = counts
        .iter()
        .take(read_length)
        .map(|histogram| {
            let total: u64 = histogram.values().sum();
            if total == 0 {
                return q30_only();
            }
            let denominator = total as f64;

            // Order on the integer quality first (highest first, for
            // readability), then convert to floating-point pairs.
            let mut buckets: Vec<(u8, u64)> =
                histogram.iter().map(|(&q, &n)| (q, n)).collect();
            buckets.sort_unstable_by(|a, b| b.0.cmp(&a.0));

            buckets
                .into_iter()
                .map(|(q, n)| [f64::from(q), n as f64 / denominator])
                .collect()
        })
        .collect();

    // Pad up to `read_length`; `take` above guarantees we never exceed it.
    dist.resize_with(read_length, q30_only);
    dist
}

/// Convert raw substitution counts to a probability map ("A>C" → f64).
///
/// Counts are normalised per source base, where the source base is the first
/// byte of the key: each entry becomes `count / sum of counts sharing that
/// first byte`. Entries with a zero count are omitted from the result, and
/// keys that are empty (and so have no source base) are ignored rather than
/// causing a panic.
fn build_substitution_matrix(counts: &HashMap<String, u64>) -> HashMap<String, f64> {
    // Marginal totals per source base.
    let mut from_totals: HashMap<u8, u64> = HashMap::new();
    for (key, &cnt) in counts {
        if let Some(&from) = key.as_bytes().first() {
            *from_totals.entry(from).or_insert(0) += cnt;
        }
    }

    counts
        .iter()
        .filter_map(|(key, &cnt)| {
            // Empty keys have no source base and no marginal; skip them.
            let from = key.as_bytes().first()?;
            let total = from_totals.get(from).copied()?;
            (cnt > 0 && total > 0).then(|| (key.clone(), cnt as f64 / total as f64))
        })
        .collect()
}

/// Derive per-context quality penalties from tri-nucleotide error tallies.
///
/// The global error rate is total errors over total observations. A context
/// with at least 100 observations whose own error rate is more than 5× the
/// global rate receives a penalty of `-10 * log10(global_rate / rate)`,
/// rounded and clamped to 1–20. Returns an empty map when either input is
/// empty or there are no observations at all.
fn build_context_effects(
    error_counts: &HashMap<String, u64>,
    total_counts: &HashMap<String, u64>,
) -> HashMap<String, ContextEffectJson> {
    if error_counts.is_empty() || total_counts.is_empty() {
        return HashMap::new();
    }

    let total_obs: u64 = total_counts.values().sum();
    if total_obs == 0 {
        return HashMap::new();
    }
    let total_errors: u64 = error_counts.values().sum();

    // Global average error rate and the elevation threshold derived from it.
    let global_rate = total_errors as f64 / total_obs as f64;
    let threshold = global_rate * 5.0;

    total_counts
        .iter()
        // Contexts with fewer than 100 observations carry too little data.
        .filter(|&(_, &obs)| obs >= 100)
        .filter_map(|(ctx, &obs)| {
            let errors = error_counts.get(ctx).copied().unwrap_or(0);
            let rate = errors as f64 / obs as f64;
            (rate > threshold).then(|| {
                // Phred-scaled excess over the global rate, kept within 1–20.
                let raw = (-10.0 * (global_rate / rate).log10()).round() as u8;
                let effect = ContextEffectJson {
                    quality_penalty: raw.clamp(1, 20),
                };
                (ctx.clone(), effect)
            })
        })
        .collect()
}

/// Placeholder for extracting a read length from the BAM header.
///
/// Some tools record the read length in `@PG` or `@CO` header lines, but that
/// is not relied upon: the header is currently ignored and the result is
/// always `None`, so callers fall back to their own default.
fn read_length_from_header(header: &sam::Header) -> Option<usize> {
    // Deliberately unused; kept in the signature for a future implementation.
    let _unused = header;
    None
}

// ---------------------------------------------------------------------------
// TSV serialisation / deserialisation
// ---------------------------------------------------------------------------

/// Write the R1 quality-by-cycle matrix of `profile` to a TSV file at `path`.
///
/// One row per read position (cycle), one column per raw Phred quality value
/// 0–93; cells are probability weights printed with six decimals.
/// The header row is: `cycle\tq0\tq1\t…\tq93`.
///
/// Only `quality_distribution.read1` is written. Entries whose quality falls
/// outside 0–93 are dropped, and if a cycle lists the same quality twice the
/// later weight wins.
///
/// # Errors
///
/// Fails if the file cannot be created or written.
pub fn write_quality_tsv(profile: &ProfileJson, path: &std::path::Path) -> anyhow::Result<()> {
    use std::io::Write as _;

    // Number of quality columns: Phred 0 through 93 inclusive.
    const QUALITY_COLUMNS: usize = 94;

    let file = std::fs::File::create(path)
        .with_context(|| format!("failed to create profile TSV: {}", path.display()))?;
    let mut out = std::io::BufWriter::new(file);

    // Header row.
    write!(out, "cycle")?;
    for q in 0..QUALITY_COLUMNS {
        write!(out, "\tq{}", q)?;
    }
    writeln!(out)?;

    for (cycle, entries) in profile.quality_distribution.read1.iter().enumerate() {
        // Expand the sparse `[quality, weight]` list into one dense row.
        let mut weights = [0.0f64; QUALITY_COLUMNS];
        for &[quality, weight] in entries {
            if let Some(slot) = weights.get_mut(quality as usize) {
                *slot = weight;
            }
        }

        write!(out, "{}", cycle)?;
        for weight in weights.iter() {
            write!(out, "\t{:.6}", weight)?;
        }
        writeln!(out)?;
    }

    out.flush()?;
    Ok(())
}

/// Load a quality-by-cycle TSV back into a per-cycle distribution.
///
/// Produces one `[quality, weight]` list per non-blank data row, in the same
/// shape as `ProfileJson.quality_distribution.read1`. The first line is
/// treated as a header and discarded; the first column of each row (the cycle
/// index) is ignored, and the remaining columns are taken as the weights for
/// q0, q1, … in order. Cells that do not parse as a number, or whose weight is
/// not greater than zero, are omitted. A row left with no entries becomes a
/// single Q30 entry of weight 1.
///
/// # Errors
///
/// Fails if the file cannot be opened or read, or if it has no lines at all.
// Called by load_quality_tsv; not yet wired into the production engine.
#[allow(dead_code)]
pub fn read_quality_tsv(path: &std::path::Path) -> anyhow::Result<Vec<Vec<[f64; 2]>>> {
    use std::io::BufRead as _;

    let file = std::fs::File::open(path)
        .with_context(|| format!("failed to open profile TSV: {}", path.display()))?;
    let mut rows = std::io::BufReader::new(file).lines();

    // Consume and discard the header row, surfacing any read error.
    let _header = rows
        .next()
        .ok_or_else(|| anyhow::anyhow!("profile TSV is empty"))??;

    let mut distribution = Vec::new();
    for row in rows {
        let row = row?;
        if row.trim().is_empty() {
            continue;
        }

        // Skip the cycle-index column; column `q` then holds the weight for quality `q`.
        let mut entries: Vec<[f64; 2]> = Vec::new();
        for (q, cell) in row.split('\t').skip(1).enumerate() {
            let weight = cell.trim().parse::<f64>().unwrap_or(0.0);
            if weight > 0.0 {
                entries.push([q as f64, weight]);
            }
        }

        if entries.is_empty() {
            // Nothing usable on this row: fall back to Q30.
            entries.push([30.0, 1.0]);
        }
        distribution.push(entries);
    }

    Ok(distribution)
}

// ---------------------------------------------------------------------------
// BAM builder helpers (used in tests)
// ---------------------------------------------------------------------------

/// Construct a paired-end [`RecordBuf`] with an all-match CIGAR for tests.
///
/// - `is_r2`: when true the record is flagged LAST_SEGMENT and
///   REVERSE_COMPLEMENTED; otherwise FIRST_SEGMENT and MATE_REVERSE_COMPLEMENTED
/// - `template_len`: SAM TLEN field (positive for R1, negative for R2 in practice)
/// - `pos`: 0-based alignment start; the mate is placed at `pos + seq.len()`
///
/// The CIGAR is always `<seq.len()>M`.
// Lives outside the test module so it can be called across test functions without
// visibility issues; not used in production code.
#[cfg(test)]
pub fn make_test_record(
    seq: &[u8],
    quals: &[u8],
    is_r2: bool,
    template_len: i32,
    mapq: u8,
    pos: usize,
    ref_id: usize,
) -> Result<RecordBuf> {
    use crate::io::bam::parse_cigar;
    use noodles_core::Position;
    use noodles_sam::alignment::{
        record::{Flags, MappingQuality},
        record_buf::{Cigar, QualityScores, Sequence},
    };

    // Every base is an aligned match.
    let cigar: Cigar = parse_cigar(&format!("{}M", seq.len()))?
        .into_iter()
        .collect();

    let segment_flags = if is_r2 {
        Flags::REVERSE_COMPLEMENTED | Flags::LAST_SEGMENT
    } else {
        Flags::MATE_REVERSE_COMPLEMENTED | Flags::FIRST_SEGMENT
    };
    let flags = Flags::SEGMENTED | Flags::PROPERLY_SEGMENTED | segment_flags;

    // `Position` is 1-based, hence the `+ 1`.
    let start = Position::new(pos + 1).ok_or_else(|| anyhow::anyhow!("invalid position"))?;
    let mate_start = Position::new(pos + seq.len() + 1)
        .ok_or_else(|| anyhow::anyhow!("invalid mate position"))?;
    let mapping_quality =
        MappingQuality::new(mapq).ok_or_else(|| anyhow::anyhow!("invalid mapq"))?;

    Ok(RecordBuf::builder()
        .set_flags(flags)
        .set_reference_sequence_id(ref_id)
        .set_alignment_start(start)
        .set_mapping_quality(mapping_quality)
        .set_cigar(cigar)
        .set_mate_reference_sequence_id(ref_id)
        .set_mate_alignment_start(mate_start)
        .set_template_length(template_len)
        .set_sequence(Sequence::from(seq))
        .set_quality_scores(QualityScores::from(quals.to_vec()))
        .build())
}

/// Construct a first-in-pair [`RecordBuf`] from a caller-supplied CIGAR string
/// (used in tests).
///
/// Unlike [`make_test_record`], the CIGAR is not derived from the sequence
/// length, which is what CIGAR-based indel-counting tests need. The template
/// length is set to `seq.len()` and the mate is placed at `pos + seq.len()`.
#[cfg(test)]
pub fn make_test_record_with_cigar(
    seq: &[u8],
    quals: &[u8],
    cigar_str: &str,
    mapq: u8,
    pos: usize,
    ref_id: usize,
) -> Result<RecordBuf> {
    use crate::io::bam::parse_cigar;
    use noodles_core::Position;
    use noodles_sam::alignment::{
        record::{Flags, MappingQuality},
        record_buf::{Cigar, QualityScores, Sequence},
    };

    let cigar: Cigar = parse_cigar(cigar_str)?.into_iter().collect();

    // Always an R1 record whose mate is on the reverse strand.
    let flags = Flags::SEGMENTED
        | Flags::PROPERLY_SEGMENTED
        | Flags::MATE_REVERSE_COMPLEMENTED
        | Flags::FIRST_SEGMENT;

    // `Position` is 1-based, hence the `+ 1`.
    let start = Position::new(pos + 1).ok_or_else(|| anyhow::anyhow!("invalid position"))?;
    let mate_start = Position::new(pos + seq.len() + 1)
        .ok_or_else(|| anyhow::anyhow!("invalid mate position"))?;
    let mapping_quality =
        MappingQuality::new(mapq).ok_or_else(|| anyhow::anyhow!("invalid mapq"))?;

    Ok(RecordBuf::builder()
        .set_flags(flags)
        .set_reference_sequence_id(ref_id)
        .set_alignment_start(start)
        .set_mapping_quality(mapping_quality)
        .set_cigar(cigar)
        .set_mate_reference_sequence_id(ref_id)
        .set_mate_alignment_start(mate_start)
        .set_template_length(seq.len() as i32)
        .set_sequence(Sequence::from(seq))
        .set_quality_scores(QualityScores::from(quals.to_vec()))
        .build())
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::error_profile::EmpiricalQualityModel;
    use crate::core::quality::QualityModel;
    use rand::rngs::StdRng;
    use rand::SeedableRng;

    /// Learner shared by the tests: samples up to 100 000 reads and uses a
    /// minimum MAPQ of zero so that no test read is rejected on mapping quality.
    fn learner() -> ProfileLearner {
        let config = LearnerConfig {
            sample_size: 100_000,
            min_mapq: 0,
        };
        ProfileLearner::new(config)
    }

    /// Build `n` synthetic R1 records that share an all-`A` sequence of length
    /// `read_len`, quality `q` at every base, and template length `tlen`.
    ///
    /// Record `i` is placed at offset `i * (read_len + 50)`.
    fn uniform_r1_records(n: usize, read_len: usize, q: u8, tlen: i32) -> Vec<RecordBuf> {
        let bases = vec![b'A'; read_len];
        let scores = vec![q; read_len];
        let stride = read_len + 50;

        let mut records = Vec::with_capacity(n);
        for idx in 0..n {
            let record = make_test_record(&bases, &scores, false, tlen, 60, idx * stride, 0)
                .expect("make_test_record should not fail");
            records.push(record);
        }
        records
    }

    // Test 1 – per-position quality distributions are recovered from uniform reads.
    #[test]
    fn test_quality_extraction() {
        // 200 R1 reads of length 10, every base at Q37.
        let reads = uniform_r1_records(200, 10, 37, 200);
        let profile = learner()
            .learn_from_records(&reads, 10)
            .expect("learn_from_records should succeed");

        let per_position = &profile.quality_distribution.read1;
        assert_eq!(per_position.len(), 10, "wrong number of positions");

        // With uniform input, each position must collapse to the single
        // bucket [37.0, 1.0] (quality, weight).
        for (pos, entries) in per_position.iter().enumerate() {
            assert_eq!(
                entries.len(),
                1,
                "pos {pos}: expected exactly 1 quality bucket"
            );

            let quality = entries[0][0];
            let weight = entries[0][1];
            assert!(
                (quality - 37.0).abs() < f64::EPSILON,
                "pos {pos}: quality should be 37"
            );
            assert!(
                (weight - 1.0).abs() < 1e-9,
                "pos {pos}: weight should be 1.0"
            );
        }
    }

    // Test 2 – fragment size (insert size) distribution.
    #[test]
    fn test_fragment_size_extraction() {
        let read_len = 10usize;
        let bases = vec![b'A'; read_len];
        let scores = vec![30u8; read_len];

        // R1 reads with template lengths 150 (×3), 200 (×2) and 250 (×1),
        // spaced 300 positions apart.
        let mut records: Vec<RecordBuf> = Vec::new();
        for (idx, tlen) in [150i32, 150, 150, 200, 200, 250].iter().copied().enumerate() {
            records.push(
                make_test_record(&bases, &scores, false, tlen, 60, idx * 300, 0)
                    .expect("make_test_record failed"),
            );
        }

        let profile_learner = learner();

        // Tally insert sizes by hand into a fresh LearnerStats: only positive
        // template lengths are counted, but every record counts as examined.
        let mut stats = LearnerStats::new(read_len);
        for record in &records {
            stats.reads_examined += 1;
            let tlen = record.template_length();
            if tlen <= 0 {
                continue;
            }
            *stats.insert_size_counts.entry(tlen).or_default() += 1;
        }

        for (tlen, expected) in [(150i32, 3u64), (200, 2), (250, 1)] {
            let observed = stats.insert_size_counts.get(&tlen).copied().unwrap_or(0);
            assert_eq!(observed, expected);
        }

        // The full learning pipeline must also accept these records.
        let profile = profile_learner
            .learn_from_records(&records, read_len)
            .expect("learn_from_records should succeed");
        assert!(!profile.quality_distribution.read1.is_empty());
    }

    // Test 3 – GC bias bins are populated for synthetic data.
    #[test]
    fn test_gc_bias_extraction() {
        let read_len = 10usize;
        let scores = vec![30u8; read_len];
        // All-A reads are 0 % GC; all-G reads are 100 % GC.
        let at_bases = vec![b'A'; read_len];
        let gc_bases = vec![b'G'; read_len];

        // Records 0–4 are all-A and records 5–9 are all-G, 300 positions apart.
        let records: Vec<RecordBuf> = (0..10usize)
            .map(|idx| {
                let bases = if idx < 5 { &at_bases } else { &gc_bases };
                make_test_record(bases, &scores, false, 200, 60, idx * 300, 0)
                    .expect("make_test_record failed")
            })
            .collect();

        // Bin every read by its GC percentage, bumping both tuple fields.
        let mut stats = LearnerStats::new(read_len);
        for record in &records {
            let bases: Vec<u8> = record.sequence().iter().collect();
            let bin = &mut stats.gc_bias[gc_percent(&bases)];
            bin.0 += 1;
            bin.1 += 1;
            stats.reads_examined += 1;
        }

        // Only the two extreme bins may hold reads, five each.
        assert_eq!(stats.gc_bias[0].1, 5, "expected 5 reads in GC=0 bin");
        assert_eq!(stats.gc_bias[100].1, 5, "expected 5 reads in GC=100 bin");
        for (pct, bin) in stats.gc_bias.iter().enumerate().skip(1).take(99) {
            assert_eq!(bin.1, 0, "unexpected reads in GC={pct} bin");
        }
    }

    // Test 4 – the learned profile serialises to the ProfileJson schema and reloads.
    #[test]
    fn test_output_format() {
        let reads = uniform_r1_records(50, 10, 35, 180);
        let profile = learner()
            .learn_from_records(&reads, 10)
            .expect("learn_from_records should succeed");

        // Structural checks on the in-memory profile.
        let read1_populated = !profile.quality_distribution.read1.is_empty();
        assert!(
            read1_populated,
            "read1 quality distribution must not be empty"
        );
        assert!(profile.read_length > 0, "read_length must be > 0");

        // The profile must serialise, and the JSON must expose the expected keys.
        let json = serde_json::to_string(&profile).expect("profile must serialize to JSON");
        let required_keys = [
            ("read1", "JSON must contain read1 key"),
            (
                "quality_distribution",
                "JSON must contain quality_distribution",
            ),
        ];
        for (key, message) in required_keys {
            assert!(json.contains(key), "{message}");
        }

        // The JSON must load back as an EmpiricalQualityModel with no platform set.
        let model = EmpiricalQualityModel::from_json_str(&json)
            .expect("serialized profile must be parseable by EmpiricalQualityModel");
        assert_eq!(model.platform, None);
    }

    // Test 5 – learning with a sample_size smaller than the input succeeds.
    #[test]
    fn test_sampling() {
        // 200 records are available but the learner is capped at 50.
        let records = uniform_r1_records(200, 10, 30, 150);
        let config = LearnerConfig {
            sample_size: 50,
            min_mapq: 0,
        };
        let small_learner = ProfileLearner::new(config);

        // ProfileJson does not expose how many reads were used, so the check
        // here is that learning succeeds and the quality table has one entry
        // per read position (10), not one per record (200).
        let profile = small_learner
            .learn_from_records(&records, 10)
            .expect("learn_from_records should succeed with sample_size limit");
        let positions = profile.quality_distribution.read1.len();
        assert_eq!(
            positions, 10,
            "quality distribution length should equal read_length"
        );

        // Mirror the cap by hand: tally only the first 50 records into a fresh
        // LearnerStats and confirm the examined-read counter matches.
        let mut stats = LearnerStats::new(10);
        for record in records.iter().take(50) {
            let quals: Vec<u8> = record.quality_scores().as_ref().to_vec();
            for (pos, q) in quals.into_iter().enumerate() {
                *stats.quality_counts_r1[pos].entry(q).or_default() += 1;
            }
            stats.reads_examined += 1;
        }
        assert_eq!(
            stats.reads_examined, 50,
            "should have examined exactly 50 reads"
        );
    }

    // Test 6 – round-trip: a profile learned from Q30 reads reloads into a model
    // that generates qualities averaging close to 30.
    #[test]
    fn test_round_trip() {
        // 500 reads of length 20 with Q30 at every position.
        let reads = uniform_r1_records(500, 20, 30, 200);
        let profile = learner()
            .learn_from_records(&reads, 20)
            .expect("learn_from_records should succeed");

        // Profile → JSON → EmpiricalQualityModel.
        let json = serde_json::to_string(&profile).expect("profile must serialize");
        let model =
            EmpiricalQualityModel::from_json_str(&json).expect("serialized profile must reload");

        // Draw 1 000 reads of 20 qualities with a fixed seed and sum every score.
        let mut rng = StdRng::seed_from_u64(42);
        let n_reads = 1_000usize;
        let total: u64 = (0..n_reads)
            .map(|_| {
                let generated = model.generate_qualities(20, &mut rng);
                generated.iter().map(|&q| q as u64).sum::<u64>()
            })
            .sum();

        let mean_q = total as f64 / (n_reads * 20) as f64;
        let deviation = (mean_q - 30.0).abs();
        assert!(
            deviation < 1.0,
            "mean quality {mean_q} should be close to 30"
        );
    }

    // Test 7 – CIGAR walking counts insertions, deletions, and aligned bases
    // per read position.
    //
    // CIGAR: 5M2I3M1D4M, MAPQ 40.
    //   Read bases consumed:      5 + 2 + 3 + 4 = 14.
    //   Reference bases consumed: 5 + 3 + 1 + 4 = 13.
    //
    // Expected walk over the operations:
    //   5M: bases[0..=4]   += 1, read_pos 0 → 5.
    //   2I: insertions[5]  += 1, read_pos 5 → 7.
    //   3M: bases[7..=9]   += 1, read_pos 7 → 10.
    //   1D: deletions[10]  += 1, read_pos stays 10.
    //   4M: bases[10..=13] += 1, read_pos 10 → 14.
    #[test]
    fn test_learner_stats_cigar_counts() {
        // 14 read bases, matching the read-consuming operations of the CIGAR.
        let read_bases = vec![b'A'; 14];
        let read_quals = vec![40u8; 14];
        let record =
            make_test_record_with_cigar(&read_bases, &read_quals, "5M2I3M1D4M", 40, 0, 0)
                .expect("make_test_record_with_cigar should not fail");

        let mut ins = vec![0u64; 14];
        let mut del = vec![0u64; 14];
        let mut bases = vec![0u64; 14];
        accumulate_cigar_counts(record.cigar().as_ref(), &mut ins, &mut del, &mut bases);

        // The insertion is recorded where the 2I begins; the deletion where
        // the read position sits after 5M + 2I + 3M.
        assert_eq!(ins[5], 1, "expected insertion at read_pos 5");
        assert_eq!(del[10], 1, "expected deletion at read_pos 10");

        // Every read position covered by an M operation is counted once.
        let aligned_runs = [
            (0usize..5, "from first 5M"),
            (7usize..10, "from 3M after 2I"),
            (10usize..14, "from 4M after 1D"),
        ];
        for (run, origin) in aligned_runs {
            for p in run {
                assert_eq!(bases[p], 1, "bases[{p}] should be 1 ({origin})");
            }
        }

        // Read positions inside the insertion are not aligned bases.
        for p in [5usize, 6] {
            assert_eq!(
                bases[p], 0,
                "bases[{p}] should be 0 (insertion, not aligned)"
            );
        }

        // 5 + 3 + 4 aligned bases in total.
        let total: u64 = bases.iter().sum();
        assert_eq!(total, 12, "total aligned bases should be 12");
    }

    // Test 8 – build_profile computes indel rates from known counts.
    //
    // Stats are filled by hand: 100 aligned bases at each of 10 cycles and
    // 2 insertions at cycle 3, so the expected overall insertion rate is
    // 2 / 1000 and the expected overall deletion rate is 0.
    #[test]
    fn test_build_profile_indel_rates() {
        let read_len = 10usize;
        let mut stats = LearnerStats::new(read_len);
        stats.reads_examined = 1;

        // 100 bases observed at every cycle, 2 insertions at cycle 3.
        stats.bases_by_cycle.iter_mut().for_each(|b| *b = 100);
        stats.insertion_counts_by_cycle[3] = 2;

        // build_profile needs quality data, so give every position one Q30
        // observation (an empty position would make
        // build_quality_distribution panic).
        for counts in stats.quality_counts_r1.iter_mut().take(read_len) {
            counts.insert(30, 1);
        }

        let learner = ProfileLearner::new(LearnerConfig {
            sample_size: 1,
            min_mapq: 0,
        });
        let profile = learner
            .build_profile(stats)
            .expect("build_profile should succeed");

        let total_bases = 100u64 * read_len as u64;
        let expected_ins_rate = 2.0 / total_bases as f64;
        let actual = profile
            .overall_insertion_rate
            .expect("overall_insertion_rate should be Some");
        assert!(
            (actual - expected_ins_rate).abs() < 1e-12,
            "overall_insertion_rate {actual} != expected {expected_ins_rate}"
        );

        // Deletion rate should be 0 (no deletions added). Compare the
        // magnitude so that a spurious negative rate fails the test too.
        let del_rate = profile
            .overall_deletion_rate
            .expect("overall_deletion_rate should be Some");
        assert!(
            del_rate.abs() < 1e-12,
            "overall_deletion_rate should be ~0, got {del_rate}"
        );
    }

    // Test 9 – insertion rate recovered from many identical CIGARs.
    //
    // 1000 synthetic records, each with CIGAR "10M1I10M":
    //   - 20 aligned bases per record (10M + 10M; the inserted base is not aligned).
    //   - 1 insertion per record at read_pos 10.
    // Expected overall insertion rate: 1 / 20 = 0.05.
    // The observed rate must fall within ±20 %, i.e. inside [0.04, 0.06].
    #[test]
    fn test_indel_rate_extraction_accuracy() {
        let n_records = 1000usize;
        // 10 matched + 1 inserted + 10 matched read bases.
        let read_len = 21usize;

        let read_bases = vec![b'A'; read_len];
        let read_quals = vec![40u8; read_len];

        let mut ins = vec![0u64; read_len];
        let mut del = vec![0u64; read_len];
        let mut bases = vec![0u64; read_len];

        // Records start 100 positions apart; all counts accumulate into the
        // same three per-cycle vectors.
        for start in (0..n_records).map(|i| i * 100) {
            let record =
                make_test_record_with_cigar(&read_bases, &read_quals, "10M1I10M", 40, start, 0)
                    .expect("make_test_record_with_cigar should not fail");
            accumulate_cigar_counts(record.cigar().as_ref(), &mut ins, &mut del, &mut bases);
        }

        let total_insertions: u64 = ins.iter().sum();
        let total_bases: u64 = bases.iter().sum();
        let observed_rate = total_insertions as f64 / total_bases as f64;

        let accepted = 0.04..=0.06;
        assert!(
            accepted.contains(&observed_rate),
            "expected insertion rate ~0.05, got {observed_rate:.4}"
        );
    }
}