twitcher 0.1.8

Find template switch mutations in genomic data
use clap::Args;

#[derive(Debug, Clone, Args)]
pub struct ClusteringSettings {
    /// Minimum number of mutation events required to form a cluster.
    #[arg(long = "cluster-min-records", value_name = "NUM", default_value_t = 2)]
    pub min_records: usize,

    /// Maximum gap in bases between consecutive mutations that still allows them to be in the
    /// same cluster. A gap larger than this value will break the cluster.
    #[arg(long = "cluster-max-gap", value_name = "BASES", default_value_t = 20)]
    pub max_gap: usize,

    /// Minimum density score required for a cluster to be reported.
    /// Density is computed as the sum of variant complexity weights divided by the genomic span
    /// of the cluster. Higher values require more tightly packed or complex variants.
    #[arg(
        long = "cluster-min-density",
        value_name = "NUM",
        default_value_t = 0.2
    )]
    pub min_density: f64,
}

impl Default for ClusteringSettings {
    fn default() -> Self {
        Self {
            min_records: 2,
            max_gap: 20,
            min_density: 0.2,
        }
    }
}

impl ClusteringSettings {
    /// Returns true if the accumulated cluster statistics meet the configured thresholds.
    ///
    /// * `complexity` — sum of [`variant_complexity`] scores for all events in the cluster
    /// * `span`       — genomic span of the cluster in bases (inclusive)
    /// * `count`      — number of distinct mutation events
    pub fn is_valid_cluster(&self, complexity: f64, span: f64, count: usize) -> bool {
        if count < 1.max(self.min_records) {
            return false;
        }
        complexity / span >= self.min_density
    }
}

/// Complexity score for a single variant.
///
/// Both lengths use the **VCF anchor convention**: the anchor base shared by ref and alt is
/// included, so a single-base substitution is `(1, 1)`, an insertion of `n` bases is
/// `(1, n + 1)`, and a deletion of `n` bases is `(n + 1, 1)`.
///
/// Scores: SNP → `1.0`; indel of length `n` → `1.0 + log₂(n)`.
#[allow(clippy::cast_precision_loss)]
pub fn variant_complexity(ref_len: usize, alt_len: usize) -> f64 {
    match (ref_len, alt_len) {
        (1, 1) => 1.0,
        (n, m) => 1.0 + ((n.max(m) - 1) as f64).log2(),
    }
}

#[cfg(test)]
#[allow(clippy::float_cmp)]
mod tests {
    use super::*;

    #[test]
    fn snp_complexity_is_one() {
        assert_eq!(variant_complexity(1, 1), 1.0);
    }

    #[test]
    fn insertion_complexity() {
        // 1-base ins: ref="A", alt="AT"  → ref_len=1, alt_len=2  → 1 + log2(1) = 1.0
        assert_eq!(variant_complexity(1, 2), 1.0);
        // 2-base ins: alt_len=3  → 1 + log2(2) = 2.0
        assert_eq!(variant_complexity(1, 3), 2.0);
        // 4-base ins: alt_len=5  → 1 + log2(4) = 3.0
        assert_eq!(variant_complexity(1, 5), 3.0);
    }

    #[test]
    fn deletion_complexity_mirrors_insertion() {
        assert_eq!(variant_complexity(2, 1), variant_complexity(1, 2));
        assert_eq!(variant_complexity(3, 1), variant_complexity(1, 3));
    }

    #[test]
    fn valid_cluster_at_density_threshold() {
        let s = ClusteringSettings {
            min_records: 2,
            max_gap: 20,
            min_density: 0.2,
        };
        // complexity=2.0, span=10.0  → density=0.2, exactly at threshold
        assert!(s.is_valid_cluster(2.0, 10.0, 2));
    }

    #[test]
    fn cluster_rejected_below_density_threshold() {
        let s = ClusteringSettings {
            min_records: 2,
            max_gap: 20,
            min_density: 0.2,
        };
        // complexity=1.9, span=10.0  → density=0.19 < 0.2
        assert!(!s.is_valid_cluster(1.9, 10.0, 2));
    }

    #[test]
    fn cluster_rejected_below_min_records() {
        let s = ClusteringSettings {
            min_records: 3,
            max_gap: 20,
            min_density: 0.0,
        };
        assert!(!s.is_valid_cluster(10.0, 5.0, 2));
        assert!(s.is_valid_cluster(10.0, 5.0, 3));
    }
}