//! twitcher 0.2.2
//!
//! Find template switch mutations in genomic data
use assert_cmd::cargo::cargo_bin_cmd;
use bstr::ByteSlice;

use crate::common::{twitcher_cmd, twitcher_with_ref};

mod common;

/// Runs twitcher on `tests/data/{file}` with the given `padding` and asserts that at least one
/// non-header output record contains the `CIGARETS` marker.
///
/// The run always uses `--cluster-min-records 1`, `-vvv` and the cost file
/// `tests/old_test_costs.tsa`. Every non-header record is echoed to stdout.
///
/// # Panics
///
/// Panics if the command does not exit successfully or if no record contains `CIGARETS`.
fn test_has_ts(file: &str, padding: u32) {
    let cmd = twitcher_with_ref(&format!(
        "tests/data/{file} --padding {padding} --cluster-min-records 1 -vvv --costs tests/old_test_costs.tsa"
    ));
    // Print the tool's log before asserting: checking the exit status first would drop the
    // `-vvv` output in exactly the case where it is needed.
    eprintln!("{}", cmd.stderr.as_bstr());
    assert!(
        cmd.status.success(),
        "twitcher exited with a failure status for {file}"
    );
    // Records are the output lines that do not start with '#'.
    let records: Vec<_> = cmd
        .stdout
        .as_bstr()
        .lines()
        .filter(|l| !l.starts_with(b"#"))
        .collect();
    for r in &records {
        println!("{}", r.as_bstr());
    }
    let has_ts = records.iter().any(|r| r.contains_str(b"CIGARETS"));
    assert!(has_ts, "expected CIGARETS in output for {file}");
}

/// `single_mnv.vcf` must produce a `CIGARETS` record when run with a padding of 10.
#[test]
fn test_single_mnv() {
    let fixture = "single_mnv.vcf";
    let padding = 10;
    test_has_ts(fixture, padding);
}

/// `single_insertion.vcf` must produce a `CIGARETS` record when run with a padding of 20.
#[test]
fn test_single_insertion() {
    let fixture = "single_insertion.vcf";
    let padding = 20;
    test_has_ts(fixture, padding);
}

/// Checks that the records emitted for `sorted_output.vcf.gz` all share one chromosome and
/// appear in non-decreasing position order.
///
/// The first tab-separated column of each non-header line is taken as the chromosome and the
/// second as the position.
#[test]
fn test_output_sorted() {
    let cmd = twitcher_with_ref("tests/data/sorted_output.vcf.gz --padding 20");
    // Show the tool's log before asserting so a failing run can be diagnosed.
    eprintln!("{}", cmd.stderr.as_bstr());
    assert!(cmd.status.success(), "twitcher exited with a failure status");
    let mut last_chr = None;
    // Explicit type instead of relying on the integer fallback to `i32`.
    let mut last_pos: u64 = 0;
    for record in cmd
        .stdout
        .as_bstr()
        .lines()
        .filter(|l| !l.starts_with(b"#"))
    {
        let mut tokens = record.as_bstr().split_str(b"\t");
        let chr = tokens.next().expect("record has no chromosome column");
        assert!(
            last_chr.is_none_or(|last| last == chr),
            "the chr is changing for an example where it shouldn't",
        );
        last_chr = Some(chr);
        let pos: u64 = tokens
            .next()
            .expect("record has no position column")
            .to_str()
            .expect("position column is not valid UTF-8")
            .parse()
            .expect("position column is not an unsigned integer");
        assert!(pos >= last_pos, "The output is not sorted");
        last_pos = pos;
    }
}

/// Runs `twitcher vcf` on `./test_files/test.vcf` with `--output-regions` pointing at
/// `filename` inside a fresh temporary directory, then asserts that the written file's
/// content equals `expected`.
///
/// # Panics
///
/// Panics if the temporary directory cannot be created, the command fails, the regions file
/// cannot be read, or its content differs from `expected`.
fn test_region_output(filename: &str, expected: &str) {
    let dir = tempfile::tempdir().unwrap();
    // Build the path with `join` rather than string formatting: this uses the platform
    // separator and does not require the temp directory path to be valid UTF-8.
    let regions = dir.path().join(filename);
    let mut cmd = cargo_bin_cmd!("twitcher");
    // The argument order matches the single-array form; the path is passed as an OS string.
    cmd.args([
        "vcf",
        "./test_files/test.vcf",
        "--reference",
        "./test_files/test.fa",
        "--output-regions",
    ])
    .arg(&regions)
    .args([
        "--cluster-min-records",
        "1",
        "--costs",
        "./tests/old_test_costs.tsa",
    ]);
    let res = cmd.unwrap();
    eprintln!("{}", res.stderr.as_bstr());
    assert!(res.status.success());
    let output = std::fs::read_to_string(&regions).unwrap();
    assert_eq!(output, expected);
}

/// Regions written to a file that does not end in `.bed` are expected to start at 101 and 201.
#[test]
fn test_region_output_tab() {
    let expected = "chrZ\t101\t108\nchrZ\t201\t207\n";
    test_region_output("regions.some.ext", expected);
}

/// Regions written to a `.bed` file are expected to start at 100 and 200, one lower than for
/// the non-`.bed` name (presumably BED's 0-based starts — confirm against the writer).
#[test]
fn test_region_output_bed() {
    let expected = "chrZ\t100\t108\nchrZ\t200\t207\n";
    test_region_output("regions.bed", expected);
}

// The tests below share ts_chr1_4304586.vcf — a real DRAGEN WGS cluster at chr1:4304586
// (a tandem-repeat deletion) that produces a reverse template switch with the current default
// costs. Padding is 30 (the TS jump is -22 bp), which still fits within the debug binary's
// memory.

/// Argument string for the shared fixture: the input VCF plus `--cluster-min-records 1` and
/// `--padding 30`. Tests hand it to `twitcher_with_ref`, optionally with further flags
/// appended after a space.
const TS_FIXTURE: &str =
    "tests/data/ts_chr1_4304586.vcf --cluster-min-records 1 --padding 30";

/// With no extra flags, the shared fixture must yield at least one `CIGARETS` record.
#[test]
fn test_ts_detection_default_costs() {
    let run = twitcher_with_ref(TS_FIXTURE);
    eprintln!("{}", run.stderr.as_bstr());
    assert!(run.status.success());
    // A record is any output line that does not start with '#'.
    let found_ts = run
        .stdout
        .as_bstr()
        .lines()
        .any(|line| !line.starts_with(b"#") && line.contains_str(b"CIGARETS"));
    assert!(found_ts, "expected CIGARETS in output with default costs");
}

/// With `--no-ts`, no non-header output line of the shared fixture may contain `CIGARETS`.
#[test]
fn test_no_ts_flag_suppresses_template_switches() {
    let args = format!("{TS_FIXTURE} --no-ts");
    let run = twitcher_with_ref(&args);
    eprintln!("{}", run.stderr.as_bstr());
    assert!(run.status.success());
    // Every line must be either a '#' header or free of the marker.
    let ts_free = run
        .stdout
        .as_bstr()
        .lines()
        .all(|line| line.starts_with(b"#") || !line.contains_str(b"CIGARETS"));
    assert!(ts_free, "expected no CIGARETS when --no-ts is set");
}

/// With `--only-realigned`, the shared fixture must emit at least one record and every emitted
/// record must contain `CIGARETS`.
#[test]
fn test_only_realigned_outputs_ts_records_only() {
    let args = format!("{TS_FIXTURE} --only-realigned");
    let run = twitcher_with_ref(&args);
    eprintln!("{}", run.stderr.as_bstr());
    assert!(run.status.success());
    // Peek at the first record to check for non-emptiness without collecting.
    let mut records = run
        .stdout
        .as_bstr()
        .lines()
        .filter(|l| !l.starts_with(b"#"))
        .peekable();
    assert!(records.peek().is_some(), "expected at least one output record");
    for record in records {
        assert!(
            record.contains_str(b"CIGARETS"),
            "non-CIGARETS record emitted with --only-realigned: {}",
            record.as_bstr()
        );
    }
}

/// With `--csv <path>`, the run must create the file with a non-empty first line and at least
/// one further line.
#[test]
fn test_csv_output_created() {
    let tmp = tempfile::tempdir().unwrap();
    let csv_path = tmp.path().join("output.csv");
    let args = format!("{TS_FIXTURE} --csv {}", csv_path.to_str().unwrap());
    let run = twitcher_with_ref(&args);
    eprintln!("{}", run.stderr.as_bstr());
    assert!(run.status.success());
    assert!(csv_path.exists(), "CSV file was not created");
    let content = std::fs::read_to_string(&csv_path).unwrap();
    let rows: Vec<&str> = content.lines().collect();
    // Row 0 is the header; anything after it counts as data.
    let header = rows.first().expect("CSV must have a header row");
    assert!(!header.is_empty(), "CSV header must not be empty");
    assert!(
        rows.get(1).is_some(),
        "CSV must have at least one data row for the detected TS"
    );
}

/// With `--fpa`, the shared fixture must still yield at least one `CIGARETS` record.
#[test]
fn test_fpa_detects_template_switches() {
    // --fpa selects the Four-Point-Aligner in place of the default aligner. It is faster
    // but more limited; it is still required to report CIGARETS for this fixture.
    let args = format!("{TS_FIXTURE} --fpa");
    let run = twitcher_with_ref(&args);
    eprintln!("{}", run.stderr.as_bstr());
    assert!(run.status.success());
    let ts_records = run
        .stdout
        .as_bstr()
        .lines()
        .filter(|l| !l.starts_with(b"#"))
        .filter(|l| l.contains_str(b"CIGARETS"))
        .count();
    assert!(ts_records > 0, "expected CIGARETS in output with --fpa");
}

/// Regression test: twitcher must exit successfully on the chr4 reproducer instead of
/// panicking.
#[test]
fn test_no_panic_on_null_leading_match() {
    // Regression: leading_matches was unwrap()ed and could be None, causing a panic.
    // crash_chr4_leading_match.vcf.gz is the reproducer from chr4 that triggered it.
    // https://version.helsinki.fi/kraujasp/twitcher/-/work_items/58
    let cmd = twitcher_cmd(&[
        "vcf",
        "tests/data/crash_chr4_leading_match.vcf.gz",
        "--reference",
        "tests/data/hg38.chr4_part.fa.gz",
        "--threads",
        "1",
        "-vvv",
    ]);
    // The run asks for `-vvv`; print that log so a regression can be diagnosed from the
    // test output rather than being discarded.
    eprintln!("{}", cmd.stderr.as_bstr());
    assert!(
        cmd.status.success(),
        "twitcher failed on the chr4 leading-match reproducer"
    );
}

/// Restricting the shared fixture to `chr1:1-10` via `-t` must leave only header lines.
#[test]
fn test_target_region_empty_range_produces_no_records() {
    // The fixture has no variants inside chr1:1-10, so no record should be written.
    let args = format!("{TS_FIXTURE} -t chr1:1-10");
    let run = twitcher_with_ref(&args);
    eprintln!("{}", run.stderr.as_bstr());
    assert!(run.status.success());
    let mut data_count = 0usize;
    for line in run.stdout.as_bstr().lines() {
        if !line.starts_with(b"#") {
            data_count += 1;
        }
    }
    assert_eq!(data_count, 0, "target region chr1:1-10 should yield no records");
}