// twitcher 0.3.3
//
// Find template switch mutations in genomic data
use bstr::ByteSlice;

use crate::common::twitcher_cmd;

mod common;

/// Extract a counter value printed by `--stat-output` / `-v` mode from captured stderr.
///
/// Scans `stderr` line by line for `{key}: {number}` and returns the number from the
/// first line on which the text following the first `{key}: ` starts with decimal
/// digits that parse as a `usize`. ANSI CSI escape sequences (e.g. colour codes such as
/// `ESC[1m`) are removed from each line before matching, so styled output like
/// `ESC[1mkey ESC[0m`-wrapped keys or coloured numbers is still recognised.
///
/// Returns `0` when no line yields a parsable value; a missing counter is therefore
/// indistinguishable from a counter whose value is zero.
fn parse_counter(stderr: &[u8], key: &str) -> usize {
    // Drop ANSI CSI sequences (`ESC [` + parameter/intermediate bytes + one final byte
    // in `@`..=`~`) and any lone ESC characters, keeping all other text untouched.
    fn strip_ansi(line: &str) -> String {
        let mut plain = String::with_capacity(line.len());
        let mut chars = line.chars().peekable();
        while let Some(c) = chars.next() {
            if c != '\u{1b}' {
                plain.push(c);
                continue;
            }
            if chars.next_if_eq(&'[').is_some() {
                // Consume up to and including the final byte of the sequence.
                for p in chars.by_ref() {
                    if ('@'..='~').contains(&p) {
                        break;
                    }
                }
            }
        }
        plain
    }

    let text = String::from_utf8_lossy(stderr);
    let needle = format!("{key}: ");
    text.lines()
        .find_map(|raw| {
            let line = strip_ansi(raw);
            // Only the first occurrence of the key on a line is considered.
            let start = line.find(&needle)? + needle.len();
            let digits: String = line[start..]
                .chars()
                .take_while(|c| c.is_ascii_digit())
                .collect();
            // Empty or overflowing digit runs fail to parse; move on to the next line.
            digits.parse::<usize>().ok()
        })
        .unwrap_or(0)
}

/// Path of the `.bam` fixture passed as the positional input after `reads` in these tests.
const BAM: &str = "tests/data/reads/test.bam";
/// Path of the `.fa.gz` fixture passed as the value of `--reference` in these tests.
const REF: &str = "tests/data/reads/test.fa.gz";

/// Count the data rows in the CSV captured on `output`'s stdout.
///
/// The first line is treated as the header and is not counted, so empty output and
/// header-only output both yield 0.
fn csv_data_rows(output: &std::process::Output) -> usize {
    // Skip the header line, then count whatever lines remain.
    output.stdout.as_bstr().lines().skip(1).count()
}

#[test]
fn test_reads_produces_ts_rows() {
    // Without extra flags, only clusters with a detected template switch are written.
    let output = twitcher_cmd(&["reads", BAM, "--reference", REF]);
    eprintln!("{}", output.stderr.as_bstr());
    assert!(output.status.success());

    let rows = csv_data_rows(&output);
    assert!(rows > 0, "expected at least one TS row in CSV output");

    // Sanity check on the data section (everything after the header): no blank lines.
    let mut data_lines = output.stdout.as_bstr().lines().skip(1);
    assert!(
        data_lines.all(|line| !line.is_empty()),
        "unexpected empty data line in CSV output"
    );
}

#[test]
fn test_reads_all_clusters_includes_non_ts() {
    // With --all-clusters every cluster is written, whether or not a TS was detected.
    // Fixture expectations: 8 rows in the TS-only default, 34 rows with all clusters.
    let args = ["reads", BAM, "--reference", REF, "--all-clusters"];
    let output = twitcher_cmd(&args);
    eprintln!("{}", output.stderr.as_bstr());
    assert!(output.status.success());

    let rows = csv_data_rows(&output);
    assert!(
        rows > 8,
        "--all-clusters should produce more rows than the TS-only default (8); got {rows}"
    );
}

#[test]
fn test_reads_n_flag_limits_processed_reads() {
    // `-n 5` limits processing to the first 5 reads. The unrestricted run on this
    // fixture yields 8 TS rows, so the limited run has to yield strictly fewer.
    let args = ["reads", BAM, "--reference", REF, "-n", "5"];
    let output = twitcher_cmd(&args);
    eprintln!("{}", output.stderr.as_bstr());
    assert!(output.status.success());

    let rows = csv_data_rows(&output);
    assert!(rows < 8, "-n 5 should produce fewer than 8 TS rows; got {rows}");
}

#[test]
fn test_reads_no_ts_suppresses_output() {
    // With TS detection switched off via --no-ts, the TS-only default output
    // has nothing to report beyond the header.
    let args = ["reads", BAM, "--reference", REF, "--no-ts"];
    let output = twitcher_cmd(&args);
    eprintln!("{}", output.stderr.as_bstr());
    assert!(output.status.success());

    let rows = csv_data_rows(&output);
    assert_eq!(rows, 0, "expected no TS rows when --no-ts is set");
}

#[test]
fn test_reads_fpa_detects_template_switches() {
    // Switching to the Four-Point-Aligner via --fpa must not lose all TS rows.
    let args = ["reads", BAM, "--reference", REF, "--fpa"];
    let output = twitcher_cmd(&args);
    eprintln!("{}", output.stderr.as_bstr());
    assert!(output.status.success());

    let rows = csv_data_rows(&output);
    assert!(rows > 0, "expected TS rows with --fpa; got {rows}");
}

#[test]
fn test_reads_output_file_flag() {
    // With -o the CSV goes to the given file rather than to stdout.
    let dir = tempfile::tempdir().unwrap();
    let csv_path = dir.path().join("out.csv");
    let csv_arg = csv_path.to_str().unwrap();

    let output = twitcher_cmd(&["reads", BAM, "--reference", REF, "-o", csv_arg]);
    eprintln!("{}", output.stderr.as_bstr());
    assert!(output.status.success());
    assert!(csv_path.exists(), "output CSV file was not created");

    // The file must hold a non-empty header line followed by at least one data row.
    let content = std::fs::read_to_string(&csv_path).unwrap();
    let mut rows = content.lines();
    match rows.next() {
        Some(header) => assert!(!header.is_empty()),
        None => panic!("CSV must have a header row"),
    }
    assert!(rows.next().is_some(), "expected at least one data row");
}

#[test]
fn test_db_retries_oom_on_larger_memory() {
    let dir = tempfile::tempdir().unwrap();
    let db_path = dir.path().join("cache.db");
    let db_str = db_path.to_str().unwrap();

    // All four runs share every argument except the memory limit passed to `-m`.
    let run_with_memory = |memory_limit: &str| {
        twitcher_cmd(&[
            "reads",
            BAM,
            "--reference",
            REF,
            "-v",
            "-m",
            memory_limit,
            "--threads",
            "12",
            "-n",
            "10",
            "--db",
            db_str,
        ])
    };

    // Run 1: with a tiny memory limit the alignments hit OOM and the failures land in the DB.
    let run1 = run_with_memory("1mb");
    let run1_err = String::from_utf8_lossy(&run1.stderr);
    eprintln!("run1 stderr:\n{run1_err}");
    assert!(run1.status.success());
    let computations1 = parse_counter(&run1.stderr, "alignments.computations");
    let oom1 = parse_counter(&run1.stderr, "alignments.results.failed.oom");
    assert!(computations1 > 0, "run 1 should compute alignments");
    assert!(
        oom1 > 0,
        "run 1 should produce OOM failures with tiny memory; computations={computations1}"
    );

    // Run 2: same settings again, so everything comes from the DB and nothing is recomputed.
    let run2 = run_with_memory("1mb");
    let run2_err = String::from_utf8_lossy(&run2.stderr);
    eprintln!("run2 stderr:\n{run2_err}");
    assert!(run2.status.success());
    let computations2 = parse_counter(&run2.stderr, "alignments.computations");
    let from_db2 = parse_counter(&run2.stderr, "alignments.from_db");
    assert_eq!(
        computations2, 0,
        "run 2 (same memory) should not recompute anything; from_db={from_db2}"
    );
    assert!(from_db2 > 0, "run 2 should serve results from DB");

    // Run 3: with more memory the cached OOM failures must be retried, not served.
    let run3 = run_with_memory("1gb");
    let run3_err = String::from_utf8_lossy(&run3.stderr);
    eprintln!("run3 stderr:\n{run3_err}");
    assert!(run3.status.success());
    let computations3 = parse_counter(&run3.stderr, "alignments.computations");
    assert!(
        computations3 > 0,
        "run 3 (larger memory) should retry cached OOM failures; from_db2={from_db2}"
    );

    // Run 4: same settings as run 3; the retried results are cached now, so no recomputation.
    let run4 = run_with_memory("1gb");
    let run4_err = String::from_utf8_lossy(&run4.stderr);
    eprintln!("run4 stderr:\n{run4_err}");
    assert!(run4.status.success());
    let computations4 = parse_counter(&run4.stderr, "alignments.computations");
    let from_db4 = parse_counter(&run4.stderr, "alignments.from_db");
    assert_eq!(
        computations4, 0,
        "run 4 should serve everything from DB after retry; from_db={from_db4}"
    );
    assert!(from_db4 > 0, "run 4 should find retried results in DB");
}

#[test]
fn test_output_failed_alignments() {
    let dir = tempfile::tempdir().unwrap();
    let failed_path = dir.path().join("failed.txt");
    let failed_arg = failed_path.to_str().unwrap();

    // Run 1: a tiny memory limit provokes OOM failures, which must be written to the
    // file given to --output-failed-alignments.
    let run1 = twitcher_cmd(&[
        "reads",
        BAM,
        "--reference",
        REF,
        "-v",
        "-m",
        "1mb",
        "--threads",
        "12",
        "-n",
        "10",
        "--output-failed-alignments",
        failed_arg,
    ]);
    eprintln!("{}", run1.stderr.as_bstr());
    assert!(run1.status.success());
    assert!(failed_path.exists(), "failed-alignments file was not created");

    // The file must be non-empty and every line must at least contain ':' and '-'.
    let content = std::fs::read_to_string(&failed_path).unwrap();
    assert!(
        content.lines().next().is_some(),
        "expected at least one failed region"
    );
    for line in content.lines() {
        assert!(
            line.contains(':') && line.contains('-'),
            "line does not look like a region: {line:?}"
        );
    }

    // Run 2: pass the failed regions back in through -T, keeping the same memory
    // limit, and collect this run's failures in a second file.
    let failed_path2 = dir.path().join("failed2.txt");
    let failed_arg2 = failed_path2.to_str().unwrap();
    let run2 = twitcher_cmd(&[
        "reads",
        BAM,
        "--reference",
        REF,
        "-v",
        "-m",
        "1mb",
        "--threads",
        "12",
        "-T",
        failed_arg,
        "--output-failed-alignments",
        failed_arg2,
    ]);
    eprintln!("{}", run2.stderr.as_bstr());
    assert!(run2.status.success());

    // The regions must be attempted again and must hit OOM again.
    let computations2 = parse_counter(&run2.stderr, "alignments.computations");
    let oom2 = parse_counter(&run2.stderr, "alignments.results.failed.oom");
    assert!(
        computations2 > 0,
        "run 2 should re-attempt the failed regions; got 0 computations"
    );
    assert!(
        oom2 > 0,
        "run 2 should fail again with the same memory limit; got 0 OOM failures"
    );

    // Each failed alignment, fresh or cached, gets one line in the file. At 1 mb all
    // alignments OOM, so the number of lines has to match the OOM counter.
    let content2 = std::fs::read_to_string(&failed_path2).unwrap();
    let failed_count2 = content2.lines().count();
    assert_eq!(
        failed_count2, oom2,
        "failed-alignments file line count ({failed_count2}) must equal total OOM counter ({oom2})"
    );
}