// ska 0.5.1
//
// Split k-mer analysis
// Documentation
use snapbox::cmd::{self, Command};

use hashbrown::HashSet;

pub mod common;
use crate::common::{var_hash, TestDir, TestSetup};

#[cfg(test)]
use pretty_assertions::assert_eq;

/// Builds an `.skf` from the two `N_test` inputs, then checks that the stdout
/// of `ska align` equals the stored `align_N.stdout` file.
#[test]
fn align_n() {
    let sandbox = TestSetup::setup();
    // Each call yields a fresh `ska` command run from the sandbox directory
    let ska = || Command::new(cmd::cargo_bin!("ska")).current_dir(sandbox.get_wd());

    // Tests that N and n are skipped, so second variant doesn't appear in align
    let first_input = sandbox.file_string("N_test_1.fa", TestDir::Input);
    let second_input = sandbox.file_string("N_test_2.fa", TestDir::Input);
    ska()
        .arg("build")
        .arg(first_input)
        .arg(second_input)
        // Output name already ends in .skf: test no .skf.skf extension
        .args(["-o", "N_test.skf"])
        .assert()
        .success();

    let expected_stdout = sandbox.file_string("align_N.stdout", TestDir::Correct);
    ska()
        .args(["align", "N_test.skf"])
        .assert()
        .stdout_eq_path(expected_stdout);
}

/// Builds a k=11 `.skf` from the two `N_test` inputs, then checks that the
/// stdout of `ska map` against `test_ref.fa` equals the stored `map_N.stdout`.
#[test]
fn map_n() {
    let sandbox = TestSetup::setup();
    // Each call yields a fresh `ska` command run from the sandbox directory
    let ska = || Command::new(cmd::cargo_bin!("ska")).current_dir(sandbox.get_wd());

    // Tests that N and n are skipped, so second variant doesn't appear in map
    let first_input = sandbox.file_string("N_test_1.fa", TestDir::Input);
    let second_input = sandbox.file_string("N_test_2.fa", TestDir::Input);
    ska()
        .arg("build")
        .arg(first_input)
        .arg(second_input)
        .args(["-k", "11", "-o", "N_test"])
        .assert()
        .success();

    let reference = sandbox.file_string("test_ref.fa", TestDir::Input);
    let expected_stdout = sandbox.file_string("map_N.stdout", TestDir::Correct);
    ska()
        .arg("map")
        .arg(reference)
        .arg("N_test.skf")
        .assert()
        .stdout_eq_path(expected_stdout);
}

/// Compares the variants reported by `ska align` for a forward input and a
/// reverse-complemented input, with and without `--single-strand`.
///
/// Steps, in order (each k=15 build writes to the same `fwd_build` prefix):
/// 1. `test_1.fa` + `test_2.fa`, default strands.
/// 2. `test_1.fa` + `test_2_rc.fa`, default strands: variant set must equal
///    that of step 1.
/// 3. `test_1.fa` + `test_2_rc.fa`, `--single-strand`: variant set must be
///    empty.
/// 4. `test_1.fa` + `test_2.fa`, k=33 and `--single-strand`: variant set must
///    equal a fixed expected set.
#[test]
fn rev_comp() {
    let sandbox = TestSetup::setup();

    // Each call yields a fresh `ska` command run from the sandbox directory
    let ska = || Command::new(cmd::cargo_bin!("ska")).current_dir(sandbox.get_wd());

    // Runs `ska align <skf>` and returns the raw stdout bytes.
    // NOTE(review): the exit status of align is not checked here, matching the
    // previous behaviour; only a failure to spawn the process panics.
    let align_stdout = |skf: &str| {
        ska()
            .arg("align")
            .arg(skf)
            .output()
            .expect("failed to run `ska align`")
            .stdout
    };

    // Builds test_1.fa plus `second_input` at k=15 into the `fwd_build` prefix;
    // `trailing_args` are appended after the input files.
    let build_fwd = |second_input: &str, trailing_args: &[&str]| {
        ska()
            .args(["build", "-o", "fwd_build", "-k", "15"])
            .arg(sandbox.file_string("test_1.fa", TestDir::Input))
            .arg(sandbox.file_string(second_input, TestDir::Input))
            .args(trailing_args)
            .assert()
            .success();
    };

    // Test an RC sequence gives same alignment out
    build_fwd("test_2.fa", &[]);
    let no_rc_aln = align_stdout("fwd_build.skf");

    build_fwd("test_2_rc.fa", &[]);
    let rc_aln = align_stdout("fwd_build.skf");

    assert_eq!(var_hash(&no_rc_aln), var_hash(&rc_aln));

    // Now with rc off
    build_fwd("test_2_rc.fa", &["--single-strand"]);
    let ss_aln = align_stdout("fwd_build.skf");

    assert!(
        var_hash(&ss_aln).is_empty(),
        "expected no variants from the --single-strand build with test_2_rc.fa"
    );

    // Forward inputs at k=33 with rc off
    ska()
        .args(["build", "-o", "build_k33", "-k", "33", "--single-strand"])
        .arg(sandbox.file_string("test_1.fa", TestDir::Input))
        .arg(sandbox.file_string("test_2.fa", TestDir::Input))
        .assert()
        .success();

    let fasta_align_out_k33 = align_stdout("build_k33.skf");

    let correct_aln = HashSet::from([vec!['T', 'A'], vec!['G', 'A']]);
    assert_eq!(var_hash(&fasta_align_out_k33), correct_aln);
}

/// Builds the `dup_test` inputs at k=9, once with `--single-strand` and once
/// without, and checks `ska align` / `ska nk` stdout against stored files.
#[test]
fn repeats() {
    let sandbox = TestSetup::setup();
    // Each call yields a fresh `ska` command run from the sandbox directory
    let ska = || Command::new(cmd::cargo_bin!("ska")).current_dir(sandbox.get_wd());
    // Path helpers for input fixtures and stored expected outputs
    let input = |name: &str| sandbox.file_string(name, TestDir::Input);
    let correct = |name: &str| sandbox.file_string(name, TestDir::Correct);

    // Tests that three repeats are correctly shown with IUPAC codes
    ska()
        .args(["build", "-k", "9"])
        .arg(input("dup_test_1.fa"))
        .arg(input("dup_test_2.fa"))
        .args(["-o", "dup_ss", "--single-strand"])
        .assert()
        .success();

    ska()
        .args(["align", "dup_ss.skf"])
        .assert()
        .stdout_eq_path(correct("dup_ss.stdout"));

    // Also tests this is just a single variant
    ska()
        .args([
            "weed",
            "dup_ss.skf",
            "--filter",
            "no-const",
            "--min-freq",
            "1",
        ])
        .assert()
        .success();

    ska()
        .args(["nk", "dup_ss.skf", "--full-info"])
        .assert()
        .stdout_matches_path(correct("dup_ss_nk.stdout"));

    // Tests that the RC of these IUPAC codes is correct
    // (the rc of the input k-mer is the canonical one)
    ska()
        .args(["build", "-k", "9"])
        .arg(input("dup_test_1.fa"))
        .arg(input("dup_test_2.fa"))
        .args(["-o", "dup_rc"])
        .assert()
        .success();

    ska()
        .args(["align", "dup_rc.skf"])
        .assert()
        .stdout_eq_path(correct("dup_rc.stdout"));
}

/// Builds the palindrome inputs at k=15 (default strands, `--single-strand`,
/// and a repeated-copies variant) and checks `ska align` stdout against the
/// stored expected files.
#[test]
fn palindromes() {
    let sandbox = TestSetup::setup();
    // Each call yields a fresh `ska` command run from the sandbox directory
    let ska = || Command::new(cmd::cargo_bin!("ska")).current_dir(sandbox.get_wd());
    // Path helpers for input fixtures and stored expected outputs
    let input = |name: &str| sandbox.file_string(name, TestDir::Input);
    let correct = |name: &str| sandbox.file_string(name, TestDir::Correct);

    // Check that palindrome k-mers (self-rc) are correctly ambiguous
    ska()
        .args(["build", "-k", "15"])
        .arg(input("palindrome_1.fa"))
        .arg(input("palindrome_2.fa"))
        .args(["-o", "otto", "-v"])
        .assert()
        .success();

    ska()
        .args(["align", "otto.skf", "--filter", "no-filter"])
        .assert()
        .stdout_eq_path(correct("palindrome.stdout"));

    // Single strand forces orientation so no ambiguity
    ska()
        .args(["build", "-k", "15"])
        .arg(input("palindrome_1.fa"))
        .arg(input("palindrome_2.fa"))
        .args(["-o", "otan", "--single-strand"])
        .assert()
        .success();

    ska()
        .args(["align", "otan.skf"])
        .assert()
        .stdout_eq_path(correct("palindrome_norc.stdout"));

    // Check that palindrome k-mers (self-rc) are correctly ambiguous, and
    // work correctly when multiple copies are present
    ska()
        .args(["build", "-k", "15"])
        .arg(input("palindrome_reps_1.fa"))
        .arg(input("palindrome_reps_2.fa"))
        .args(["-o", "ottootto", "-v"])
        .assert()
        .success();

    ska()
        .args(["align", "ottootto.skf", "--filter", "no-filter"])
        .assert()
        .stdout_eq_path(correct("palindrome_reps.stdout"));
}