fxsplit 0.0.3

split FASTX into N chunks/files/headers
Documentation
mod common;

use common::*;

/// Five-record FASTA fixture used by the chunk- and file-count tests below.
///
/// Each record carries four bases; `r2` is the only one written in lowercase.
fn sample_fasta() -> &'static str {
    // One header/sequence pair per line; `concat!` joins them at compile time.
    concat!(
        ">r1\nAAAA\n",
        ">r2\ncccc\n",
        ">r3\nGGGG\n",
        ">r4\nTTTT\n",
        ">r5\nNNNN\n",
    )
}

/// `--chunks 2` on the five-record fixture must produce three outputs holding
/// 2, 2 and 1 records, i.e. the flag value acts as a per-file record count.
#[test]
fn twobit_chunks_mean_records_per_file() {
    let tmp = tempdir();
    let outdir = join(tmp.path(), "out_chunks");
    let input = join(tmp.path(), "input.2bit");

    // `write_2bit` lives in `common`; presumably it encodes the FASTA text as 2bit.
    write_2bit(&input, sample_fasta());

    let args: Vec<String> = vec![
        String::from("--file"),
        path_str(&input),
        String::from("--chunks"),
        String::from("2"),
        String::from("--threads"),
        String::from("2"),
        String::from("--outdir"),
        path_str(&outdir),
    ];
    run_split(args);

    let files = list_files(&outdir);
    assert_eq!(files.len(), 3);

    // Record count of every output, in the order `list_files` reported them.
    let mut counts: Vec<usize> = Vec::with_capacity(files.len());
    for name in files.iter() {
        let chunk_path = join(&outdir, name);
        counts.push(count_2bit_records(&chunk_path));
    }
    assert_eq!(counts, vec![2, 2, 1]);

    // The second record of the first output must still be the lowercase `r2`.
    // NOTE(review): the `true` flag presumably asks for soft-masked
    // (case-preserving) sequences — confirm against `common`.
    let first_path = join(&outdir, &files[0]);
    let first = read_2bit_sequences(&first_path, true);
    let expected_r2 = (String::from("r2"), String::from("cccc"));
    assert_eq!(first[1], expected_r2);
}

/// `--files 3` on the five-record fixture must produce exactly three outputs,
/// holding 2, 2 and 1 records respectively.
#[test]
fn twobit_files_mean_number_of_output_files() {
    let tmp = tempdir();
    let outdir = join(tmp.path(), "out_files");
    let input = join(tmp.path(), "input.2bit");

    write_2bit(&input, sample_fasta());

    let args: Vec<String> = vec![
        String::from("--file"),
        path_str(&input),
        String::from("--files"),
        String::from("3"),
        String::from("--threads"),
        String::from("2"),
        String::from("--outdir"),
        path_str(&outdir),
    ];
    run_split(args);

    let files = list_files(&outdir);
    assert_eq!(files.len(), 3);

    // Tally the records in each output, in the order `list_files` reported them.
    let mut counts: Vec<usize> = Vec::with_capacity(files.len());
    for name in files.iter() {
        let part_path = join(&outdir, name);
        counts.push(count_2bit_records(&part_path));
    }
    assert_eq!(counts, vec![2, 2, 1]);
}

/// `--headers` must write one single-record file per input record, naming each
/// file after its header: the space and `/` show up as `_`, and the repeated
/// `seq/two` header gets a `_1` suffix.
#[test]
fn twobit_headers_use_sanitized_ids() {
    let tmp = tempdir();
    let outdir = join(tmp.path(), "out_headers");
    let input = join(tmp.path(), "input.2bit");

    // Three records: one header with a space, and two identical headers with a slash.
    let fasta = ">seq one\nAAAA\n>seq/two\nCCCC\n>seq/two\nGGGG\n";
    write_2bit(&input, fasta);

    let args: Vec<String> = vec![
        String::from("--file"),
        path_str(&input),
        String::from("--headers"),
        String::from("--threads"),
        String::from("2"),
        String::from("--outdir"),
        path_str(&outdir),
    ];
    run_split(args);

    let expected: Vec<String> = ["seq_one.2bit", "seq_two.2bit", "seq_two_1.2bit"]
        .iter()
        .map(|name| name.to_string())
        .collect();

    let files = list_files(&outdir);
    assert_eq!(files, expected);

    // Every per-header file must contain exactly one record.
    for file in files {
        let record_count = count_2bit_records(&join(&outdir, &file));
        assert_eq!(record_count, 1);
    }
}

/// With `--no-mask`, a record written as `acgtACGT` must read back from the
/// single output file as all-uppercase `ACGTACGT`.
#[test]
fn twobit_no_mask_drops_softmask() {
    let tmp = tempdir();
    let outdir = join(tmp.path(), "out_chunks");
    let input = join(tmp.path(), "input.2bit");

    // First half of the sequence is lowercase in the input.
    write_2bit(&input, ">r1\nacgtACGT\n");

    let args: Vec<String> = vec![
        String::from("--file"),
        path_str(&input),
        String::from("--chunks"),
        String::from("1"),
        String::from("--threads"),
        String::from("2"),
        String::from("--no-mask"),
        String::from("--outdir"),
        path_str(&outdir),
    ];
    run_split(args);

    let files = list_files(&outdir);
    assert_eq!(files.len(), 1);

    // NOTE(review): the `true` flag presumably asks the reader to apply any
    // stored soft mask, so uppercase output means none was written — confirm
    // against `common`.
    let only_output = join(&outdir, &files[0]);
    let records = read_2bit_sequences(&only_output, true);
    let expected = vec![(String::from("r1"), String::from("ACGTACGT"))];
    assert_eq!(records, expected);
}

/// A `chr1` record whose `nn` stretch is covered by both range arguments of
/// `write_2bit_with_masks` must come out of a `--headers` split as `ACnnGT`,
/// i.e. the lowercase `n`s are kept.
#[test]
fn twobit_preserves_lowercase_n_from_overlapping_masks() {
    let tmp = tempdir();
    let outdir = join(tmp.path(), "out_headers");
    let input = join(tmp.path(), "input.2bit");

    // The same 2..4 range is passed twice.
    // NOTE(review): presumably one slice is the N-block list and the other the
    // soft-mask list — confirm the parameter order in `common`.
    write_2bit_with_masks(&input, "chr1", "ACnnGT", &[2..4], &[2..4]);

    let args: Vec<String> = vec![
        String::from("--file"),
        path_str(&input),
        String::from("--headers"),
        String::from("--threads"),
        String::from("2"),
        String::from("--outdir"),
        path_str(&outdir),
    ];
    run_split(args);

    let files = list_files(&outdir);
    let expected_files = vec![String::from("chr1.2bit")];
    assert_eq!(files, expected_files);

    let only_output = join(&outdir, &files[0]);
    let records = read_2bit_sequences(&only_output, true);
    let expected_records = vec![(String::from("chr1"), String::from("ACnnGT"))];
    assert_eq!(records, expected_records);
}