use std::collections::HashSet;
use std::fs;
const TEST_BAM: &str = "tests/fixtures/illumina-test1.bam";
const TEST_ONT_BAM: &str = "tests/fixtures/ont-test1.bam";
const EXPECTED_READS: usize = 9534;
fn count_fastq_reads(content: &str) -> usize {
content.lines().count() / 4
}
fn get_file_size(path: &str) -> u64 {
fs::metadata(path).unwrap().len()
}
fn extract_read_names(fastq_content: &str) -> HashSet<String> {
fastq_content
.lines()
.enumerate()
.filter_map(|(i, line)| {
if i % 4 == 0 && line.starts_with('@') {
Some(line[1..].to_string()) } else {
None
}
})
.collect()
}
#[test]
fn test_split_processing_produces_all_reads_no_dupes() {
assert!(
std::path::Path::new(TEST_BAM).exists(),
"Test BAM file not found: {TEST_BAM}"
);
let file_size = get_file_size(TEST_BAM);
let half = file_size / 2;
eprintln!("File size: {file_size} bytes");
eprintln!("Split point: {half} bytes");
let mut chunk1_buffer = Vec::new();
let chunk1_reads = bamslice::process_blocks(TEST_BAM, 0, half, &mut chunk1_buffer).unwrap();
let mut chunk2_buffer = Vec::new();
let chunk2_reads =
bamslice::process_blocks(TEST_BAM, half, file_size, &mut chunk2_buffer).unwrap();
let chunk1_content = String::from_utf8(chunk1_buffer).unwrap();
let chunk2_content = String::from_utf8(chunk2_buffer).unwrap();
let chunk1_names = extract_read_names(&chunk1_content);
let chunk2_names = extract_read_names(&chunk2_content);
let duplicates: Vec<_> = chunk1_names.intersection(&chunk2_names).collect();
assert!(
duplicates.is_empty(),
"Found {} duplicate read(s) appearing in both chunks: {:?}",
duplicates.len(),
duplicates.iter().take(10).collect::<Vec<_>>() );
let chunk1_fastq_reads = count_fastq_reads(&chunk1_content);
let chunk2_fastq_reads = count_fastq_reads(&chunk2_content);
let total_reads = chunk1_fastq_reads + chunk2_fastq_reads;
assert_eq!(
chunk1_reads, chunk1_fastq_reads,
"Chunk 1: returned count doesn't match FASTQ content"
);
assert_eq!(
chunk2_reads, chunk2_fastq_reads,
"Chunk 2: returned count doesn't match FASTQ content"
);
assert_eq!(
total_reads, EXPECTED_READS,
"Total read count mismatch: {total_reads} (ours) vs {EXPECTED_READS} (expected from samtools)"
);
assert!(chunk1_reads > 0, "Chunk 1 should have reads");
assert!(chunk2_reads > 0, "Chunk 2 should have reads");
println!(" Chunk 1: {chunk1_reads} reads");
println!(" Chunk 2: {chunk2_reads} reads");
}
#[test]
fn test_whole_file_processing() {
let file_size = get_file_size(TEST_BAM);
let mut buffer = Vec::new();
let read_count = bamslice::process_blocks(TEST_BAM, 0, file_size, &mut buffer).unwrap();
let content = String::from_utf8(buffer).unwrap();
let fastq_reads = count_fastq_reads(&content);
assert_eq!(
read_count, fastq_reads,
"Returned read count doesn't match FASTQ content {read_count} vs {fastq_reads}"
);
assert_eq!(
fastq_reads, EXPECTED_READS,
"FASTQ record count mismatch: {fastq_reads} vs {EXPECTED_READS}"
);
}
#[test]
fn test_first_read_content() {
let file_size = get_file_size(TEST_BAM);
let mut buffer = Vec::new();
bamslice::process_blocks(TEST_BAM, 0, file_size, &mut buffer).unwrap();
let content = String::from_utf8(buffer).unwrap();
let mut lines = content.lines();
let expected_name = "@NS500355:NS500355:HHVN5AFX7:1:11101:11181:6634/1 1:N:0:CGTCAAGA-GGGTTGTT";
let expected_seq =
"AAATTTTAGAAAATTGTTATATTATTTGGGTTATTAGTGGAGATATTTGTTATAATTTTTTTTTAGGCGTAATTTG";
let expected_qual =
"AAAAAEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE";
let name = lines.next().expect("Missing read name line");
let seq = lines.next().expect("Missing sequence line");
let plus = lines.next().expect("Missing + line");
let qual = lines.next().expect("Missing quality line");
assert_eq!(name, expected_name, "First read name mismatch");
assert_eq!(seq, expected_seq, "First read sequence mismatch");
assert_eq!(plus, "+", "Expected + separator line");
assert_eq!(qual, expected_qual, "First read quality scores mismatch");
}
#[test]
fn test_blocks_start_with_read1() {
let file_size = get_file_size(TEST_BAM);
let split_points = [
file_size / 4,
file_size / 3,
file_size / 2,
file_size * 2 / 3,
file_size * 3 / 4,
];
for &split_point in &split_points {
let mut buffer = Vec::new();
bamslice::process_blocks(TEST_BAM, split_point, file_size, &mut buffer).unwrap();
let content = String::from_utf8(buffer).unwrap();
let mut lines = content.lines();
if let Some(first_line) = lines.next() {
assert!(
first_line.starts_with('@'),
"First line should be read name at split {split_point}"
);
assert!(
first_line.contains("/1 "),
"First read should be read 1 (have /1 suffix) at split {split_point}, got: {first_line}"
);
}
}
}
#[test]
fn test_interleaved_pairs_in_output() {
let file_size = get_file_size(TEST_BAM);
let half = file_size / 2;
let mut buffer = Vec::new();
bamslice::process_blocks(TEST_BAM, half, file_size, &mut buffer).unwrap();
let content = String::from_utf8(buffer).unwrap();
let lines: Vec<&str> = content.lines().collect();
let mut expect_read1 = true;
for i in (0..lines.len()).step_by(4) {
if i >= lines.len() {
break;
}
let read_name = lines[i];
if expect_read1 {
assert!(
read_name.contains("/1 "),
"Expected read 1 at line {i}, got: {read_name}"
);
} else {
assert!(
read_name.contains("/2 "),
"Expected read 2 at line {i}, got: {read_name}"
);
}
expect_read1 = !expect_read1;
}
}
#[test]
fn test_complete_pairs_at_boundaries() {
let file_size = get_file_size(TEST_BAM);
let split = file_size / 2;
let mut chunk1 = Vec::new();
let chunk1_count = bamslice::process_blocks(TEST_BAM, 0, split, &mut chunk1).unwrap();
let mut chunk2 = Vec::new();
let chunk2_count = bamslice::process_blocks(TEST_BAM, split, file_size, &mut chunk2).unwrap();
assert_eq!(
chunk1_count % 2,
0,
"First chunk should have complete pairs (even count), got {chunk1_count}"
);
assert_eq!(
chunk2_count % 2,
0,
"Second chunk should have complete pairs (even count), got {chunk2_count}"
);
}
#[test]
fn test_ont_reads() {
let split = 180;
let mut chunk1 = Vec::new();
let chunk1_count = bamslice::process_blocks(TEST_ONT_BAM, 0, split, &mut chunk1).unwrap();
let mut chunk2 = Vec::new();
let chunk2_count =
bamslice::process_blocks(TEST_ONT_BAM, split, 1_000_000, &mut chunk2).unwrap();
assert_eq!(chunk1_count, 84);
assert_eq!(chunk2_count, 125 - 84);
}
#[test]
fn test_skips_false_positive_gzip_magic() {
let false_positive_offset = 132_321;
let end_offset = 200_000;
let mut buffer = Vec::new();
let result = bamslice::process_blocks(TEST_BAM, false_positive_offset, end_offset, &mut buffer);
assert!(
result.is_ok(),
"Should skip false positive gzip magic at {} and find valid block, got error: {:?}",
false_positive_offset,
result.err()
);
}
#[test]
fn test_dnbseq_seek_issue() {
const TEST_FILE: &str = "tests/fixtures/dnbseq-test1.bam";
let start_offset = 1000;
let end_offset = 100_000;
let mut buffer = Vec::new();
let result = bamslice::process_blocks(TEST_FILE, start_offset, end_offset, &mut buffer);
assert!(
result.is_ok(),
"Should successfully process blocks even with tricky seek, got error: {:?}",
result.err()
);
let read_count = result.unwrap();
assert!(
read_count > 0,
"Should have extracted reads, got {read_count}"
);
}
#[test]
fn test_process_near_eof() {
const TEST_FILE: &str = "tests/fixtures/dnbseq-test1.bam";
let mut output = Vec::new();
let result = bamslice::process_blocks(TEST_FILE, 13_590_000, 13_700_000, &mut output);
assert!(result.is_ok(), "Should handle near-EOF offsets gracefully");
let read_count = result.unwrap();
assert_eq!(
read_count, 146,
"Should extract remaining reads near EOF (146 = 73 complete pairs, orphaned read2 at start is skipped)"
);
assert_eq!(
read_count % 2,
0,
"Read count should be even (complete pairs)"
);
let output_str = String::from_utf8(output).unwrap();
assert!(
output_str.contains("V350145865L1C001R01500464560"),
"Should include the last read from the file"
);
}
#[test]
fn test_window_after_last_block() {
const TEST_FILE: &str = "tests/fixtures/dnbseq-test1.bam";
let mut output = Vec::new();
let result = bamslice::process_blocks(TEST_FILE, 13_598_803, u64::MAX, &mut output);
assert!(
result.is_ok(),
"Should not error when starting beyond last block"
);
let read_count = result.unwrap();
assert_eq!(read_count, 0, "Should return 0 reads when no blocks found");
}
#[test]
fn test_window_after_end_of_file() {
const TEST_FILE: &str = "tests/fixtures/dnbseq-test1.bam";
let mut output = Vec::new();
let result = bamslice::process_blocks(TEST_FILE, 18_000_000, u64::MAX, &mut output);
assert!(
result.is_ok(),
"Should not error when starting beyond last block"
);
let read_count = result.unwrap();
assert_eq!(read_count, 0, "Should return 0 reads when no blocks found");
}
#[test]
fn test_dnbseq_small_chunks_find_all_reads() {
const TEST_FILE: &str = "tests/fixtures/dnbseq-test1.bam";
const EXPECTED_READS: usize = 200_000;
const TRUTH_HASH: &str = "25fe990b5612285e8b0fa04e2bf0e612"; const CHUNK_SIZE: u64 = 100_000;
let file_size = get_file_size(TEST_FILE);
let mut total_reads = 0;
let mut hasher = md5::Context::new();
let mut chunk_num = 0;
let mut start = 0;
while start < file_size {
let end = (start + CHUNK_SIZE).min(file_size);
let mut buffer = Vec::new();
let chunk_reads = bamslice::process_blocks(TEST_FILE, start, end, &mut buffer)
.unwrap_or_else(|e| panic!("Failed to process chunk {chunk_num} ({start}-{end}): {e}"));
total_reads += chunk_reads;
let content = String::from_utf8(buffer).unwrap();
for (idx, line) in content.lines().enumerate() {
if idx % 4 == 0 && line.starts_with('@') {
let read_name = &line[1..];
let core_name = read_name.split_whitespace().next().unwrap_or(read_name);
hasher.consume(core_name.as_bytes());
hasher.consume(b"\n"); }
}
chunk_num += 1;
if chunk_num % 10 == 0 {
println!(" Processed chunk {chunk_num}: {total_reads} reads so far");
}
start = end;
}
let result_hash = format!("{:x}", hasher.finalize());
assert_eq!(total_reads, EXPECTED_READS);
assert_eq!(
result_hash, TRUTH_HASH,
"MD5 hash of read names doesn't match samtools output.\nGot: {result_hash}\nExpected: {TRUTH_HASH}"
);
}
#[test]
fn test_no_orphaned_reads_in_chunks() {
const TEST_FILE: &str = "tests/fixtures/dnbseq-test1.bam";
const CHUNK_SIZE: u64 = 100_000;
let file_size = get_file_size(TEST_FILE);
let mut start = 0;
let mut chunk_num = 0;
while start < file_size {
let end = (start + CHUNK_SIZE).min(file_size);
let mut buffer = Vec::new();
let chunk_reads = bamslice::process_blocks(TEST_FILE, start, end, &mut buffer)
.unwrap_or_else(|e| panic!("Failed to process chunk {chunk_num} ({start}-{end}): {e}"));
if !chunk_reads.is_multiple_of(2) {
let content = String::from_utf8_lossy(&buffer);
let lines: Vec<&str> = content.lines().collect();
eprintln!("\nChunk {chunk_num} ({start}-{end}) FIRST 4 reads:");
for line in lines.iter().take(16) {
eprintln!(" {line}");
}
eprintln!("\nChunk {chunk_num} ({start}-{end}) LAST 4 reads:");
for i in (0..16).rev() {
if i < lines.len() {
eprintln!(" {}", lines[lines.len() - 1 - i]);
}
}
}
assert_eq!(
chunk_reads % 2,
0,
"Chunk {chunk_num} ({start}-{end}) has odd number of reads: {chunk_reads}. This indicates an orphaned read without its mate."
);
chunk_num += 1;
start = end;
}
}