use flate2::write::GzEncoder;
use flate2::Compression;
use seqwish::intervaltree::IntervalTree;
use seqwish::*;
use std::fs::File;
use std::io::Write;
use std::sync::{Arc, Mutex, RwLock};
use std::time::{SystemTime, UNIX_EPOCH};
fn run_pipeline(in_memory: bool) -> Result<String, Box<dyn std::error::Error>> {
let unique_id = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos();
let fasta_path = format!("/tmp/test_pipeline_{}_{}.fasta", unique_id, in_memory);
let paf_path = format!("/tmp/test_pipeline_{}_{}.paf.gz", unique_id, in_memory);
let base_dir = format!("/tmp/test_pipeline_{}_{}", unique_id, in_memory);
std::fs::create_dir_all(&base_dir)?;
let mut fasta_file = File::create(&fasta_path)?;
writeln!(fasta_file, ">seq1")?;
writeln!(fasta_file, "ACGTACGTACGT")?; writeln!(fasta_file, ">seq2")?;
writeln!(fasta_file, "ACGTACGTACGT")?; writeln!(fasta_file, ">seq3")?;
writeln!(fasta_file, "ACGTACGTTTTT")?; fasta_file.flush()?;
let mut seqidx = seqindex::SeqIndex::new();
seqidx.build_index(&fasta_path)?;
let seqidx = Arc::new(seqidx);
let paf_file = File::create(&paf_path)?;
let mut gz = GzEncoder::new(paf_file, Compression::default());
writeln!(
gz,
"seq1\t12\t0\t12\t+\tseq2\t12\t0\t12\t12\t12\t60\tcg:Z:12M"
)?;
writeln!(gz, "seq1\t12\t0\t8\t+\tseq3\t12\t0\t8\t8\t8\t60\tcg:Z:8M")?;
gz.finish()?;
let mut aln_iitree_obj = if in_memory {
intervaltree::AdaptiveTree::new_memory()?
} else {
let aln_path = format!("{}/aln.iitree", base_dir);
intervaltree::AdaptiveTree::new_disk(&aln_path)?
};
aln_iitree_obj.open_writer()?;
let aln_iitree = Arc::new(Mutex::new(aln_iitree_obj));
alignments::unpack_paf_alignments(
&paf_path,
Arc::clone(&aln_iitree),
Arc::clone(&seqidx),
1, 0.0, 1, )?;
{
let mut tree_guard = aln_iitree.lock().unwrap();
tree_guard.index()?;
}
let mut node_iitree_obj = if in_memory {
intervaltree::AdaptiveTree::new_memory()?
} else {
let node_path = format!("{}/node.iitree", base_dir);
intervaltree::AdaptiveTree::new_disk(&node_path)?
};
node_iitree_obj.open_writer()?;
let node_iitree = Arc::new(RwLock::new(node_iitree_obj));
let mut path_iitree_obj = if in_memory {
intervaltree::AdaptiveTree::new_memory()?
} else {
let path_path = format!("{}/path.iitree", base_dir);
intervaltree::AdaptiveTree::new_disk(&path_path)?
};
path_iitree_obj.open_writer()?;
let path_iitree = Arc::new(RwLock::new(path_iitree_obj));
let aln_iitree_arc = Arc::new(
Arc::try_unwrap(aln_iitree)
.map_err(|_| "Cannot unwrap aln_iitree Arc")?
.into_inner()
.unwrap(),
);
let seq_v_file = format!("{}/seq_v.txt", base_dir);
let graph_length = transclosure::compute_transitive_closures(
Arc::clone(&seqidx),
aln_iitree_arc,
&seq_v_file,
Arc::clone(&node_iitree),
Arc::clone(&path_iitree),
0, 0, 10000, false, 1, )?;
{
let mut node_guard = node_iitree.write().unwrap();
node_guard.index()?;
}
{
let mut path_guard = path_iitree.write().unwrap();
path_guard.index()?;
}
use bitvec::prelude::*;
let mut seq_id_bv = BitVec::<u64, Lsb0>::repeat(false, graph_length);
compact::compact_nodes(
Arc::clone(&seqidx),
graph_length,
Arc::clone(&node_iitree),
Arc::clone(&path_iitree),
&mut seq_id_bv,
1, )?;
let seq_id_cbv =
links::RankSelectBitVector::from_bitvec(&seq_id_bv.iter().by_vals().collect::<Vec<bool>>());
let link_v = links::derive_links(
Arc::clone(&seqidx),
Arc::clone(&node_iitree),
Arc::clone(&path_iitree),
&seq_id_cbv,
1, )?;
let mut gfa_output = Vec::new();
gfa::emit_gfa(
&mut gfa_output,
graph_length,
&seq_v_file,
Arc::clone(&node_iitree),
Arc::clone(&path_iitree),
&seq_id_cbv,
Arc::clone(&seqidx),
link_v.links(),
1, )?;
std::fs::remove_file(&fasta_path).ok();
std::fs::remove_file(&paf_path).ok();
std::fs::remove_dir_all(&base_dir).ok();
Ok(String::from_utf8(gfa_output)?)
}
#[test]
fn test_pipeline_disk_vs_memory_identical() -> Result<(), Box<dyn std::error::Error>> {
let disk_gfa = run_pipeline(false)?;
let memory_gfa = run_pipeline(true)?;
assert_eq!(
disk_gfa, memory_gfa,
"Disk and in-memory modes produced different GFA outputs"
);
assert!(
disk_gfa.starts_with("H\tVN:Z:1.0"),
"GFA should start with header"
);
assert!(disk_gfa.contains("S\t"), "GFA should contain segments");
assert!(disk_gfa.contains("P\t"), "GFA should contain paths");
Ok(())
}
#[test]
fn test_pipeline_memory_mode_basic() -> Result<(), Box<dyn std::error::Error>> {
let gfa = run_pipeline(true)?;
let lines: Vec<&str> = gfa.lines().collect();
let mut num_header = 0;
let mut num_segments = 0;
let mut num_paths = 0;
for line in &lines {
if line.starts_with("H\t") {
num_header += 1;
} else if line.starts_with("S\t") {
num_segments += 1;
} else if line.starts_with("P\t") {
num_paths += 1;
}
}
assert_eq!(num_header, 1, "Should have exactly 1 header line");
assert!(num_segments > 0, "Should have at least 1 segment");
assert!(
num_paths == 3,
"Should have exactly 3 paths (one per input sequence)"
);
let path_lines: Vec<&str> = lines
.iter()
.filter(|l| l.starts_with("P\t"))
.map(|&s| s)
.collect();
let path_str = path_lines.join("\n");
assert!(path_str.contains("seq1"), "Should have path for seq1");
assert!(path_str.contains("seq2"), "Should have path for seq2");
assert!(path_str.contains("seq3"), "Should have path for seq3");
Ok(())
}
#[test]
fn test_pipeline_disk_mode_basic() -> Result<(), Box<dyn std::error::Error>> {
let gfa = run_pipeline(false)?;
let lines: Vec<&str> = gfa.lines().collect();
let mut num_header = 0;
let mut num_segments = 0;
let mut num_paths = 0;
for line in &lines {
if line.starts_with("H\t") {
num_header += 1;
} else if line.starts_with("S\t") {
num_segments += 1;
} else if line.starts_with("P\t") {
num_paths += 1;
}
}
assert_eq!(num_header, 1, "Should have exactly 1 header line");
assert!(num_segments > 0, "Should have at least 1 segment");
assert!(num_paths == 3, "Should have exactly 3 paths");
Ok(())
}