use std::io::{self, Write};
use std::fs::File;
use std::sync::{Arc, Mutex};
use std::time::Instant;
use clap::{Arg, Command};
use bitvec::prelude::*;
mod tempfile;
mod pos;
mod dna;
mod cigar;
mod mmap;
mod utils;
mod time;
mod paf;
mod sxs;
mod alignments;
mod version;
mod seqindex;
mod transclosure;
mod compact;
mod links;
mod gfa;
use crate::seqindex::SeqIndex;
use crate::alignments::unpack_paf_alignments;
use crate::transclosure::compute_transitive_closures;
use crate::compact::compact_nodes;
use crate::links::{derive_links, RankSelectBitVector};
use crate::gfa::emit_gfa;
use iitree_rs::IITree;
fn main() -> io::Result<()> {
let matches = Command::new("seqwish")
.version(version::get_version())
.about("A variation graph inducer")
.arg(Arg::new("paf-alns")
.short('p')
.long("paf-alns")
.value_name("FILE")
.help("Induce the graph from these PAF formatted alignments")
.required(true))
.arg(Arg::new("seqs")
.short('s')
.long("seqs")
.value_name("FILE")
.help("The sequences used to generate the alignments (FASTA, FASTQ, .seq)")
.required(true))
.arg(Arg::new("gfa")
.short('g')
.long("gfa")
.value_name("FILE")
.help("Write the graph in GFA to FILE (stdout if not specified)"))
.arg(Arg::new("temp-dir")
.short('b')
.long("temp-dir")
.value_name("PATH")
.help("Directory for temporary files [default: current directory]"))
.arg(Arg::new("threads")
.short('t')
.long("threads")
.value_name("N")
.help("Use this many threads during parallel steps")
.default_value("1"))
.arg(Arg::new("repeat-max")
.short('r')
.long("repeat-max")
.value_name("N")
.help("Limit transitive closure to include no more than N copies of a given input base")
.default_value("0"))
.arg(Arg::new("min-repeat-distance")
.short('l')
.long("min-repeat-distance")
.value_name("N")
.help("Prevent transitive closure for bases at least this far apart in input sequences")
.default_value("0"))
.arg(Arg::new("min-match-len")
.short('k')
.long("min-match-len")
.value_name("N")
.help("Filter exact matches below this length")
.default_value("0"))
.arg(Arg::new("sparse-factor")
.short('f')
.long("sparse-factor")
.value_name("N")
.help("Sparsify input matches, keeping the fraction that minimize a hash function")
.default_value("0.0"))
.arg(Arg::new("transclose-batch")
.short('B')
.long("transclose-batch")
.value_name("N")
.help("Number of bp to use for transitive closure batch")
.default_value("1000000"))
.arg(Arg::new("keep-temp")
.short('T')
.long("keep-temp")
.help("Keep intermediate files generated during graph induction")
.action(clap::ArgAction::SetTrue))
.arg(Arg::new("show-progress")
.short('P')
.long("show-progress")
.help("Log algorithm progress")
.action(clap::ArgAction::SetTrue))
.get_matches();
let start_time = Instant::now();
let paf_file = matches.get_one::<String>("paf-alns").unwrap();
let seq_file = matches.get_one::<String>("seqs").unwrap();
let gfa_file = matches.get_one::<String>("gfa");
let num_threads: usize = matches.get_one::<String>("threads").unwrap().parse().unwrap_or(1);
let repeat_max: u64 = matches.get_one::<String>("repeat-max").unwrap().parse().unwrap_or(0);
let min_repeat_dist: u64 = matches.get_one::<String>("min-repeat-distance").unwrap().parse().unwrap_or(0);
let min_match_len: u64 = matches.get_one::<String>("min-match-len").unwrap().parse().unwrap_or(0);
let sparse_factor: f32 = matches.get_one::<String>("sparse-factor").unwrap().parse().unwrap_or(0.0);
let transclose_batch: u64 = matches.get_one::<String>("transclose-batch").unwrap().parse().unwrap_or(1000000);
let keep_temp = matches.get_flag("keep-temp");
let show_progress = matches.get_flag("show-progress");
if let Some(temp_dir) = matches.get_one::<String>("temp-dir") {
tempfile::set_dir(temp_dir);
}
tempfile::set_keep_temp(keep_temp);
if !std::path::Path::new(seq_file).exists() {
eprintln!("[seqwish] ERROR: input sequence file {} does not exist", seq_file);
return Err(io::Error::new(io::ErrorKind::NotFound, "sequence file not found"));
}
if !std::path::Path::new(paf_file).exists() {
eprintln!("[seqwish] ERROR: input alignment file {} does not exist", paf_file);
return Err(io::Error::new(io::ErrorKind::NotFound, "alignment file not found"));
}
if show_progress {
eprintln!("[seqwish::seqindex] {:.3} loading sequences", start_time.elapsed().as_secs_f64());
}
let mut seqidx = SeqIndex::new();
seqidx.build_index(seq_file).map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
let seqidx = Arc::new(seqidx);
if show_progress {
eprintln!("[seqwish::seqindex] {:.3} loaded {} sequences",
start_time.elapsed().as_secs_f64(), seqidx.n_seqs());
}
if show_progress {
eprintln!("[seqwish::alignments] {:.3} loading alignments", start_time.elapsed().as_secs_f64());
}
let aln_iitree_idx = tempfile::create("seqwish-", ".sqa")?;
let mut aln_iitree_obj = IITree::<u64, u64>::new(&aln_iitree_idx)?;
aln_iitree_obj.open_writer()?;
let aln_iitree = Arc::new(Mutex::new(aln_iitree_obj));
unpack_paf_alignments(
paf_file,
Arc::clone(&aln_iitree),
Arc::clone(&seqidx),
min_match_len,
sparse_factor,
num_threads,
)?;
if show_progress {
eprintln!("[seqwish::alignments] {:.3} indexing", start_time.elapsed().as_secs_f64());
}
aln_iitree.lock().unwrap().index()?;
if show_progress {
eprintln!("[seqwish::alignments] {:.3} index built", start_time.elapsed().as_secs_f64());
}
if show_progress {
eprintln!("[seqwish::transclosure] {:.3} computing transitive closures",
start_time.elapsed().as_secs_f64());
}
let seq_v_file = tempfile::create("seqwish-", ".sqs")?;
let node_iitree_idx = tempfile::create("seqwish-", ".sqn")?;
let path_iitree_idx = tempfile::create("seqwish-", ".sqp")?;
let mut node_iitree_obj = IITree::<u64, u64>::new(&node_iitree_idx)?;
node_iitree_obj.open_writer()?;
let node_iitree = Arc::new(Mutex::new(node_iitree_obj));
let mut path_iitree_obj = IITree::<u64, u64>::new(&path_iitree_idx)?;
path_iitree_obj.open_writer()?;
let path_iitree = Arc::new(Mutex::new(path_iitree_obj));
let graph_length = compute_transitive_closures(
Arc::clone(&seqidx),
Arc::clone(&aln_iitree),
seq_v_file.to_str().unwrap(),
Arc::clone(&node_iitree),
Arc::clone(&path_iitree),
repeat_max,
min_repeat_dist,
transclose_batch,
show_progress,
num_threads,
)?;
if show_progress {
eprintln!("[seqwish::transclosure] {:.3} done with transitive closures (graph length: {})",
start_time.elapsed().as_secs_f64(), graph_length);
}
if show_progress {
eprintln!("[seqwish::compact] {:.3} compacting nodes", start_time.elapsed().as_secs_f64());
}
let mut seq_id_bv = BitVec::<u64, Lsb0>::repeat(false, graph_length + 1);
compact_nodes(
Arc::clone(&seqidx),
graph_length,
Arc::clone(&node_iitree),
Arc::clone(&path_iitree),
&mut seq_id_bv,
num_threads,
)?;
if show_progress {
eprintln!("[seqwish::compact] {:.3} done compacting", start_time.elapsed().as_secs_f64());
}
let seq_id_cbv = RankSelectBitVector::from_bitvec(&seq_id_bv.iter().by_vals().collect::<Vec<bool>>());
drop(seq_id_bv);
if show_progress {
eprintln!("[seqwish::compact] {:.3} built node index", start_time.elapsed().as_secs_f64());
}
if show_progress {
eprintln!("[seqwish::links] {:.3} finding graph links", start_time.elapsed().as_secs_f64());
}
let link_set = derive_links(
Arc::clone(&seqidx),
Arc::clone(&node_iitree),
Arc::clone(&path_iitree),
&seq_id_cbv,
num_threads,
)?;
if show_progress {
eprintln!("[seqwish::links] {:.3} links derived ({} links)",
start_time.elapsed().as_secs_f64(), link_set.len());
}
if show_progress {
eprintln!("[seqwish::gfa] {:.3} writing graph", start_time.elapsed().as_secs_f64());
}
if let Some(gfa_path) = gfa_file {
let mut out = File::create(gfa_path)?;
emit_gfa(
&mut out,
graph_length,
seq_v_file.to_str().unwrap(),
Arc::clone(&node_iitree),
Arc::clone(&path_iitree),
&seq_id_cbv,
Arc::clone(&seqidx),
link_set.links(),
num_threads,
)?;
} else {
let stdout = io::stdout();
let mut out = stdout.lock();
emit_gfa(
&mut out,
graph_length,
seq_v_file.to_str().unwrap(),
Arc::clone(&node_iitree),
Arc::clone(&path_iitree),
&seq_id_cbv,
Arc::clone(&seqidx),
link_set.links(),
num_threads,
)?;
}
if show_progress {
eprintln!("[seqwish::gfa] {:.3} done", start_time.elapsed().as_secs_f64());
}
Ok(())
}