use std::collections::HashMap;
use std::fs::File;
use std::ops::{Range, RangeInclusive};
use std::path::{Path, PathBuf};
use std::io::{self, BufRead, BufReader};
use flate2::read::MultiGzDecoder;
use gbz::{GBWT, Pos, ENDMARKER};
use pggname::GraphName;
use simple_sds::binaries;
pub fn get_test_data(filename: &'static str) -> PathBuf {
let mut buf = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
buf.push("test-data");
buf.push(filename);
buf
}
pub fn human_readable_size(size: usize) -> String {
let (size, unit) = binaries::human_readable_size(size);
format!("{:.3} {}", size, unit)
}
pub fn file_size<P: AsRef<Path>>(filename: P) -> Option<String> {
let (size, unit) = binaries::file_size(filename)?;
Some(format!("{:.3} {}", size, unit))
}
pub fn report_peak_memory_usage() {
let peak_memory = binaries::peak_memory_usage();
if peak_memory.is_err() {
return;
}
let (size, unit) = binaries::human_readable_size(peak_memory.unwrap());
eprintln!("Peak memory usage: {:.3} {}", size, unit);
}
pub fn is_gzipped<R: BufRead>(reader: &mut R) -> io::Result<bool> {
let buffer = reader.fill_buf()?;
let result = buffer.len() >= 2 && buffer[0..2] == [0x1F, 0x8B];
Ok(result)
}
pub fn open_file<P: AsRef<Path>>(filename: P) -> Result<Box<dyn BufRead>, String> {
let mut inner = if filename.as_ref() == Path::new("-") {
Box::new(BufReader::new(io::stdin())) as Box<dyn BufRead>
} else {
let file = File::open(&filename).map_err(|x| format!("Failed to open file {}: {}", filename.as_ref().display(), x))?;
Box::new(BufReader::new(file)) as Box<dyn BufRead>
};
if is_gzipped(&mut inner).map_err(|x| format!("Failed to read file {}: {}", filename.as_ref().display(), x))? {
let gz_inner = MultiGzDecoder::new(inner);
Ok(Box::new(BufReader::new(gz_inner)))
} else {
Ok(inner)
}
}
pub fn append_usize(buffer: &mut Vec<u8>, value: usize) {
buffer.extend_from_slice(value.to_string().as_bytes());
}
pub fn append_isize(buffer: &mut Vec<u8>, value: isize) {
buffer.extend_from_slice(value.to_string().as_bytes());
}
const DECODE: [u8; 6] = [0, b'A', b'C', b'G', b'T', b'N'];
#[inline]
pub fn decode_base(encoded: usize) -> u8 {
DECODE[encoded]
}
pub fn decode_sequence(encoded: &[u8]) -> Vec<u8> {
let capacity = if encoded.is_empty() { 0 } else { 3 * encoded.len() };
let mut result = Vec::with_capacity(capacity);
for byte in encoded {
let mut value = *byte as usize;
for _ in 0..3 {
let decoded = DECODE[value % DECODE.len()];
if decoded == 0 {
return result;
}
value /= DECODE.len();
result.push(decoded);
}
}
result
}
const fn generate_encoding() -> [u8; 256] {
let mut result = [5; 256];
result[b'a' as usize] = 1; result[b'A' as usize] = 1;
result[b'c' as usize] = 2; result[b'C' as usize] = 2;
result[b'g' as usize] = 3; result[b'G' as usize] = 3;
result[b't' as usize] = 4; result[b'T' as usize] = 4;
result
}
const ENCODE: [u8; 256] = generate_encoding();
#[inline]
pub fn encode_base(base: u8) -> usize {
ENCODE[base as usize] as usize
}
pub fn encode_sequence(sequence: &[u8]) -> Vec<u8> {
let mut result: Vec<u8> = Vec::with_capacity(encoded_length(sequence.len()));
let mut offset = 0;
while offset + 3 <= sequence.len() {
let byte = ENCODE[sequence[offset] as usize] +
6 * ENCODE[sequence[offset + 1] as usize] +
36 * ENCODE[sequence[offset + 2] as usize];
result.push(byte);
offset += 3;
}
if sequence.len() - offset == 1 {
let byte = ENCODE[sequence[offset] as usize];
result.push(byte);
} else if sequence.len() - offset == 2 {
let byte = ENCODE[sequence[offset] as usize] + 6 * ENCODE[sequence[offset + 1] as usize];
result.push(byte);
}
result
}
pub fn encoded_length(sequence_length: usize) -> usize {
sequence_length.div_ceil(3)
}
pub fn require_valid_reference(alignments: &GraphName, reference: &GraphName) -> Result<(), String> {
if !alignments.has_name() || !reference.has_name() {
return Ok(());
}
if !alignments.is_subgraph_of(reference) {
let description = alignments.describe_relationship(reference, "alignments", "reference graph");
return Err(format!("The graph is not a valid reference for the alignments:\n{}", description));
}
Ok(())
}
#[derive(Clone, Debug)]
struct NodeIdCluster {
node_id_range: RangeInclusive<usize>,
array_range: Range<usize>,
max_gap_offset: Option<usize>,
}
impl NodeIdCluster {
fn new(node_ids: &[usize], array_range: Range<usize>) -> Option<Self> {
if node_ids.is_empty() {
return None;
}
if array_range.is_empty() || array_range.end > node_ids.len() {
return None;
}
let first = node_ids[array_range.start];
let last = node_ids[array_range.end - 1];
let node_id_range = first..=last;
let mut max_gap_length = 0;
let mut max_gap_offset = None;
for i in (array_range.start + 1)..array_range.end {
let gap = node_ids[i] - node_ids[i - 1];
if gap > max_gap_length {
max_gap_length = gap;
max_gap_offset = Some(i);
}
}
Some(Self {
node_id_range,
array_range,
max_gap_offset,
})
}
fn max_gap_length(&self, node_ids: &[usize]) -> Option<usize> {
let offset = self.max_gap_offset?;
Some(node_ids[offset] - node_ids[offset - 1])
}
fn split(self, node_ids: &[usize]) -> (Option<Self>, Option<Self>) {
if self.max_gap_offset.is_none() {
return (Some(self), None);
}
let offset = self.max_gap_offset.unwrap();
let left = NodeIdCluster::new(node_ids, self.array_range.start..offset);
let right = NodeIdCluster::new(node_ids, offset..self.array_range.end);
(left, right)
}
}
pub fn cluster_node_ids(node_ids: Vec<usize>, threshold: usize) -> Vec<RangeInclusive<usize>> {
let mut node_ids = node_ids;
node_ids.sort_unstable();
node_ids.dedup();
let mut stack: Vec<NodeIdCluster> = Vec::new();
let mut result: Vec<RangeInclusive<usize>> = Vec::new();
let initial = NodeIdCluster::new(&node_ids, 0..node_ids.len());
if initial.is_none() {
return result;
}
stack.push(initial.unwrap());
while let Some(curr) = stack.pop() {
if let Some(len) = curr.max_gap_length(&node_ids) {
if len > threshold {
let (left, right) = curr.split(&node_ids);
if let Some(right) = right {
stack.push(right);
}
if let Some(left) = left {
stack.push(left);
}
} else {
result.push(curr.node_id_range);
}
} else {
result.push(curr.node_id_range);
}
}
result
}
pub enum PathStartSource<'a> {
Index(&'a GBWT, usize),
Map(HashMap<usize, usize>),
}
impl<'a> PathStartSource<'a> {
pub fn new() -> Self {
Self::default()
}
pub fn next(&mut self, node_id: usize) -> Option<Pos> {
match self {
Self::Index(index, seq_id) => {
let result = index.start(*seq_id);
*seq_id += if index.is_bidirectional() { 2 } else { 1 };
result
}
Self::Map(map) => {
if node_id == ENDMARKER {
return None;
}
let count = map.entry(node_id).or_insert(0);
let result = Pos::new(node_id, *count);
*count += 1;
Some(result)
}
}
}
}
impl<'a> Default for PathStartSource<'a> {
fn default() -> Self {
Self::Map(HashMap::new())
}
}
impl<'a> From<&'a GBWT> for PathStartSource<'a> {
fn from(index: &'a GBWT) -> Self {
Self::Index(index, 0)
}
}
#[cfg(test)]
mod tests {
use super::*;
use gbz::support::{self, Orientation};
use simple_sds::serialize;
#[test]
fn sequence_encoding() {
let full_sequence = b"GATTACACACCAGATNNNNNACATTGAACCTTACACAGTCTGAC";
for i in 0..full_sequence.len() {
let sequence = &full_sequence[0..i];
let encoded = encode_sequence(sequence);
let decoded = decode_sequence(&encoded);
assert_eq!(decoded, sequence, "Wrong sequence encoding for length {}", i);
}
}
fn test_cluster(node_ids: Vec<usize>, expected: Vec<RangeInclusive<usize>>, gap_threshold: usize, test_case: &str) {
let clusters = cluster_node_ids(node_ids, gap_threshold);
assert_eq!(clusters.len(), expected.len(), "Wrong number of clusters for {}", test_case);
for (i, cluster) in clusters.iter().enumerate() {
assert_eq!(cluster, &expected[i], "Wrong cluster {} for {}", i, test_case);
}
}
#[test]
fn cluster_node_ids_test() {
let node_ids = Vec::new();
let expected = Vec::new();
test_cluster(node_ids, expected, 10, "empty");
let node_ids = vec![5];
let expected = vec![5..=5];
test_cluster(node_ids, expected, 10, "single node");
let node_ids = vec![5, 6, 7, 8, 9];
let expected = vec![5..=9];
test_cluster(node_ids, expected, 10, "continuous nodes");
let node_ids = vec![6, 9, 7, 5, 8];
let expected = vec![5..=9];
test_cluster(node_ids, expected, 10, "continuous nodes unsorted");
let node_ids = vec![5, 7, 9];
let expected = vec![5..=9];
test_cluster(node_ids, expected, 10, "equal gaps");
let node_ids = vec![5, 6, 7, 20, 21, 22];
let expected = vec![5..=7, 20..=22];
test_cluster(node_ids, expected, 10, "two clusters");
let node_ids = vec![1, 50, 52, 53, 63, 64, 200];
let expected = vec![1..=1, 50..=64, 200..=200];
test_cluster(node_ids, expected, 10, "one cluster and outliers");
let node_ids = vec![1, 50, 52, 53, 73, 74, 200];
let expected = vec![1..=1, 50..=53, 73..=74, 200..=200];
test_cluster(node_ids, expected, 10, "two clusters and outliers");
}
#[test]
fn path_start_source() {
let gbwt_files = vec![
get_test_data("micb-kir3dl1_HG003.gbwt"),
get_test_data("bidirectional.gbwt"),
get_test_data("empty.gbwt"),
support::get_test_data("example.gbwt"),
support::get_test_data("translation.gbwt"),
support::get_test_data("with-empty.gbwt"),
];
for gbwt_file in gbwt_files.iter() {
let index = serialize::load_from(gbwt_file);
assert!(index.is_ok(), "Failed to load GBWT index from {}: {}", gbwt_file.display(), index.unwrap_err());
let index: GBWT = index.unwrap();
let paths = if index.is_bidirectional() {
index.sequences() / 2
} else {
index.sequences()
};
let mut index_source = PathStartSource::from(&index);
let mut map_source = PathStartSource::new();
for path_id in 0..paths {
let seq_id = if index.is_bidirectional() { support::encode_path(path_id, Orientation::Forward) } else { path_id };
let truth = index.start(seq_id);
let node_id = truth.map(|pos| pos.node).unwrap_or(ENDMARKER);
let index_pos = index_source.next(0);
assert_eq!(index_pos, truth, "Wrong path start from index for path {} in {}", path_id, gbwt_file.display());
if !index.is_bidirectional() {
let map_pos = map_source.next(node_id);
assert_eq!(map_pos, truth, "Wrong path start from map for path {} in {}", path_id, gbwt_file.display());
}
}
}
}
}