use crate::{Subgraph, Mapping, Difference};
use crate::formats::{self, TypedField};
use crate::utils;
use std::io::{Read, Write};
use std::ops::Range;
use std::sync::Arc;
use std::{cmp, str};
use zstd::stream::Encoder as ZstdEncoder;
use zstd::stream::Decoder as ZstdDecoder;
use gbz::{GBWT, Orientation, Pos};
use gbz::support::{self, ByteCode, ByteCodeIter, RLE, Run, RLEIter};
#[cfg(test)]
mod tests;
pub mod mapping;
#[derive(Clone, Debug, PartialEq)]
pub struct Alignment {
pub name: String,
pub seq_len: usize,
pub seq_interval: Range<usize>,
pub path: TargetPath,
pub path_len: usize,
pub path_interval: Range<usize>,
pub matches: usize,
pub edits: usize,
pub mapq: Option<usize>,
pub score: Option<isize>,
pub base_quality: Vec<u8>,
pub difference: Vec<Difference>,
pub pair: Option<PairedRead>,
pub optional: Vec<TypedField>,
}
impl Default for Alignment {
fn default() -> Self {
Alignment {
name: String::new(),
seq_len: 0,
seq_interval: 0..0,
path: TargetPath::Path(Vec::new()),
path_len: 0,
path_interval: 0..0,
matches: 0,
edits: 0,
mapq: None,
score: None,
base_quality: Vec::new(),
difference: Vec::new(),
pair: None,
optional: Vec::new(),
}
}
}
impl Alignment {
const MANDATORY_FIELDS: usize = 12;
const MISSING_MAPQ: usize = 255;
const MISSING_VALUE: [u8; 1] = [b'*'];
pub fn new() -> Self {
Alignment::default()
}
fn parse_string(field: &[u8], field_name: &str) -> Result<String, String> {
String::from_utf8(field.to_vec()).map_err(|err| {
format!("Invalid {}: {}", field_name, err)
})
}
fn parse_usize(field: &[u8], field_name: &str) -> Result<usize, String> {
if field == Self::MISSING_VALUE {
return Ok(0);
}
let number = str::from_utf8(field).map_err(|err| {
format!("Invalid {}: {}", field_name, err)
})?;
number.parse().map_err(|err| {
format!("Invalid {}: {}", field_name, err)
})
}
fn parse_interval(start: &[u8], end: &[u8]) -> Result<Range<usize>, String> {
let start = Self::parse_usize(start, "interval start")?;
let end = Self::parse_usize(end, "interval end")?;
Ok(start..end)
}
fn parse_orientation(field: &[u8], field_name: &str) -> Result<Orientation, String> {
if field == Self::MISSING_VALUE {
return Ok(Orientation::Forward);
}
if field.len() != 1 {
return Err(format!("Invalid {}: {}", field_name, String::from_utf8_lossy(field)));
}
match field[0] {
b'+' => Ok(Orientation::Forward),
b'-' => Ok(Orientation::Reverse),
_ => Err(format!("Invalid {}: {}", field_name, String::from_utf8_lossy(field))),
}
}
fn parse_path(field: &[u8]) -> Result<Vec<usize>, String> {
let mut result = Vec::new();
if field == Self::MISSING_VALUE {
return Ok(result);
}
let mut start = 0;
while start < field.len() {
let orientation = match field[start] {
b'>' => Orientation::Forward,
b'<' => Orientation::Reverse,
_ => return Err(format!("Invalid segment orientation: {}", String::from_utf8_lossy(field))),
};
start += 1;
let end = field[start..].iter().position(|&c| c == b'>' || c == b'<').map_or(field.len(), |x| start + x);
let node = str::from_utf8(&field[start..end]).map_err(|err| {
format!("Invalid segment name: {}", err)
})?.parse().map_err(|_| {
String::from("Only numerical segment names are supported")
})?;
result.push(support::encode_node(node, orientation));
start = end;
}
Ok(result)
}
fn parse_pair(value: Vec<u8>, is_next: bool, output: &mut Option<PairedRead>) -> Result<(), String> {
if output.is_some() {
return Err(String::from("Multiple pair fields"));
}
let name = String::from_utf8(value).map_err(|err| {
format!("Invalid pair name: {}", err)
})?;
*output = Some(PairedRead {
name,
is_next,
is_proper: false,
});
Ok(())
}
pub fn from_gaf(line: &[u8]) -> Result<Self, String> {
let line = if line.last() == Some(&b'\n') {
&line[..line.len() - 1]
} else {
line
};
let fields = line.split(|&c| c == b'\t').collect::<Vec<_>>();
if fields.len() < Self::MANDATORY_FIELDS {
let line = String::from_utf8_lossy(line);
let message = format!("GAF line with fewer than {} fields: {}", Self::MANDATORY_FIELDS, line);
return Err(message);
}
let name = Self::parse_string(fields[0], "query sequence name")?;
let seq_len = Self::parse_usize(fields[1], "query sequence length")?;
let mut seq_interval = Self::parse_interval(fields[2], fields[3])?;
let orientation = Self::parse_orientation(fields[4], "target orientation")?;
let mut path = Self::parse_path(fields[5]).map_err(|err| {
format!("Invalid target path: {}", err)
})?;
if orientation == Orientation::Reverse {
support::reverse_path_in_place(&mut path);
}
let path = TargetPath::Path(path);
let path_len = Self::parse_usize(fields[6], "target path length")?;
let path_interval = Self::parse_interval(fields[7], fields[8]);
let mut matches = Self::parse_usize(fields[9], "matches")?;
let alignment_len = Self::parse_usize(fields[10], "alignment length")?;
let mut edits = alignment_len.saturating_sub(matches);
let mapq = Self::parse_usize(fields[11], "mapping quality")?;
let mapq = if mapq == Self::MISSING_MAPQ { None } else { Some(mapq) };
let mut score = None;
let mut base_quality = Vec::new();
let mut difference = Vec::new();
let mut pair = None;
let mut properly_paired = None;
let mut optional = Vec::new();
for field in fields[Self::MANDATORY_FIELDS..].iter() {
let parsed = TypedField::parse(field)?;
match parsed {
TypedField::Int([b'A', b'S'], value) => {
if score.is_some() {
return Err(String::from("Multiple alignment score fields"));
}
score = Some(value);
},
TypedField::String([b'b', b'q'], value) => {
if !base_quality.is_empty() {
return Err(String::from("Multiple base quality fields"));
}
base_quality = value;
},
TypedField::String([b'c', b's'], value) => {
if !difference.is_empty() {
return Err(String::from("Multiple difference fields"));
}
difference = Difference::parse_normalized(&value)?;
},
TypedField::Bool([b'p', b'd'], value) => {
if properly_paired.is_some() {
return Err(String::from("Multiple properly paired fields"));
}
properly_paired = Some(value);
},
TypedField::String([b'f', b'n'], value) => {
Self::parse_pair(value, true, &mut pair)?;
},
TypedField::String([b'f', b'p'], value) => {
Self::parse_pair(value, false, &mut pair)?;
},
_ => { optional.push(parsed); },
}
}
if let (Some(pair), Some(properly_paired)) = (pair.as_mut(), properly_paired) {
pair.is_proper = properly_paired;
}
let mut path_interval = match path_interval {
Ok(interval) => interval,
Err(_) => {
if !difference.is_empty() {
let target_start = Self::parse_usize(fields[7], "target interval start")?;
target_start..target_start
} else {
return Err(String::from("Target interval end cannot be parsed or inferred from the difference string"));
}
},
};
if !difference.is_empty() {
let (query_len, target_len, num_matches, num_edits) = Difference::stats(&difference);
seq_interval.end = seq_interval.start + query_len;
path_interval.end = path_interval.start + target_len;
matches = num_matches;
edits = num_edits;
}
if orientation == Orientation::Reverse {
let start = path_len.saturating_sub(path_interval.end);
let end = path_len.saturating_sub(path_interval.start);
path_interval = start..end;
}
Ok(Alignment {
name, seq_len, seq_interval,
path, path_len, path_interval,
matches, edits, mapq, score,
base_quality, difference, pair,
optional,
})
}
pub fn to_gaf(&self, target_sequence: &[u8]) -> Vec<u8> {
let mut result = Vec::new();
result.extend_from_slice(self.name.as_bytes());
result.push(b'\t');
utils::append_usize(&mut result, self.seq_len);
result.push(b'\t');
utils::append_usize(&mut result, self.seq_interval.start);
result.push(b'\t');
utils::append_usize(&mut result, self.seq_interval.end);
result.push(b'\t');
result.push(b'+');
result.push(b'\t');
match &self.path {
TargetPath::Path(path) => {
if path.is_empty() {
result.extend_from_slice(&Self::MISSING_VALUE);
} else {
formats::append_walk(&mut result, path);
}
},
TargetPath::StartPosition(_) => {
result.extend_from_slice(&Self::MISSING_VALUE);
},
}
result.push(b'\t');
utils::append_usize(&mut result, self.path_len);
result.push(b'\t');
utils::append_usize(&mut result, self.path_interval.start);
result.push(b'\t');
utils::append_usize(&mut result, self.path_interval.end);
result.push(b'\t');
utils::append_usize(&mut result, self.matches);
result.push(b'\t');
utils::append_usize(&mut result, self.matches + self.edits);
result.push(b'\t');
let mapq = self.mapq.unwrap_or(Self::MISSING_MAPQ);
utils::append_usize(&mut result, mapq);
if let Some(score) = self.score {
let field = TypedField::Int([b'A', b'S'], score);
field.append_to(&mut result, true);
}
if !self.base_quality.is_empty() {
TypedField::append_string(&mut result, [b'b', b'q'], &self.base_quality, true);
}
if !self.difference.is_empty() {
let target_sequence = &target_sequence[self.path_interval.clone()];
let field = TypedField::String([b'c', b's'], Difference::to_bytes(&self.difference, target_sequence));
field.append_to(&mut result, true);
}
if let Some(pair) = &self.pair {
if pair.is_next {
TypedField::append_string(&mut result, [b'f', b'n'], pair.name.as_bytes(), true);
} else {
TypedField::append_string(&mut result, [b'f', b'p'], pair.name.as_bytes(), true);
}
let field = TypedField::Bool([b'p', b'd'], pair.is_proper);
field.append_to(&mut result, true);
}
for field in self.optional.iter() {
field.append_to(&mut result, true);
}
result
}
pub fn validate(&self) -> Result<(), String> {
if self.seq_interval.start > self.seq_interval.end || self.seq_interval.end > self.seq_len {
return Err(format!("Query sequence interval {}..{} for a sequence of length {}", self.seq_interval.start, self.seq_interval.end, self.seq_len));
}
if self.path_interval.start > self.path_interval.end || self.path_interval.end > self.path_len {
return Err(format!("Target path interval {}..{} for a path of length {}", self.path_interval.start, self.path_interval.end, self.path_len));
}
if let Some(path) = self.target_path() {
if path.is_empty() && self.path_len > 0 {
return Err(String::from("Empty target path with a non-empty path length"));
}
}
if !self.difference.is_empty() {
let (query_len, target_len, num_matches, num_edits) = Difference::stats(&self.difference);
if query_len != self.seq_interval.len() {
return Err(format!("Query interval length {} according to the difference string, but {} in the alignment", query_len, self.seq_interval.len()));
}
if target_len != self.path_interval.len() {
return Err(format!("Target path interval length {} according to the difference string, but {} in the alignment", target_len, self.path_interval.len()));
}
if num_matches != self.matches {
return Err(format!("Number of matches {} according to the difference string, but {} in the alignment", num_matches, self.matches));
}
if num_edits != self.edits {
return Err(format!("Number of edits {} according to the difference string, but {} in the alignment", num_edits, self.edits));
}
}
Ok(())
}
}
impl Alignment {
fn normalize_coordinates(interval: Range<usize>, len: usize) -> (usize, usize, usize) {
let start = if interval.start <= interval.end { interval.start } else { interval.end };
let end = if interval.end <= len { interval.end } else { len };
let left_flank = start;
let length = end - start;
let right_flank = len - end;
(left_flank, length, right_flank)
}
fn encode_signed(value: isize, encoder: &mut ByteCode) {
let value = if value < 0 { (-2 * value - 1) as usize } else { 2 * value as usize };
encoder.write(value);
}
fn encode_numbers_into(&self, encoder: &mut ByteCode, known_length: bool) {
let full_alignment = self.is_full();
let exact_alignment = self.is_exact();
let known_alignment = self.has_difference_string();
let (left_flank, length, right_flank) = Self::normalize_coordinates(self.seq_interval.clone(), self.seq_len);
if !(known_length || known_alignment) {
encoder.write(length);
}
if !full_alignment {
encoder.write(left_flank);
encoder.write(right_flank);
}
let (left_flank, length, _) = Self::normalize_coordinates(self.path_interval.clone(), self.path_len);
if !(exact_alignment || known_alignment) {
encoder.write(length);
}
encoder.write(left_flank);
if !(exact_alignment || known_alignment) {
encoder.write(self.matches);
encoder.write(self.edits);
}
if let Some(mapq) = self.mapq {
encoder.write(mapq);
}
if let Some(score) = self.score {
Self::encode_signed(score, encoder);
};
}
fn encode_difference_into(&self, encoder: &mut RLE) {
for diff in self.difference.iter() {
match diff {
Difference::Match(len) => encoder.write(Run::new(0, *len)),
Difference::Mismatch(base) => encoder.write(Run::new(1, utils::encode_base(*base))),
Difference::Insertion(seq) => {
let len = utils::encoded_length(seq.len());
encoder.write(Run::new(2, len));
let encoded = utils::encode_sequence(seq);
for byte in encoded {
encoder.write_byte(byte);
}
},
Difference::Deletion(len) => encoder.write(Run::new(3, *len)),
Difference::End => {},
}
}
encoder.write(Run::new(4, 1));
}
fn decode_signed(iter: &mut ByteCodeIter) -> Option<isize> {
let value = iter.next()?;
if value % 2 == 0 {
Some((value / 2) as isize)
} else {
Some(-(value as isize + 1) / 2)
}
}
fn decode_difference_from(encoded: &[u8], decoder: &mut RLEIter) -> Result<Vec<Difference>, String> {
let mut result: Vec<Difference> = Vec::new();
while let Some(run) = decoder.next() {
match run.value {
0 => result.push(Difference::Match(run.len)),
1 => result.push(Difference::Mismatch(utils::decode_base(run.len))),
2 => {
let offset = decoder.offset();
for _ in 0..run.len {
let _ = decoder.byte().ok_or(String::from("Missing insertion base"))?;
}
let encoded = &encoded[offset..offset + run.len];
let seq = utils::decode_sequence(encoded);
result.push(Difference::Insertion(seq));
},
3 => result.push(Difference::Deletion(run.len)),
4 => {
return Ok(result);
},
_ => return Err(format!("Invalid difference string operation: {}", run.value)),
}
}
Err(String::from("Encoded difference string ended without an End value"))
}
}
impl Alignment {
pub fn is_unaligned(&self) -> bool {
self.seq_interval.is_empty() || self.path_interval.is_empty()
}
pub fn is_full(&self) -> bool {
self.seq_len > 0 &&
self.seq_interval.start == 0 &&
self.seq_interval.end == self.seq_len
}
pub fn is_exact(&self) -> bool {
self.seq_len > 0 &&
!self.seq_interval.is_empty() &&
self.edits == 0
}
pub fn has_difference_string(&self) -> bool {
!self.difference.is_empty()
}
pub fn min_handle(&self) -> Option<usize> {
match self.path {
TargetPath::Path(ref path) => path.iter().copied().min(),
TargetPath::StartPosition(_) => None,
}
}
pub fn max_handle(&self) -> Option<usize> {
match self.path {
TargetPath::Path(ref path) => path.iter().copied().max(),
TargetPath::StartPosition(_) => None,
}
}
pub fn has_target_path(&self) -> bool {
matches!(self.path, TargetPath::Path(_))
}
pub fn has_non_empty_target_path(&self) -> bool {
match &self.path {
TargetPath::Path(path) => !path.is_empty(),
TargetPath::StartPosition(_) => false,
}
}
pub fn target_path(&self) -> Option<&[usize]> {
match &self.path {
TargetPath::Path(path) => Some(path),
TargetPath::StartPosition(_) => None,
}
}
pub fn extract_target_path(&mut self, index: &GBWT) {
let mut pos = match self.path {
TargetPath::Path(_) => return,
TargetPath::StartPosition(pos) => Some(pos),
};
let mut path = Vec::new();
while let Some(p) = pos {
path.push(p.node);
pos = index.forward(p);
}
self.path = TargetPath::Path(path);
}
pub fn set_target_path(&mut self, path: Vec<usize>) {
self.path = TargetPath::Path(path);
}
pub fn set_target_path_len(&mut self, len: usize) {
self.path_len = len;
}
pub fn iter<'a>(&'a self, sequence_len: Arc<dyn Fn(usize) -> Option<usize> + 'a>) -> Option<AlignmentIter<'a>> {
if self.difference.is_empty() && (!self.seq_interval.is_empty() || !self.path_interval.is_empty()) {
return None;
}
if !self.has_target_path() {
return None;
}
let target_path = self.target_path().unwrap();
if target_path.is_empty() && (self.path_interval.start != 0 || self.path_interval.end != 0) {
return None;
}
let mut iter = AlignmentIter {
parent: self,
sequence_len,
seq_offset: self.seq_interval.start,
path_offset: self.path_interval.start,
path_vec_offset: 0,
path_node_offset: self.path_interval.start,
diff_vec_offset: 0,
diff_op_offset: 0,
};
if iter.path_node_offset == 0 {
return Some(iter);
}
let mut handle = *target_path.get(iter.path_vec_offset)?;
let mut node_len = (*iter.sequence_len)(handle)?;
while iter.path_node_offset >= node_len {
iter.path_node_offset -= node_len;
iter.path_vec_offset += 1;
handle = *target_path.get(iter.path_vec_offset)?;
node_len = (*iter.sequence_len)(handle)?;
}
Some(iter)
}
fn empty_fragment(&self, fragment_index: usize) -> Alignment {
let mut aln = Alignment {
name: self.name.clone(),
seq_len: self.seq_len,
seq_interval: 0..0,
path: TargetPath::Path(Vec::new()),
path_len: 0,
path_interval: 0..0,
matches: self.matches,
edits: self.edits,
mapq: self.mapq,
score: self.score,
base_quality: self.base_quality.clone(),
difference: Vec::new(),
pair: self.pair.clone(),
optional: self.optional.clone(),
};
aln.optional.push(TypedField::Int([b'f', b'i'], fragment_index as isize));
aln
}
fn extend(&mut self, mapping: Mapping) -> Result<(), String> {
if self.seq_interval.is_empty() && self.path_interval.is_empty() {
if !self.difference.is_empty() {
return Err(String::from("Cannot extend an unaligned fragment with a non-empty difference string"));
}
self.difference.push(mapping.edit().clone());
} else {
if self.difference.is_empty() {
return Err(String::from("Cannot extend an aligned fragment without a difference string"));
}
if !self.difference.last_mut().unwrap().try_merge(mapping.edit()) {
self.difference.push(mapping.edit().clone());
}
}
if mapping.seq_interval().end > self.seq_len {
return Err(String::from("Cannot extend a fragment beyond the query sequence length"));
}
if self.seq_interval.is_empty() {
self.seq_interval = mapping.seq_interval().clone();
} else {
if mapping.seq_interval().start != self.seq_interval.end {
return Err(String::from("Cannot append a non-contiguous query interval"));
}
self.seq_interval.end = mapping.seq_interval().end;
}
let target_path = self.target_path().ok_or(
"Cannot extend a fragment without an explicit target path"
)?;
let last_node = target_path.last().copied();
let path_left = self.path_len.saturating_sub(self.path_interval.end);
let reverse_offset = mapping.node_len().saturating_sub(mapping.node_interval().start);
let first_mapping = last_node.is_none();
let continues_in_same_node = Some(mapping.handle()) == last_node && reverse_offset == path_left;
let starts_a_new_node = path_left == 0 && mapping.is_at_start();
if first_mapping {
if let TargetPath::Path(path) = &mut self.path {
path.push(mapping.handle());
} else {
unreachable!();
}
self.path_len += mapping.node_len();
self.path_interval = mapping.node_interval().clone();
} else if starts_a_new_node {
if let TargetPath::Path(path) = &mut self.path {
path.push(mapping.handle());
} else {
unreachable!();
}
self.path_len += mapping.node_len();
self.path_interval.end += mapping.target_len();
} else if continues_in_same_node {
self.path_interval.end += mapping.target_len();
} else {
return Err(String::from("Cannot append a non-contiguous target interval"));
}
Ok(())
}
pub fn clip<'a>(&self, subgraph: &Subgraph, sequence_len: Arc<impl Fn(usize) -> Option<usize> + 'a>) -> Result<Vec<Alignment>, String> {
let mut result = Vec::new();
if self.is_unaligned() || !self.has_non_empty_target_path() || self.difference.is_empty() {
return Ok(result);
}
let mut aln: Option<Alignment> = None; let iter = self.iter(sequence_len);
if iter.is_none() {
eprintln!("Warning: Cannot build an alignment iterator for {}", self.name);
return Ok(result);
}
let iter = iter.unwrap();
for mapping in iter {
if !subgraph.has_handle(mapping.handle()) {
if let Some(prev) = aln {
result.push(prev);
aln = None;
}
continue;
}
if let Some(aln) = &mut aln {
aln.extend(mapping)?;
} else {
let mut curr = self.empty_fragment(result.len() + 1);
curr.extend(mapping)?;
aln = Some(curr);
}
}
if let Some(aln) = aln {
result.push(aln);
}
if result.len() == 1 && result[0].seq_interval == self.seq_interval {
result[0].optional.retain(|field| {
!matches!(field, TypedField::Int([b'f', b'i'], _))
});
}
Ok(result)
}
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum TargetPath {
Path(Vec<usize>),
StartPosition(Pos),
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PairedRead {
pub name: String,
pub is_next: bool,
pub is_proper: bool,
}
#[derive(Clone)]
pub struct AlignmentIter<'a> {
parent: &'a Alignment,
sequence_len: Arc<dyn Fn(usize) -> Option<usize> + 'a>,
seq_offset: usize,
path_offset: usize,
path_vec_offset: usize,
path_node_offset: usize,
diff_vec_offset: usize,
diff_op_offset: usize,
}
impl<'a> Iterator for AlignmentIter<'a> {
type Item = Mapping;
fn next(&mut self) -> Option<Self::Item> {
if self.diff_vec_offset >= self.parent.difference.len() {
return None;
}
let full_edit = self.parent.difference.get(self.diff_vec_offset)?;
let edit_left = full_edit.target_len() - self.diff_op_offset;
let target_path = self.parent.target_path()?;
let handle = target_path.get(self.path_vec_offset);
if handle.is_none() {
match full_edit {
Difference::Insertion(_) => {
let seq_offset = self.seq_offset;
self.seq_offset += full_edit.query_len();
self.diff_vec_offset += 1;
self.diff_op_offset = 0;
return Some(Mapping::unaligned(seq_offset, full_edit.clone()));
},
_ => {
return None;
},
}
}
let mut handle = *handle.unwrap();
let mut node_len = (*self.sequence_len)(handle)?;
if self.path_node_offset >= node_len && edit_left > 0 {
self.path_vec_offset += 1;
self.path_node_offset = 0;
handle = *target_path.get(self.path_vec_offset)?;
node_len = (*self.sequence_len)(handle)?;
}
let node_left = node_len - self.path_node_offset;
let seq_offset = self.seq_offset;
let node_offset = self.path_node_offset;
let edit_len = cmp::min(edit_left, node_left);
let edit = match full_edit {
Difference::Match(_) => Difference::Match(edit_len),
Difference::Mismatch(base) => Difference::Mismatch(*base),
Difference::Insertion(seq) => {
Difference::Insertion(seq.clone())
},
Difference::Deletion(_) => Difference::Deletion(edit_len),
Difference::End => {
self.diff_vec_offset = self.parent.difference.len();
return None;
},
};
self.seq_offset += edit.query_len();
self.path_offset += edit.target_len();
self.path_node_offset += edit.target_len();
self.diff_op_offset += edit.target_len();
if self.diff_op_offset >= full_edit.target_len() {
self.diff_vec_offset += 1;
self.diff_op_offset = 0;
}
Some(Mapping::new(seq_offset, handle, node_len, node_offset, edit))
}
}
#[derive(Debug, Clone)]
pub struct Flags {
bits: Vec<u8>,
}
impl Flags {
pub const NUM_FLAGS: usize = 6;
pub const FLAG_PAIR_IS_NEXT: usize = 0;
pub const FLAG_PAIR_IS_PROPER: usize = 1;
pub const FLAG_HAS_MAPQ: usize = 2;
pub const FLAG_HAS_SCORE: usize = 3;
pub const FLAG_FULL_ALIGNMENT: usize = 4;
pub const FLAG_EXACT_ALIGNMENT: usize = 5;
pub fn new(num_alignments: usize) -> Self {
let bits = vec![0; (num_alignments * Self::NUM_FLAGS).div_ceil(8)];
Self { bits }
}
pub fn set(&mut self, index: usize, flag: usize, value: bool) {
let bit = index * Self::NUM_FLAGS + flag;
if value {
self.bits[bit / 8] |= 1 << (bit % 8);
} else {
self.bits[bit / 8] &= !(1 << (bit % 8));
}
}
pub fn get(&self, index: usize, flag: usize) -> bool {
let bit = index * Self::NUM_FLAGS + flag;
(self.bits[bit / 8] >> (bit % 8)) & 0x01 != 0
}
pub fn bytes(&self) -> usize {
self.bits.len()
}
}
impl From<Vec<u8>> for Flags {
fn from(bits: Vec<u8>) -> Self {
Self { bits }
}
}
impl AsRef<[u8]> for Flags {
fn as_ref(&self) -> &[u8] {
&self.bits
}
}
#[derive(Debug, Clone)]
pub struct AlignmentBlock {
pub min_handle: Option<usize>,
pub max_handle: Option<usize>,
pub alignments: usize,
pub read_length: Option<usize>,
pub gbwt_starts: Vec<u8>,
pub names: Vec<u8>,
pub quality_strings: Vec<u8>,
pub difference_strings: Vec<u8>,
pub flags: Flags,
pub numbers: Vec<u8>,
pub optional: Vec<u8>,
}
impl AlignmentBlock {
pub const COMPRESSION_LEVEL: i32 = 7;
fn expected_read_length(alignments: &[Alignment]) -> Option<usize> {
let mut read_length = None;
for aln in alignments.iter() {
if let Some(len) = read_length {
if aln.seq_len != len {
read_length = None;
break;
}
} else {
if aln.seq_len == 0 {
break;
}
read_length = Some(aln.seq_len);
}
}
read_length
}
fn compress_gbwt_starts(alignments: &[Alignment], index: &GBWT, first_id: usize, min_handle: Option<usize>) -> Result<Vec<u8>, String> {
let base_node = min_handle.unwrap_or(0);
let mut encoder = ByteCode::new();
for i in 0..alignments.len() {
let gbwt_sequence_id = if index.is_bidirectional() {
support::encode_path(first_id + i, Orientation::Forward)
} else { first_id + i };
if let Some(start) = index.start(gbwt_sequence_id) {
encoder.write(start.node - base_node);
encoder.write(start.offset);
} else if !encoder.is_empty() {
return Err(format!("Line {}: Unaligned read in a block with aligned reads", first_id + i));
}
}
Ok(Vec::from(encoder))
}
fn zstd_compress(data: &[u8]) -> Result<Vec<u8>, String> {
let mut encoder = ZstdEncoder::new(Vec::new(), Self::COMPRESSION_LEVEL).unwrap();
encoder.write_all(data).map_err(|err| format!("Zstd compression error: {}", err))?;
encoder.finish().map_err(|err| format!("Zstd compression error: {}", err))
}
fn compress_names_pairs(alignments: &[Alignment]) -> Result<Vec<u8>, String> {
let mut names: Vec<u8> = Vec::new();
for aln in alignments.iter() {
names.extend_from_slice(aln.name.as_bytes());
names.push(0);
if let Some(pair) = &aln.pair {
names.extend_from_slice(pair.name.as_bytes());
}
names.push(0);
}
Self::zstd_compress(&names)
}
fn compress_quality_strings(alignments: &[Alignment]) -> Result<Vec<u8>, String> {
let mut quality_strings: Vec<u8> = Vec::new();
for aln in alignments.iter() {
if !aln.base_quality.is_empty() {
quality_strings.extend_from_slice(&aln.base_quality);
}
quality_strings.push(0);
}
if quality_strings.len() <= alignments.len() {
return Ok(Vec::new());
}
Self::zstd_compress(&quality_strings)
}
fn compress_optional_fields(alignments: &[Alignment]) -> Result<Vec<u8>, String> {
let mut optional: Vec<u8> = Vec::new();
for aln in alignments.iter() {
for (i, field) in aln.optional.iter().enumerate() {
field.append_to(&mut optional, i > 0);
}
optional.push(0); }
if optional.len() <= alignments.len() {
return Ok(Vec::new());
}
Self::zstd_compress(&optional)
}
pub fn new(alignments: &[Alignment], index: &GBWT, first_id: usize) -> Result<Self, String> {
let min_handle = alignments.iter().map(|aln| aln.min_handle()).min().flatten();
let max_handle = alignments.iter().map(|aln| aln.max_handle()).max().flatten();
let read_length = Self::expected_read_length(alignments);
let gbwt_starts = Self::compress_gbwt_starts(alignments, index, first_id, min_handle)?;
let names = Self::compress_names_pairs(alignments)?;
let quality_strings = Self::compress_quality_strings(alignments)?;
let optional = Self::compress_optional_fields(alignments)?;
let mut difference_strings = RLE::with_sigma(Difference::NUM_TYPES);
let mut flags = Flags::new(alignments.len());
let mut numbers = ByteCode::new();
for (i, aln) in alignments.iter().enumerate() {
let full_alignment = aln.is_full();
let exact_alignment = aln.is_exact();
if !exact_alignment {
aln.encode_difference_into(&mut difference_strings);
}
if let Some(pair) = &aln.pair {
if pair.is_next {
flags.set(i, Flags::FLAG_PAIR_IS_NEXT, true);
}
if pair.is_proper {
flags.set(i, Flags::FLAG_PAIR_IS_PROPER, true);
}
}
if aln.mapq.is_some() {
flags.set(i, Flags::FLAG_HAS_MAPQ, true);
}
if aln.score.is_some() {
flags.set(i, Flags::FLAG_HAS_SCORE, true);
}
flags.set(i, Flags::FLAG_FULL_ALIGNMENT, full_alignment);
flags.set(i, Flags::FLAG_EXACT_ALIGNMENT, exact_alignment);
aln.encode_numbers_into(&mut numbers, read_length.is_some());
}
Ok(Self {
min_handle,
max_handle,
alignments: alignments.len(),
read_length,
gbwt_starts,
names,
quality_strings,
difference_strings: Vec::from(difference_strings),
flags,
numbers: Vec::from(numbers),
optional,
})
}
pub fn len(&self) -> usize {
self.alignments
}
pub fn is_empty(&self) -> bool {
self.alignments == 0
}
fn decompress_gbwt_starts(&self, result: &mut [Alignment]) -> Result<(), String> {
if self.min_handle.is_none() {
return Ok(());
}
let base_node = self.min_handle.unwrap();
let mut decoder: ByteCodeIter<'_> = ByteCodeIter::new(&self.gbwt_starts[..]);
for (i, aln) in result.iter_mut().enumerate() {
let start = decoder.next().ok_or(
format!("Missing GBWT start for alignment {}", i)
)?;
let offset = decoder.next().ok_or(
format!("Missing GBWT offset for alignment {}", i)
)?;
aln.path = TargetPath::StartPosition(Pos::new(start + base_node, offset));
}
Ok(())
}
fn zstd_decompress(data: &[u8]) -> Result<Vec<u8>, String> {
let mut decoder = ZstdDecoder::new(data).map_err(|err| format!("Zstd decompression error: {}", err))?;
let mut buffer = Vec::new();
decoder.read_to_end(&mut buffer).map_err(|err| format!("Zstd decompression error: {}", err))?;
Ok(buffer)
}
fn decompress_names_pairs(&self, result: &mut [Alignment]) -> Result<(), String> {
let buffer = Self::zstd_decompress(&self.names[..])?;
let mut iter = buffer.split(|&c| c == 0);
for (i, aln) in result.iter_mut().enumerate() {
let name = iter.next().ok_or(format!("Missing name for alignment {}", i))?;
aln.name = String::from_utf8_lossy(name).to_string();
let pair_name = iter.next().ok_or(format!("Missing pair name for alignment {}", i))?;
if !pair_name.is_empty() {
aln.pair = Some(PairedRead {
name: String::from_utf8_lossy(pair_name).to_string(),
is_next: self.flags.get(i, Flags::FLAG_PAIR_IS_NEXT),
is_proper: self.flags.get(i, Flags::FLAG_PAIR_IS_PROPER),
});
}
}
Ok(())
}
fn decompress_quality_strings(&self, result: &mut [Alignment]) -> Result<(), String> {
if self.quality_strings.is_empty() {
return Ok(());
}
let buffer = Self::zstd_decompress(&self.quality_strings[..])?;
let mut iter = buffer.split(|&c| c == 0);
for (i, aln) in result.iter_mut().enumerate() {
let quality = iter.next().ok_or(format!("Missing quality string for alignment {}", i))?;
aln.base_quality = quality.to_vec();
}
Ok(())
}
fn decompress_difference_strings(&self, result: &mut [Alignment]) -> Result<(), String> {
let mut decoder = RLEIter::with_sigma(&self.difference_strings[..], Difference::NUM_TYPES);
for (i, aln) in result.iter_mut().enumerate() {
if !self.flags.get(i, Flags::FLAG_EXACT_ALIGNMENT) {
let res = Alignment::decode_difference_from(&self.difference_strings, &mut decoder);
aln.difference = res.map_err(|err| format!("Missing difference string for alignment {}: {}", i, err))?;
}
}
Ok(())
}
fn decompress_numbers(&self, result: &mut [Alignment]) -> Result<(), String> {
let mut decoder = ByteCodeIter::new(&self.numbers[..]);
for (i, aln) in result.iter_mut().enumerate() {
let full_alignment = self.flags.get(i, Flags::FLAG_FULL_ALIGNMENT);
let exact_alignment = self.flags.get(i, Flags::FLAG_EXACT_ALIGNMENT);
let known_alignment = aln.has_difference_string();
let (mut query_len, mut target_len, num_matches, num_edits) = Difference::stats(&aln.difference);
aln.matches = num_matches;
aln.edits = num_edits;
if let Some(len) = self.read_length {
query_len = len;
} else if !known_alignment {
query_len = decoder.next().ok_or(format!("Missing query sequence aligned length for alignment {}", i))?;
}
let (query_left, query_right) = if full_alignment {
(0, 0)
} else {
let query_left = decoder.next().ok_or(format!("Missing query sequence left flank for alignment {}", i))?;
let query_right = decoder.next().ok_or(format!("Missing query sequence right flank for alignment {}", i))?;
(query_left, query_right)
};
if self.read_length.is_some() {
query_len -= query_left + query_right;
}
aln.seq_interval = query_left..(query_left + query_len);
aln.seq_len = query_len + query_left + query_right;
if exact_alignment {
target_len = query_len;
} else if !known_alignment {
target_len = decoder.next().ok_or(format!("Missing target path alignedlength for alignment {}", i))?;
}
let target_left = decoder.next().ok_or(format!("Missing target path left flank for alignment {}", i))?;
let target_right = 0; aln.path_interval = target_left..(target_left + target_len);
aln.path_len = target_len + target_left + target_right;
if exact_alignment {
aln.matches = query_len;
aln.edits = 0;
} else if !known_alignment {
aln.matches = decoder.next().ok_or(format!("Missing number of matches for alignment {}", i))?;
aln.edits = decoder.next().ok_or(format!("Missing number of edits for alignment {}", i))?;
}
if self.flags.get(i, Flags::FLAG_HAS_MAPQ) {
aln.mapq = Some(decoder.next().ok_or(format!("Missing mapping quality for alignment {}", i))?);
}
if self.flags.get(i, Flags::FLAG_HAS_SCORE) {
aln.score = Some(Alignment::decode_signed(&mut decoder).ok_or(
format!("Missing alignment score for alignment {}", i)
)?);
}
}
Ok(())
}
fn decompress_optional_fields(&self, result: &mut [Alignment]) -> Result<(), String> {
if self.optional.is_empty() {
return Ok(());
}
let buffer = Self::zstd_decompress(&self.optional[..])?;
let mut iter = buffer.split(|&c| c == 0);
for (i, aln) in result.iter_mut().enumerate() {
let optional = iter.next().ok_or(format!("Missing optional fields for alignment {}", i))?;
let fields = optional.split(|&c| c == b'\t');
for field in fields {
let parsed = TypedField::parse(field)?;
aln.optional.push(parsed);
}
}
Ok(())
}
pub fn decode(&self) -> Result<Vec<Alignment>, String> {
let mut result = vec![Alignment::default(); self.len()];
self.decompress_gbwt_starts(&mut result)?;
self.decompress_names_pairs(&mut result)?;
self.decompress_quality_strings(&mut result)?;
self.decompress_difference_strings(&mut result)?;
self.decompress_numbers(&mut result)?;
self.decompress_optional_fields(&mut result)?;
for (i, aln) in result.iter_mut().enumerate() {
if self.flags.get(i, Flags::FLAG_EXACT_ALIGNMENT) {
aln.difference = vec![Difference::Match(aln.seq_interval.len())];
}
}
Ok(result)
}
}